From 4edd947ac822e1bc222709884cd080ca44ac26f6 Mon Sep 17 00:00:00 2001 From: xperzy Date: Mon, 10 Jan 2022 22:15:25 +0800 Subject: [PATCH 1/2] add training for classification models --- image_classification/BoTNet/augment.py | 4 +- image_classification/BoTNet/config.py | 14 - image_classification/BoTNet/datasets.py | 18 +- image_classification/BoTNet/main_multi_gpu.py | 7 +- .../BoTNet/main_single_gpu.py | 13 +- image_classification/BoTNet/run_train.sh | 2 +- .../BoTNet/run_train_multi.sh | 6 + image_classification/CSwin/augment.py | 4 +- image_classification/CSwin/config.py | 18 +- image_classification/CSwin/cswin.py | 24 +- image_classification/CSwin/datasets.py | 8 +- image_classification/CSwin/main_multi_gpu.py | 33 +- image_classification/CSwin/main_single_gpu.py | 41 +- image_classification/CaiT/.config.py.swp | Bin 16384 -> 0 bytes image_classification/CaiT/augment.py | 285 + image_classification/CaiT/cait.py | 79 +- image_classification/CaiT/config.py | 53 +- image_classification/CaiT/datasets.py | 44 +- image_classification/CaiT/losses.py | 144 + image_classification/CaiT/main_multi_gpu.py | 507 +- image_classification/CaiT/main_single_gpu.py | 287 +- image_classification/CaiT/mixup.py | 225 + image_classification/CaiT/model_ema.py | 61 + image_classification/CaiT/random_erasing.py | 118 + image_classification/ConvMLP/augment.py | 4 +- image_classification/ConvMLP/config.py | 32 +- .../ConvMLP/configs/convmlp_l.yaml | 2 +- .../ConvMLP/configs/convmlp_m.yaml | 2 +- .../ConvMLP/configs/convmlp_s.yaml | 2 +- image_classification/ConvMLP/convmlp.py | 2 +- image_classification/ConvMLP/datasets.py | 9 +- image_classification/ConvMLP/droppath.py | 54 +- image_classification/ConvMLP/losses.py | 2 + .../ConvMLP/main_multi_gpu.py | 39 +- .../ConvMLP/main_single_gpu.py | 38 +- .../ConvMLP/random_erasing.py | 12 +- image_classification/ConvMLP/transforms.py | 1 + image_classification/ConvMLP/utils.py | 240 +- image_classification/ConvMixer/augment.py | 4 +- image_classification/ConvMixer/config.py | 23 +- image_classification/ConvMixer/datasets.py | 9 +- image_classification/ConvMixer/droppath.py | 50 + image_classification/ConvMixer/losses.py | 2 + .../ConvMixer/main_multi_gpu.py | 17 +- .../ConvMixer/main_single_gpu.py | 12 +- .../ConvMixer/random_erasing.py | 12 +- image_classification/ConvMixer/transforms.py | 1 + image_classification/ConvMixer/utils.py | 48 - image_classification/CrossViT/augment.py | 285 + image_classification/CrossViT/config.py | 45 +- image_classification/CrossViT/crossvit.py | 231 +- .../CrossViT/crossvit_utils.py | 2 +- image_classification/CrossViT/datasets.py | 55 +- image_classification/CrossViT/losses.py | 144 + .../CrossViT/main_multi_gpu.py | 504 +- .../CrossViT/main_single_gpu.py | 290 +- image_classification/CrossViT/mixup.py | 225 + image_classification/CrossViT/model_ema.py | 61 + .../CrossViT/{ => port_weights}/demo.py | 0 .../CrossViT/random_erasing.py | 118 + .../CrossViT/run_eval_tiny_224.sh | 4 +- .../CrossViT/run_train_multi_tiny_224.sh | 4 +- image_classification/CrossViT/t2t.py | 104 +- image_classification/CrossViT/transforms.py | 14 + image_classification/CvT/augment.py | 5 +- image_classification/CvT/config.py | 31 +- image_classification/CvT/cvt.py | 21 +- image_classification/CvT/datasets.py | 14 +- image_classification/CvT/main_multi_gpu.py | 4 +- image_classification/CvT/run_train_multi.sh | 2 +- image_classification/CycleMLP/augment.py | 174 +- image_classification/CycleMLP/config.py | 31 +- image_classification/CycleMLP/datasets.py | 407 
+- image_classification/CycleMLP/droppath.py | 99 +- .../CycleMLP/main_multi_gpu.py | 58 +- .../CycleMLP/main_single_gpu.py | 55 +- image_classification/CycleMLP/transforms.py | 1 + image_classification/CycleMLP/utils.py | 240 +- image_classification/DeiT/augment.py | 4 +- image_classification/DeiT/config.py | 48 +- .../DeiT/configs/deit_base_patch16_224.yaml | 5 +- .../DeiT/configs/deit_small_patch16_224.yaml | 5 +- .../DeiT/configs/deit_tiny_patch16_224.yaml | 1 + image_classification/DeiT/datasets.py | 18 +- image_classification/DeiT/deit.py | 66 +- image_classification/DeiT/main_single_gpu.py | 32 +- image_classification/DeiT/transforms.py | 14 + image_classification/FF_Only/augment.py | 4 +- image_classification/FF_Only/config.py | 27 +- image_classification/FF_Only/datasets.py | 10 +- image_classification/FF_Only/droppath.py | 53 +- .../FF_Only/main_multi_gpu.py | 60 +- .../FF_Only/main_single_gpu.py | 57 +- image_classification/FF_Only/transforms.py | 1 + image_classification/FF_Only/utils.py | 27 - .../Focal_Transformer/augment.py | 4 +- .../Focal_Transformer/config.py | 17 +- .../Focal_Transformer/datasets.py | 8 +- .../Focal_Transformer/main_multi_gpu.py | 42 +- .../Focal_Transformer/main_single_gpu.py | 44 +- .../Focal_Transformer/run_train_multi.sh | 4 +- image_classification/HVT/augment.py | 4 +- image_classification/HVT/config.py | 37 +- image_classification/HVT/datasets.py | 10 +- image_classification/HVT/main_multi_gpu.py | 4 +- image_classification/HVT/main_single_gpu.py | 37 +- image_classification/HaloNet/augment.py | 285 + image_classification/HaloNet/config.py | 22 +- .../HaloNet/configs/halonet_26t_256.yaml | 2 +- .../HaloNet/configs/halonet_50ts_256.yaml | 2 +- image_classification/HaloNet/datasets.py | 51 +- image_classification/HaloNet/losses.py | 2 + .../HaloNet/random_erasing.py | 118 + .../HaloNet/run_train_multi.sh | 2 +- image_classification/MAE/augment.py | 4 +- image_classification/MAE/datasets.py | 47 +- image_classification/MAE/nohup.out | 9507 +++++++++++++++++ image_classification/MLP-Mixer/augment.py | 285 + image_classification/MLP-Mixer/config.py | 40 +- image_classification/MLP-Mixer/datasets.py | 53 +- image_classification/MLP-Mixer/losses.py | 123 + .../MLP-Mixer/main_multi_gpu.py | 471 +- .../MLP-Mixer/main_single_gpu.py | 260 +- image_classification/MLP-Mixer/mixup.py | 225 + image_classification/MLP-Mixer/mlp_mixer.py | 2 +- .../MLP-Mixer/random_erasing.py | 118 + image_classification/MLP-Mixer/transforms.py | 14 + image_classification/MobileViT/augment.py | 4 +- image_classification/MobileViT/datasets.py | 6 + .../MobileViT/main_multi_gpu.py | 4 +- image_classification/PVTv2/augment.py | 4 +- image_classification/PVTv2/config.py | 20 +- .../PVTv2/configs/pvtv2_b0.yaml | 2 +- .../PVTv2/configs/pvtv2_b1.yaml | 2 +- .../PVTv2/configs/pvtv2_b2.yaml | 2 +- .../PVTv2/configs/pvtv2_b2_linear.yaml | 2 +- .../PVTv2/configs/pvtv2_b3.yaml | 2 +- .../PVTv2/configs/pvtv2_b4.yaml | 2 +- .../PVTv2/configs/pvtv2_b5.yaml | 2 +- image_classification/PVTv2/datasets.py | 10 +- image_classification/PVTv2/pvtv2.py | 28 +- image_classification/PiT/augment.py | 4 +- image_classification/PiT/config.py | 28 +- image_classification/PiT/datasets.py | 13 +- image_classification/PiT/main_multi_gpu.py | 3 +- image_classification/PiT/main_single_gpu.py | 35 +- image_classification/PoolFormer/augment.py | 4 +- image_classification/PoolFormer/config.py | 22 +- image_classification/PoolFormer/datasets.py | 16 +- .../PoolFormer/main_multi_gpu.py | 7 +- 
.../PoolFormer/main_single_gpu.py | 15 +- .../PoolFormer/run_train_multi.sh | 2 +- image_classification/RepMLP/augment.py | 4 +- image_classification/RepMLP/config.py | 30 +- image_classification/RepMLP/datasets.py | 9 +- image_classification/RepMLP/droppath.py | 50 + image_classification/RepMLP/main_multi_gpu.py | 58 +- .../RepMLP/main_single_gpu.py | 57 +- image_classification/RepMLP/transforms.py | 1 + image_classification/ResMLP/augment.py | 285 + image_classification/ResMLP/config.py | 40 +- image_classification/ResMLP/datasets.py | 52 +- image_classification/ResMLP/droppath.py | 1 + image_classification/ResMLP/losses.py | 123 + image_classification/ResMLP/main_multi_gpu.py | 469 +- .../ResMLP/main_single_gpu.py | 260 +- image_classification/ResMLP/mixup.py | 225 + image_classification/ResMLP/random_erasing.py | 118 + image_classification/ResMLP/resmlp.py | 2 +- image_classification/ResMLP/transforms.py | 14 + .../Shuffle_Transformer/.config.py.swp | Bin 16384 -> 0 bytes .../Shuffle_Transformer/augment.py | 4 +- .../Shuffle_Transformer/config.py | 23 +- .../Shuffle_Transformer/datasets.py | 8 +- .../Shuffle_Transformer/main_multi_gpu.py | 47 +- .../Shuffle_Transformer/main_single_gpu.py | 42 +- .../Shuffle_Transformer/run_train_multi.sh | 2 +- .../shuffle_transformer.py | 5 +- .../SwinTransformer/augment.py | 4 +- .../SwinTransformer/config.py | 23 +- .../SwinTransformer/datasets.py | 7 + .../SwinTransformer/main_multi_gpu.py | 49 +- .../SwinTransformer/main_single_gpu.py | 46 +- .../SwinTransformer/run_train_multi.sh | 6 +- .../SwinTransformer/swin_transformer.py | 24 +- image_classification/T2T_ViT/augment.py | 4 +- image_classification/T2T_ViT/config.py | 45 +- image_classification/T2T_ViT/datasets.py | 11 +- .../T2T_ViT/main_single_gpu.py | 1 + .../T2T_ViT/run_train_multi.sh | 2 +- image_classification/T2T_ViT/t2t_vit.py | 28 +- image_classification/T2T_ViT/utils.py | 3 +- image_classification/ViP/augment.py | 4 +- image_classification/ViP/config.py | 16 +- image_classification/ViP/datasets.py | 13 +- image_classification/ViP/main_multi_gpu.py | 17 +- image_classification/ViP/main_single_gpu.py | 22 +- image_classification/ViT/config.py | 31 +- .../ViT/configs/vit_base_patch16_224.yaml | 2 +- .../ViT/configs/vit_base_patch16_384.yaml | 1 - image_classification/ViT/main_multi_gpu.py | 2 +- image_classification/ViT/main_single_gpu.py | 4 +- image_classification/XCiT/augment.py | 4 +- image_classification/XCiT/config.py | 26 +- image_classification/XCiT/datasets.py | 54 +- image_classification/XCiT/main_multi_gpu.py | 11 +- image_classification/XCiT/main_single_gpu.py | 5 +- image_classification/XCiT/run_train.sh | 4 +- image_classification/XCiT/run_train_multi.sh | 6 + image_classification/gMLP/augment.py | 285 + image_classification/gMLP/config.py | 40 +- image_classification/gMLP/datasets.py | 52 +- image_classification/gMLP/gmlp.py | 2 +- image_classification/gMLP/losses.py | 123 + image_classification/gMLP/main_multi_gpu.py | 473 +- image_classification/gMLP/main_single_gpu.py | 260 +- image_classification/gMLP/mixup.py | 225 + image_classification/gMLP/random_erasing.py | 118 + image_classification/gMLP/transforms.py | 14 + 219 files changed, 19140 insertions(+), 3211 deletions(-) create mode 100644 image_classification/BoTNet/run_train_multi.sh delete mode 100644 image_classification/CaiT/.config.py.swp create mode 100644 image_classification/CaiT/augment.py create mode 100644 image_classification/CaiT/losses.py create mode 100644 image_classification/CaiT/mixup.py create mode 
100644 image_classification/CaiT/model_ema.py create mode 100644 image_classification/CaiT/random_erasing.py create mode 100644 image_classification/ConvMixer/droppath.py create mode 100644 image_classification/CrossViT/augment.py create mode 100644 image_classification/CrossViT/losses.py create mode 100644 image_classification/CrossViT/mixup.py create mode 100644 image_classification/CrossViT/model_ema.py rename image_classification/CrossViT/{ => port_weights}/demo.py (100%) create mode 100644 image_classification/CrossViT/random_erasing.py create mode 100644 image_classification/CrossViT/transforms.py create mode 100644 image_classification/DeiT/transforms.py create mode 100644 image_classification/HaloNet/augment.py create mode 100644 image_classification/HaloNet/random_erasing.py create mode 100644 image_classification/MAE/nohup.out create mode 100644 image_classification/MLP-Mixer/augment.py create mode 100644 image_classification/MLP-Mixer/losses.py create mode 100644 image_classification/MLP-Mixer/mixup.py create mode 100644 image_classification/MLP-Mixer/random_erasing.py create mode 100644 image_classification/MLP-Mixer/transforms.py create mode 100644 image_classification/RepMLP/droppath.py create mode 100644 image_classification/ResMLP/augment.py create mode 100644 image_classification/ResMLP/losses.py create mode 100644 image_classification/ResMLP/mixup.py create mode 100644 image_classification/ResMLP/random_erasing.py create mode 100644 image_classification/ResMLP/transforms.py delete mode 100644 image_classification/Shuffle_Transformer/.config.py.swp create mode 100644 image_classification/XCiT/run_train_multi.sh create mode 100644 image_classification/gMLP/augment.py create mode 100644 image_classification/gMLP/losses.py create mode 100644 image_classification/gMLP/mixup.py create mode 100644 image_classification/gMLP/random_erasing.py create mode 100644 image_classification/gMLP/transforms.py diff --git a/image_classification/BoTNet/augment.py b/image_classification/BoTNet/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/BoTNet/augment.py +++ b/image_classification/BoTNet/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/BoTNet/config.py b/image_classification/BoTNet/config.py index ffd7112f..6ac2f51a 100644 --- a/image_classification/BoTNet/config.py +++ b/image_classification/BoTNet/config.py @@ -89,20 +89,6 @@ _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 
'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" diff --git a/image_classification/BoTNet/datasets.py b/image_classification/BoTNet/datasets.py index e0285653..cc377c90 100644 --- a/image_classification/BoTNet/datasets.py +++ b/image_classification/BoTNet/datasets.py @@ -20,12 +20,19 @@ import os import math from PIL import Image -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from random_erasing import RandomErasing + class ImageNet2012Dataset(Dataset): """Build ImageNet2012 dataset @@ -93,9 +100,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, @@ -147,6 +158,7 @@ def get_dataset(config, mode='train'): Returns: dataset: dataset object """ + assert mode in ['train', 'val'] if config.DATA.DATASET == "cifar10": if mode == 'train': diff --git a/image_classification/BoTNet/main_multi_gpu.py b/image_classification/BoTNet/main_multi_gpu.py index aeeac783..33a239a0 100644 --- a/image_classification/BoTNet/main_multi_gpu.py +++ b/image_classification/BoTNet/main_multi_gpu.py @@ -556,11 +556,8 @@ def main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - local_logger.info(f"----- Save model: {model_path}.pdparams") - local_logger.info(f"----- Save optim: {model_path}.pdopt") - if local_rank == 0: - master_logger.info(f"----- Save model: {model_path}.pdparams") - master_logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") def main(): diff --git a/image_classification/BoTNet/main_single_gpu.py b/image_classification/BoTNet/main_single_gpu.py index b276d052..b5ec964d 100644 --- a/image_classification/BoTNet/main_single_gpu.py +++ b/image_classification/BoTNet/main_single_gpu.py @@ -266,11 +266,14 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - if config.TRAIN.LINEAR_SCALED_LR: - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 1024.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 1024.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 1024.0 + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + 
linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR if config.TRAIN.ACCUM_ITER > 1: linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER diff --git a/image_classification/BoTNet/run_train.sh b/image_classification/BoTNet/run_train.sh index 6fd11c04..3fad1e39 100644 --- a/image_classification/BoTNet/run_train.sh +++ b/image_classification/BoTNet/run_train.sh @@ -1,6 +1,6 @@ CUDA_VISIBLE_DEVICES=0 \ python main_single_gpu.py \ --cfg='./configs/botnet50_224.yaml' \ +-cfg='./configs/botnet50.yaml' \ -dataset='imagenet2012' \ -batch_size=16 \ -data_path='/dataset/imagenet' \ diff --git a/image_classification/BoTNet/run_train_multi.sh b/image_classification/BoTNet/run_train_multi.sh new file mode 100644 index 00000000..058c70b0 --- /dev/null +++ b/image_classification/BoTNet/run_train_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/botnet50.yaml' \ +-dataset='imagenet2012' \ +-batch_size=16 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/CSwin/augment.py b/image_classification/CSwin/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/CSwin/augment.py +++ b/image_classification/CSwin/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/CSwin/config.py b/image_classification/CSwin/config.py index 69fa0d84..e959c57e 100644 --- a/image_classification/CSwin/config.py +++ b/image_classification/CSwin/config.py @@ -74,6 +74,7 @@ _C.TRAIN.ACCUM_ITER = 2 _C.TRAIN.MODEL_EMA = True _C.TRAIN.MODEL_EMA_DECAY = 0.99992 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -97,26 +98,14 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = True _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' # misc _C.SAVE = "./output" @@ -144,6 +133,7 @@ def _update_config_from_file(config, cfg_file): config.merge_from_file(cfg_file) config.freeze() + def update_config(config, 
args): """Update config by ArgumentParser Args: diff --git a/image_classification/CSwin/cswin.py b/image_classification/CSwin/cswin.py index 0dbcbf04..3ac93c1a 100644 --- a/image_classification/CSwin/cswin.py +++ b/image_classification/CSwin/cswin.py @@ -62,13 +62,13 @@ def __init__(self, patch_stride=4, in_channels=3, embed_dim=96): bias_attr=b_attr) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.)) return weight_attr, bias_attr def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.)) return weight_attr, bias_attr def forward(self, x): @@ -110,7 +110,7 @@ def __init__(self, in_features, hidden_features, dropout): def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.)) return weight_attr, bias_attr def forward(self, x): @@ -337,13 +337,13 @@ def __init__(self, dropout=dropout) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.)) return weight_attr, bias_attr def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.)) return weight_attr, bias_attr def chunk_qkv(self, x, chunks=1, axis=-1): @@ -393,8 +393,8 @@ def __init__(self, dim_in, dim_out): bias_attr=b_attr_1) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.)) return weight_attr, bias_attr def forward(self, x): @@ -543,13 +543,13 @@ def __init__(self, bias_attr=b_attr_2) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.)) return weight_attr, bias_attr def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.)) return weight_attr, bias_attr def forward_features(self, x): diff --git 
a/image_classification/CSwin/datasets.py b/image_classification/CSwin/datasets.py index ed6a8450..7e178b57 100644 --- a/image_classification/CSwin/datasets.py +++ b/image_classification/CSwin/datasets.py @@ -28,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -100,9 +102,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/CSwin/main_multi_gpu.py b/image_classification/CSwin/main_multi_gpu.py index ec6e7554..149c72c0 100644 --- a/image_classification/CSwin/main_multi_gpu.py +++ b/image_classification/CSwin/main_multi_gpu.py @@ -35,7 +35,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from model_ema import ModelEma from cswin import build_cswin as build_model @@ -369,18 +368,22 @@ def main_worker(*args): # STEP 5: Define optimizer and lr_scheduler # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / 256.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / 256.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / 256.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -454,9 +457,9 @@ def main_worker(*args): f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - 
assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) diff --git a/image_classification/CSwin/main_single_gpu.py b/image_classification/CSwin/main_single_gpu.py index 2bbfabcf..772a9ffb 100644 --- a/image_classification/CSwin/main_single_gpu.py +++ b/image_classification/CSwin/main_single_gpu.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Swin training/validation using single GPU """ +"""CSwin training/validation using single GPU """ import sys import os @@ -34,7 +34,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from model_ema import ModelEma from cswin import build_cswin as build_model @@ -129,7 +128,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -279,19 +278,23 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -357,9 +360,9 @@ def main(): logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert 
os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) diff --git a/image_classification/CaiT/.config.py.swp b/image_classification/CaiT/.config.py.swp deleted file mode 100644 index ed536a52914ea91eb91f9d21fc2b4f57f1cfe712..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 diff --git a/image_classification/CaiT/augment.py b/image_classification/CaiT/augment.py new file mode 100644 index 00000000..7a7f081c --- /dev/null +++ b/image_classification/CaiT/augment.py @@ -0,0 +1,285 @@ +# Copyright (c) 2021 PPViT Authors.
All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Augmentation""" +""" Rand Augmentation """ +# reference: RandAugment: Practical automated data augmentation with a reduced search space +# https://arxiv.org/abs/1909.13719 + +""" Auto Augmentation """ +# reference: AutoAugment: Learning Augmentation Policies from Data +# https://arxiv.org/abs/1805.09501 + +import random +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def auto_augment_policy_original(): + """25 types of augment policies in original paper""" + policy = [ + [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + policy = [[SubPolicy(*args) for args in subpolicy] for subpolicy in policy] + return policy + + +def rand_augment_policy_original(magnitude_idx=9): + """ + 14 types of augment policies in original paper + Args: + magnitude_idx: M + """ + policy = [ + ('Posterize', 1, magnitude_idx), ('Rotate', 1, magnitude_idx), + ('Solarize', 1, magnitude_idx), ('AutoContrast', 1, magnitude_idx), + ('Equalize', 1, magnitude_idx), ('Contrast', 1, magnitude_idx), + ('Color', 1, magnitude_idx), ('Invert', 1, magnitude_idx), + ('Sharpness', 1, magnitude_idx), ('Brightness', 1, magnitude_idx), + ('ShearX', 1, magnitude_idx), ('ShearY', 1, magnitude_idx), + ('TranslateX', 1, magnitude_idx), ('TranslateY', 1, magnitude_idx), + ] + policy = [SubPolicy(*args) for args in policy] + return policy + + +class AutoAugment(): + """Auto Augment + Randomly choose a tuple of augment ops from a list of policy + Then apply the tuple of augment ops to input image + + Examples: + policy = auto_augment_policy_original() + augment = AutoAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy): + self.policy = policy + + def __call__(self, image, policy_idx=None): + if policy_idx is None: + policy_idx = random.randint(0, len(self.policy) - 1) + + sub_policy = self.policy[policy_idx] 
+ for op in sub_policy: + image = op(image) + return image + + +class RandAugment(): + """Rand Augment + Randomly choose N augment ops from a list of K policies + Then apply the N ops to input image + + Examples: + policy = rand_augment_policy_original(magnitude_idx) + augment = RandAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy, num_layers=2): + """ + Args: + policy: list of SubPolicy + num_layers: int + """ + self.policy = policy + self.num_layers = num_layers + + def __call__(self, image): + selected_idx = np.random.choice(len(self.policy), self.num_layers) + + for policy_idx in selected_idx: + sub_policy = self.policy[policy_idx] + image = sub_policy(image) + return image + + +class SubPolicy: + """Subpolicy + Read augment name and magnitude, apply augment with probability + Args: + op_name: str, augment operation name + prob: float, if prob > random prob, apply augment + magnitude_idx: int, index of magnitude in preset magnitude ranges + """ + + def __init__(self, op_name, prob, magnitude_idx): + # ranges of operations' magnitude + ranges = { + 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'TranslateX': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'TranslateY': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'Rotate': np.linspace(0, 30, 10), # [-30, 30] (by random negative) + 'Color': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), # [0, 4] + 'Solarize': np.linspace(256, 0, 10), # [0, 256] + 'Contrast': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Sharpness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Brightness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'AutoContrast': [0] * 10, # no range + 'Equalize': [0] * 10, # no range + 'Invert': [0] * 10, # no range + } + + # augmentation operations + # Lambda is not pickleable for DDP + # image_ops = { + # 'ShearX': lambda image, magnitude: shear_x(image, magnitude), + # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), + # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), + # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), + # 'Rotate': lambda image, magnitude: rotate(image, magnitude), + # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), + # 'Invert': lambda image, magnitude: invert(image, magnitude), + # 'Equalize': lambda image, magnitude: equalize(image, magnitude), + # 'Solarize': lambda image, magnitude: solarize(image, magnitude), + # 'Posterize': lambda image, magnitude: posterize(image, magnitude), + # 'Contrast': lambda image, magnitude: contrast(image, magnitude), + # 'Color': lambda image, magnitude: color(image, magnitude), + # 'Brightness': lambda image, magnitude: brightness(image, magnitude), + # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), + # } + image_ops = { + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_relative, + 'TranslateY': translate_y_relative, + 'Rotate': rotate, + 'AutoContrast': auto_contrast, + 'Invert': invert, + 'Equalize': equalize, + 'Solarize': solarize, + 'Posterize': posterize, + 'Contrast': contrast, + 'Color': color, + 'Brightness': brightness, + 'Sharpness': sharpness, + } + + self.prob = prob + self.magnitude = ranges[op_name][magnitude_idx] + self.op = 
image_ops[op_name] + + def __call__(self, image): + if self.prob > random.random(): + image = self.op(image, self.magnitude) + return image + + +# PIL Image transforms +# https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.transform +def shear_x(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), fillcolor=fillcolor) + + +def shear_y(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), fillcolor=fillcolor) + + +def translate_x_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), fillcolor=fillcolor) + + +def translate_x_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, magnitude, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude), fillcolor=fillcolor) + + +def rotate(image, magnitude): + rot = image.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new('RGBA', rot.size, (128,) * 4), + rot).convert(image.mode) + + +def auto_contrast(image, magnitude=None): + return ImageOps.autocontrast(image) + + +def invert(image, magnitude=None): + return ImageOps.invert(image) + + +def equalize(image, magnitude=None): + return ImageOps.equalize(image) + + +def solarize(image, magnitude): + return ImageOps.solarize(image, magnitude) + + +def posterize(image, magnitude): + return ImageOps.posterize(image, magnitude) + + +def contrast(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Contrast(image).enhance(1 + magnitude) + + +def color(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Color(image).enhance(1 + magnitude) + + +def brightness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Brightness(image).enhance(1 + magnitude) + + +def sharpness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Sharpness(image).enhance(1 + magnitude) + diff --git a/image_classification/CaiT/cait.py b/image_classification/CaiT/cait.py index d8038106..a92f91ab 100644 --- a/image_classification/CaiT/cait.py +++ b/image_classification/CaiT/cait.py @@ -104,8 +104,8 @@ def __init__(self, in_features, hidden_features, dropout=0.): self.dropout = nn.Dropout(dropout) def _init_weights(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + weight_attr = 
paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -144,15 +144,24 @@ def __init__(self, self.dim_head = dim // num_heads self.scale = qk_scale or self.dim_head ** -0.5 - self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) - self.k = nn.Linear(dim, dim, bias_attr=qkv_bias) - self.v = nn.Linear(dim, dim, bias_attr=qkv_bias) + w_attr_1, b_attr_1 = self._init_weights() + self.q = nn.Linear(dim, dim, weight_attr=w_attr_1, bias_attr=b_attr_1 if qkv_bias else False) + w_attr_2, b_attr_2 = self._init_weights() + self.k = nn.Linear(dim, dim, weight_attr=w_attr_2, bias_attr=b_attr_2 if qkv_bias else False) + w_attr_3, b_attr_3 = self._init_weights() + self.v = nn.Linear(dim, dim, weight_attr=w_attr_3, bias_attr=b_attr_3 if qkv_bias else False) self.attn_dropout = nn.Dropout(attention_dropout) - self.proj = nn.Linear(dim, dim) + w_attr_4, b_attr_4 = self._init_weights() + self.proj = nn.Linear(dim, dim, weight_attr=w_attr_4, bias_attr=b_attr_4) self.proj_dropout = nn.Dropout(dropout) self.softmax = nn.Softmax(axis=-1) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): B, N, C = x.shape @@ -206,15 +215,24 @@ def __init__(self, self.dim_head = dim // num_heads self.scale = self.dim_head ** -0.5 - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + w_attr_1, b_attr_1 = self._init_weights() + self.qkv = nn.Linear(dim, dim * 3, weight_attr=w_attr_1, bias_attr=b_attr_1 if qkv_bias else False) self.attn_dropout = nn.Dropout(attention_dropout) self.softmax = nn.Softmax(axis=-1) - self.proj = nn.Linear(dim, dim) + w_attr_2, b_attr_2 = self._init_weights() + self.proj = nn.Linear(dim, dim, weight_attr=w_attr_2, bias_attr=b_attr_2) self.proj_dropout = nn.Dropout(dropout) # talking head - self.proj_l = nn.Linear(num_heads, num_heads) - self.proj_w = nn.Linear(num_heads, num_heads) + w_attr_3, b_attr_3 = self._init_weights() + self.proj_l = nn.Linear(num_heads, num_heads, weight_attr=w_attr_3, bias_attr=b_attr_3) + w_attr_4, b_attr_4 = self._init_weights() + self.proj_w = nn.Linear(num_heads, num_heads, weight_attr=w_attr_4, bias_attr=b_attr_4) + + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def transpose_multihead(self, x): new_shape = x.shape[:-1] + [self.num_heads, self.dim_head] @@ -280,14 +298,16 @@ def __init__(self, droppath=0., init_values=1e-4): super().__init__() - self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + w_attr_1, b_attr_1 = self._init_weights() + self.norm1 = nn.LayerNorm(dim, weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6) self.attn = ClassAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, dropout=dropout, attention_dropout=attention_dropout) self.drop_path = DropPath(droppath) if droppath > 0.
else Identity() - self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + w_attr_2, b_attr_2 = self._init_weights() + self.norm2 = nn.LayerNorm(dim, weight_attr=w_attr_2, bias_attr=b_attr_2, epsilon=1e-6) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), dropout=dropout) @@ -301,6 +321,11 @@ def __init__(self, dtype='float32', default_initializer=nn.initializer.Constant(init_values)) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x, x_cls): u = paddle.concat([x_cls, x], axis=1) @@ -346,14 +371,16 @@ def __init__(self, droppath=0., init_values=1e-4): super().__init__() - self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + w_attr_1, b_attr_1 = self._init_weights() + self.norm1 = nn.LayerNorm(dim, weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6) self.attn = TalkingHeadAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, dropout=dropout, attention_dropout=attention_dropout) self.drop_path = DropPath(droppath) if droppath > 0. else Identity() - self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + w_attr_2, b_attr_2 = self._init_weights() + self.norm2 = nn.LayerNorm(dim, weight_attr=w_attr_2, bias_attr=b_attr_2, epsilon=1e-6) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), dropout=dropout) @@ -367,6 +394,11 @@ def __init__(self, dtype='float32', default_initializer=nn.initializer.Constant(init_values)) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): h = x x = self.norm1(x) @@ -469,8 +501,23 @@ def __init__(self, layer_list.append(copy.deepcopy(block_layers)) self.blocks_token_only = nn.LayerList(layer_list) - self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) - self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else Identity() + w_attr_1, b_attr_1 = self._init_weights_norm() + self.norm = nn.LayerNorm(embed_dim, weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6) + w_attr_2, b_attr_2 = self._init_weights_linear() + self.head = nn.Linear(embed_dim, + num_classes, + weight_attr=w_attr_2, + bias_attr=b_attr_2) if num_classes > 0 else Identity() + + def _init_weights_norm(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + + def _init_weights_linear(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr def forward_features(self, x): # Patch Embedding diff --git a/image_classification/CaiT/config.py b/image_classification/CaiT/config.py index 1703b5ab..0e298229 100644 --- a/image_classification/CaiT/config.py +++ b/image_classification/CaiT/config.py @@ -36,7 +36,7 @@ _C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode _C.DATA.NUM_WORKERS = 2 # number of data loading threads _C.DATA.IMAGENET_MEAN = [0.485, 0.456, 0.406] # [0.5, 0.5, 0.5] -_C.DATA.IMAGENET_STD = [0.229, 0.224, 0.225] # [0.5, 0.5, 0.5] +_C.DATA.IMAGENET_STD = [0.229, 0.224, 0.225] # [0.5, 0.5, 0.5] # model settings _C.MODEL = CN() @@ -45,7 +45,8 @@ 
_C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None _C.MODEL.NUM_CLASSES = 1000 -_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.DROPPATH = 0.1 _C.MODEL.ATTENTION_DROPOUT = 0.0 # transformer settings @@ -65,13 +66,16 @@ _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 _C.TRAIN.NUM_EPOCHS = 300 -_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size -_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune -_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune -_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 -_C.TRAIN.END_LR = 5e-4 -_C.TRAIN.GRAD_CLIP = 1.0 -_C.TRAIN.ACCUM_ITER = 2 #1 +_C.TRAIN.WARMUP_EPOCHS = 5 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.0005 +_C.TRAIN.WARMUP_START_LR = 1e-6 +_C.TRAIN.END_LR = 1e-5 +_C.TRAIN.GRAD_CLIP = None +_C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.MODEL_EMA = True +_C.TRAIN.MODEL_EMA_DECAY = 0.99996 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -85,13 +89,36 @@ _C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW _C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 +# train augmentation +_C.TRAIN.MIXUP_ALPHA = 0.8 # mixup alpha, enabled if >0 +_C.TRAIN.CUTMIX_ALPHA = 1.0 # cutmix alpha, enabled if >0 +_C.TRAIN.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha +_C.TRAIN.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled +_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled +_C.TRAIN.MIXUP_MODE = 'batch' # how to apply mixup/cutmix params, per 'batch', 'pair' or 'elem' + +_C.TRAIN.SMOOTHING = 0.1 +_C.TRAIN.COLOR_JITTER = 0.4 # color jitter factor +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = True + +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 # random erase prob +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' # random erase mode +_C.TRAIN.RANDOM_ERASE_COUNT = 1 # random erase count +_C.TRAIN.RANDOM_ERASE_SPLIT = False + +_C.TRAIN.DISTILLATION_TYPE = 'hard' # hard, soft, none +_C.TRAIN.DISTILLATION_ALPHA = 0.5 +_C.TRAIN.DISTILLATION_TAU = 1.0 + + # misc _C.SAVE = "./output" _C.TAG = "default" -_C.SAVE_FREQ = 5 # freq to save chpt -_C.REPORT_FREQ = 100 # freq to logging info -_C.VALIDATE_FREQ = 100 # freq to do validation -_C.SEED = 0 +_C.SAVE_FREQ = 1 # freq to save chpt +_C.REPORT_FREQ = 50 # freq to logging info +_C.VALIDATE_FREQ = 10 # freq to do validation +_C.SEED = 42 _C.EVAL = False # run evaluation only _C.AMP = False # mix precision training _C.LOCAL_RANK = 0 diff --git a/image_classification/CaiT/datasets.py b/image_classification/CaiT/datasets.py index 40a99fc6..e06767df 100644 --- a/image_classification/CaiT/datasets.py +++ b/image_classification/CaiT/datasets.py @@ -19,8 +19,15 @@ import os import math +from PIL import Image from paddle.io import Dataset, DataLoader, DistributedBatchSampler from paddle.vision import transforms, datasets, image_load +from augment import auto_augment_policy_original +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from random_erasing import RandomErasing + class ImageNet2012Dataset(Dataset): """Build ImageNet2012 dataset @@ -60,7 +67,7 @@ def __len__(self): return len(self.label_list) def __getitem__(self, index): - data = image_load(self.img_path_list[index]).convert('RGB') + data = Image.open(self.img_path_list[index]).convert('RGB') data = self.transform(data) label = self.label_list[index] @@ -79,13 +86,36 @@ def get_train_transforms(config): Returns: transforms_train: training transforms """ - - 
transforms_train = transforms.Compose([ + aug_op_list = [] + # random crop and resize + aug_op_list.append( transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) + scale=(0.05, 1.0))) + # auto_augment / color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, + std=config.DATA.IMAGENET_STD)) + # random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + + transforms_train = transforms.Compose(aug_op_list) return transforms_train diff --git a/image_classification/CaiT/losses.py b/image_classification/CaiT/losses.py new file mode 100644 index 00000000..04377eac --- /dev/null +++ b/image_classification/CaiT/losses.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Implement Loss functions """ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class LabelSmoothingCrossEntropyLoss(nn.Layer): + """ cross entropy loss for label smoothing + Args: + smoothing: float, label smoothing rate + x: tensor, predictions (default is before softmax) with shape [N, num_classes] as default + target: tensor, target label with shape [N] as default + weight: tensor, optional, a manual rescaling weight given to each class + reduction: str, optional, indicate how to average the loss by batch_size, + default is ``'mean'``, the candicates are ``'none'`` | ``'mean'`` | ``'sum'`` + axis: int, optional, the index of dimension to perform softmax calculations, + default is ``-1``, if `axis` is not -1 -> the shape of x and target may not be default + use_softmax: bool, optional, if `use_softmax` is ``False``, ``x`` should be after softmax, + default is ``True``, the candicates are ``True`` | ``False`` + name: str, optional, the name of the operator, default is ``None``, + for more information, please refer to :ref:`api_guide_Name`. 
+ Return: + loss: float, cross entropy loss value + """ + def __init__(self, + smoothing=0.1, + weight=None, + reduction='mean', + axis=-1, + use_softmax=True, + name=None): + super().__init__() + assert 0 <= smoothing < 1.0 + self.smoothing = smoothing + self.weight = weight + self.reduction = reduction + self.axis = axis + self.use_softmax = use_softmax + self.name = name + + def forward(self, x, target): + target = paddle.nn.functional.one_hot(target, num_classes=x.shape[1]) + target = paddle.nn.functional.label_smooth(target, epsilon=self.smoothing) + loss = paddle.nn.functional.cross_entropy( + x, + target, + weight=self.weight, + reduction=self.reduction, + soft_label=True, + axis=self.axis, + use_softmax=self.use_softmax, + name=self.name) + return loss + + +class SoftTargetCrossEntropyLoss(nn.Layer): + """ cross entropy loss for soft target + Args: + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, soft target with shape [N, num_classes] + Returns: + loss: float, the mean loss value + """ + def __init__(self): + super().__init__() + + def forward(self, x, target): + loss = paddle.sum(-target * F.log_softmax(x, axis=-1), axis=-1) + return loss.mean() + + +class DistillationLoss(nn.Layer): + """Distillation loss function + This layer includes the orginal loss (criterion) and a extra + distillation loss (criterion), which computes the loss with + different type options, between current model and + a teacher model as its supervision. + + Args: + base_criterion: nn.Layer, the original criterion + teacher_model: nn.Layer, the teacher model as supervision + distillation_type: str, one of ['none', 'soft', 'hard'] + alpha: float, ratio of base loss (* (1-alpha)) + and distillation loss( * alpha) + tao: float, temperature in distillation + """ + def __init__(self, + base_criterion, + teacher_model, + distillation_type, + alpha, + tau): + super().__init__() + assert distillation_type in ['none', 'soft', 'hard'] + self.base_criterion = base_criterion + self.teacher_model = teacher_model + self.type = distillation_type + self.alpha = alpha + self.tau = tau + + def forward(self, inputs, outputs, targets): + """ + Args: + inputs: tensor, the orginal model inputs + outputs: tensor, the outputs of the model + outputds_kd: tensor, the distillation outputs of the model, + this is usually obtained by a separate branch + in the last layer of the model + targets: tensor, the labels for the base criterion + """ + outputs, outputs_kd = outputs[0], outputs[1] + base_loss = self.base_criterion(outputs, targets) + if self.type == 'none': + return base_loss + + with paddle.no_grad(): + teacher_outputs = self.teacher_model(inputs) + + if self.type == 'soft': + distillation_loss = F.kl_div( + F.log_softmax(outputs_kd / self.tau, axis=1), + F.log_softmax(teacher_outputs / self.tau, axis=1), + reduction='sum') * (self.tau * self.tau) / outputs_kd.numel() + elif self.type == 'hard': + distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(axis=1)) + + loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha + return loss + + diff --git a/image_classification/CaiT/main_multi_gpu.py b/image_classification/CaiT/main_multi_gpu.py index 8c300633..b0a3b1af 100644 --- a/image_classification/CaiT/main_multi_gpu.py +++ b/image_classification/CaiT/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
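The two new training criteria defined above can be exercised in isolation. The following minimal sketch is not part of the patched files; it assumes paddle is installed and the CaiT losses.py added above is on the import path, and the batch/class sizes are illustrative only. It shows which inputs each criterion expects: integer labels for the label-smoothing loss (used when SMOOTHING > 0 and mixup is off), soft targets for the soft-target loss (used when MIXUP_PROB > 0). DistillationLoss is omitted here because it additionally needs a teacher model.

import paddle
import paddle.nn.functional as F
from losses import LabelSmoothingCrossEntropyLoss, SoftTargetCrossEntropyLoss

logits = paddle.randn([4, 1000])                 # [N, num_classes] raw predictions
hard_labels = paddle.randint(0, 1000, [4])       # [N] integer class indices
soft_labels = F.softmax(paddle.randn([4, 1000]), axis=-1)  # e.g. mixup/cutmix soft targets

smooth_ce = LabelSmoothingCrossEntropyLoss(smoothing=0.1)
soft_ce = SoftTargetCrossEntropyLoss()

# scalar loss: cross entropy against smoothed one-hot targets
print(float(smooth_ce(logits, hard_labels)))
# scalar loss: mean over N of -sum(target * log_softmax(logits))
print(float(soft_ce(logits, soft_labels)))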
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,53 +27,54 @@ import paddle.distributed as dist from datasets import get_dataloader from datasets import get_dataset -from cait import build_cait as build_model from utils import AverageMeter from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from model_ema import ModelEma +from cait import build_cait as build_model -parser = argparse.ArgumentParser('CaiT') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -arguments = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, arguments) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('CaiT') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name 
is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -81,28 +82,43 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + model_ema=None, + mixup_fn=None, + amp=False, + local_logger=None, + master_logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch debug_steps: int, num of iters to log info, default: 100 accum_iter: int, num of iters for accumulating gradients, default: 1 + model_ema: ModelEma, model moving average instance + mixup_fn: Mixup, mixup instance, default: None amp: bool, if True, use mix precision training, default: False + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: - train_loss_meter.avg - train_acc_meter.avg - train_time + train_loss_meter.avg: float, average loss on current process/gpu + train_acc_meter.avg: float, average top1 accuracy on current process/gpu + master_train_loss_meter.avg: float, average loss on all processes/gpus + master_train_acc_meter.avg: float, average top1 accuracy on all processes/gpus + train_time: float, training time """ model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + master_train_loss_meter = AverageMeter() + master_train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() @@ -110,24 +126,26 @@ def train(dataloader, for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - else: + else: # full precision training output = model(image) loss = criterion(output, label) #NOTE: division may be needed depending on the loss function # Here no division is needed: # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' - # #loss = loss / accum_iter loss.backward() @@ -135,42 +153,86 @@ def train(dataloader, optimizer.step() optimizer.clear_grad() + if model_ema is not None and dist.get_rank() == 0: + model_ema.update(model) + pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) - batch_size = image.shape[0] - train_loss_meter.update(loss.numpy()[0], batch_size) - train_acc_meter.update(acc.numpy()[0], batch_size) + batch_size = paddle.to_tensor(image.shape[0]) - if batch_id % debug_steps == 0: - logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {train_loss_meter.avg:.4f}, " + - f"Avg Acc: {train_acc_meter.avg:.4f}") + # sync from 
other gpus for overall loss and acc + master_loss = loss.clone() + master_acc = acc.clone() + master_batch_size = batch_size.clone() + dist.all_reduce(master_loss) + dist.all_reduce(master_acc) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc = master_acc / dist.get_world_size() + master_train_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_train_acc_meter.update(master_acc.numpy()[0], master_batch_size.numpy()[0]) - train_time = time.time() - time_st - return train_loss_meter.avg, train_acc_meter.avg, train_time + train_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + train_acc_meter.update(acc.numpy()[0], batch_size.numpy()[0]) + if batch_id % debug_steps == 0: + if local_logger: + local_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_train_loss_meter.avg:.4f}, " + + f"Avg Acc: {master_train_acc_meter.avg:.4f}") -def validate(dataloader, model, criterion, total_batch, debug_steps=100): + train_time = time.time() - time_st + return (train_loss_meter.avg, + train_acc_meter.avg, + master_train_loss_meter.avg, + master_train_acc_meter.avg, + train_time) + + +def validate(dataloader, + model, + criterion, + total_batch, + debug_steps=100, + local_logger=None, + master_logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + debug_steps: int, num of iters to log info, default: 100 + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + master_val_loss_meter.avg: float, average loss on all processes/gpus + master_val_acc1_meter.avg: float, average top1 accuracy on all processes/gpus + master_val_acc5_meter.avg: float, average top5 accuracy on all processes/gpus + val_time: float, validation time """ model.eval() val_loss_meter = AverageMeter() val_acc1_meter = AverageMeter() val_acc5_meter = AverageMeter() + master_val_loss_meter = AverageMeter() + master_val_acc1_meter = AverageMeter() + master_val_acc5_meter = AverageMeter() time_st = time.time() with paddle.no_grad(): @@ -185,63 +247,144 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) - dist.all_reduce(loss) - dist.all_reduce(acc1) - dist.all_reduce(acc5) - loss = loss / dist.get_world_size() - acc1 = acc1 / dist.get_world_size() - acc5 = acc5 / dist.get_world_size() - batch_size = paddle.to_tensor(image.shape[0]) - dist.all_reduce(batch_size) + + master_loss = loss.clone() + master_acc1 = acc1.clone() + master_acc5 = acc5.clone() + master_batch_size = batch_size.clone() + + dist.all_reduce(master_loss) + 
dist.all_reduce(master_acc1) + dist.all_reduce(master_acc5) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc1 = master_acc1 / dist.get_world_size() + master_acc5 = master_acc5 / dist.get_world_size() + + master_val_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc1_meter.update(master_acc1.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc5_meter.update(master_acc5.numpy()[0], master_batch_size.numpy()[0]) val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) if batch_id % debug_steps == 0: - logger.info( - f"Val Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {val_loss_meter.avg:.4f}, " + - f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + - f"Avg Acc@5: {val_acc5_meter.avg:.4f}") - + if local_logger: + local_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {master_val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {master_val_acc5_meter.avg:.4f}") val_time = time.time() - time_st - return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + return (val_loss_meter.avg, + val_acc1_meter.avg, + val_acc5_meter.avg, + master_val_loss_meter.avg, + master_val_acc1_meter.avg, + master_val_acc5_meter.avg, + val_time) def main_worker(*args): - # 0. Preparation + # STEP 0: Preparation + config = args[0] dist.init_parallel_env() last_epoch = config.TRAIN.LAST_EPOCH - world_size = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + world_size = dist.get_world_size() + local_rank = dist.get_rank() seed = config.SEED + local_rank paddle.seed(seed) np.random.seed(seed) random.seed(seed) - # 1. Create model + # logger for each process/gpu + local_logger = get_logger( + filename=os.path.join(config.SAVE, 'log_{}.txt'.format(local_rank)), + logger_name='local_logger') + # overall logger + if local_rank == 0: + master_logger = get_logger( + filename=os.path.join(config.SAVE, 'log.txt'), + logger_name='master_logger') + master_logger.info(f'\n{config}') + else: + master_logger = None + local_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + if local_rank == 0: + master_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + + # STEP 1: Create model model = build_model(config) + # define model ema + model_ema = None + if not config.EVAL and config.TRAIN.MODEL_EMA and local_rank == 0: + model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY) model = paddle.DataParallel(model) - # 2. 
Create train and val dataloader - dataset_train, dataset_val = args[0], args[1] + + # STEP 2: Create train and val dataloader + dataset_train, dataset_val = args[1], args[2] # Create training dataloader if not config.EVAL: dataloader_train = get_dataloader(config, dataset_train, 'train', True) total_batch_train = len(dataloader_train) - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + local_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') if local_rank == 0: - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + master_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') # Create validation dataloader dataloader_val = get_dataloader(config, dataset_val, 'test', True) total_batch_val = len(dataloader_val) - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + local_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') if local_rank == 0: - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. Define optimizer and lr_scheduler + master_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -263,7 +406,9 @@ def main_worker(*args): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + local_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Scheduler: 
{config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") if config.TRAIN.OPTIMIZER.NAME == "SGD": @@ -290,80 +435,132 @@ def main_worker(*args): weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip, - #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + 'absolute_pos_embed', 'relative_position_bias_table']), ) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 5. Load pretrained model / load resumt model and optimizer states + # STEP 6: Load pretrained model / load resumt model and optimizer states if config.MODEL.PRETRAINED: if (config.MODEL.PRETRAINED).endswith('.pdparams'): raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) - logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + local_logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + if local_rank == 0: + master_logger.info( + f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) - logger.info( - f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + local_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + if local_rank == 0: + master_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + # load ema model + if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): + model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') + model_ema.module.set_state_dict(model_ema_state) + local_logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') + if local_rank == 0: + master_logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') - # 6. 
Validation + # STEP 7: Validation (eval mode) if config.EVAL: - logger.info('----- Start Validating') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info('----- Start Validating') + if local_rank == 0: + master_logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") return - # 6. Start training and validation - logging.info(f"Start training from epoch {last_epoch+1}.") + # STEP 8: Start training and validation (train mode) + local_logger.info(f"Start training from epoch {last_epoch+1}.") + if local_rank == 0: + master_logger.info(f"Start training from epoch {last_epoch+1}.") for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") - train_loss, train_acc, train_time = train(dataloader=dataloader_train, - model=model, - criterion=criterion, - optimizer=optimizer, - epoch=epoch, - total_batch=total_batch_train, - debug_steps=config.REPORT_FREQ, - accum_iter=config.TRAIN.ACCUM_ITER, - amp=config.AMP) + local_logger.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + if local_rank == 0: + master_logger.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, avg_loss, avg_acc, train_time = train( + dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + model_ema=model_ema, + mixup_fn=mixup_fn, + amp=config.AMP, + local_logger=local_logger, + master_logger=master_logger) + scheduler.step() - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Train Loss: {train_loss:.4f}, " + - f"Train Acc: {train_acc:.4f}, " + - f"time: {train_time:.2f}") + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {avg_loss:.4f}, " + + f"Train Acc: {avg_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: - logger.info(f'----- Validation after Epoch: {epoch}') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info(f'----- Validation after Epoch: {epoch}') + if local_rank == 0: + master_logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") # model save if local_rank == 0: if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: @@ -371,18 +568,38 @@ def main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - logger.info(f"----- Save model: {model_path}.pdparams") - logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") + if model_ema is not None: + model_ema_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}-EMA") + paddle.save(model_ema.state_dict(), model_ema_path + '.pdparams') + master_logger.info(f"----- Save ema model: {model_ema_path}.pdparams") def main(): + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + + # set output folder + if not config.EVAL: + config.SAVE = 
'{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + + if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + + # get dataset and start DDP if not config.EVAL: dataset_train = get_dataset(config, mode='train') else: dataset_train = None dataset_val = get_dataset(config, mode='val') config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS - dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + dist.spawn(main_worker, args=(config, dataset_train, dataset_val, ), nprocs=config.NGPUS) if __name__ == "__main__": diff --git a/image_classification/CaiT/main_single_gpu.py b/image_classification/CaiT/main_single_gpu.py index 5909ab7f..a1cecfd9 100644 --- a/image_classification/CaiT/main_single_gpu.py +++ b/image_classification/CaiT/main_single_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,55 +26,53 @@ import paddle.nn.functional as F from datasets import get_dataloader from datasets import get_dataset -from cait import build_cait as build_model from utils import AverageMeter from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from cait import build_cait as build_model -parser = argparse.ArgumentParser('CaiT') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -args = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, args) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) - -#config.freeze() - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('CaiT') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + 
parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -82,49 +80,59 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + model_ema=None, + mixup_fn=None, + amp=False, + logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info - accum_iter: int, num of iters for accumulating gradients - amp: bool, if True, use mix precision training + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + accum_iter: int, num of iters for accumulating gradients, default: 1 + model_ema: ModelEma, model moving average instance + mixup_fn: Mixup, mixup instance, default: None + amp: bool, if True, use mix precision training, default: False + logger: logger for logging, default: None Returns: - train_loss_meter.avg - train_acc_meter.avg - train_time + train_loss_meter.avg: float, average loss on current process/gpu + train_acc_meter.avg: float, average top1 accuracy on current process/gpu + train_time: float, training time """ model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() - for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - - else: + else: # full precision training output = model(image) loss = criterion(output, label) #NOTE: division may be needed depending on the loss function @@ -137,16 +145,22 @@ def train(dataloader, optimizer.step() optimizer.clear_grad() + if model_ema is not None: + 
model_ema.update(model) + pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) batch_size = image.shape[0] train_loss_meter.update(loss.numpy()[0], batch_size) train_acc_meter.update(acc.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + f"Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {train_loss_meter.avg:.4f}, " + f"Avg Acc: {train_acc_meter.avg:.4f}") @@ -155,19 +169,20 @@ def train(dataloader, return train_loss_meter.avg, train_acc_meter.avg, train_time -def validate(dataloader, model, criterion, total_batch, debug_steps=100): +def validate(dataloader, model, criterion, total_batch, debug_steps=100, logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + logger: logger for logging, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + val_time: float, valitaion time """ model.eval() val_loss_meter = AverageMeter() @@ -192,7 +207,7 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): val_acc1_meter.update(acc1.numpy()[0], batch_size) val_acc5_meter.update(acc5.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( f"Val Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {val_loss_meter.avg:.4f}, " + @@ -204,25 +219,81 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): def main(): - # 0. Preparation + # STEP 0: Preparation + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + # set output folder + if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) last_epoch = config.TRAIN.LAST_EPOCH seed = config.SEED paddle.seed(seed) np.random.seed(seed) random.seed(seed) - #paddle.set_device('gpu:0') - # 1. Create model + logger = get_logger(filename=os.path.join(config.SAVE, 'log.txt')) + logger.info(f'\n{config}') + + # STEP 1: Create model model = build_model(config) - # 2. 
Create train and val dataloader + # define model ema + model_ema = None + if not config.EVAL and config.TRAIN.MODEL_EMA: + model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY) + + # STEP 2: Create train and val dataloader if not config.EVAL: dataset_train = get_dataset(config, mode='train') dataloader_train = get_dataloader(config, dataset_train, 'train', False) dataset_val = get_dataset(config, mode='val') dataloader_val = get_dataloader(config, dataset_val, 'val', False) - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. Define lr_scheduler + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -231,8 +302,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, T_max=config.TRAIN.NUM_EPOCHS, @@ -244,9 +314,9 @@ def main(): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") - # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": if config.TRAIN.GRAD_CLIP: clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) @@ -266,59 +336,76 @@ def main(): optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, - weight_decay=config.TRAIN.WEIGHT_DECAY, beta1=config.TRAIN.OPTIMIZER.BETAS[0], beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, - grad_clip=clip) + grad_clip=clip, + apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + 'absolute_pos_embed', 'relative_position_bias_table']), + ) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 6. Load pretrained model or load resume model and optimizer states + + # STEP 6: Load pretrained model or load resume model and optimizer states if config.MODEL.PRETRAINED: - assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) logger.info( - f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") - # 7. Validation + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + # load ema model + if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): + model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') + model_ema.module.set_state_dict(model_ema_state) + logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') + + # STEP 7: Validation (eval mode) if config.EVAL: logger.info('----- Start Validating') val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + f"Validation Acc@5: {val_acc5:.4f}, " + f"time: {val_time:.2f}") return - # 8. Start training and validation - logging.info(f"Start training from epoch {last_epoch+1}.") + + # STEP 8: Start training and validation (train mode) + logger.info(f"Start training from epoch {last_epoch+1}.") for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + logger.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") train_loss, train_acc, train_time = train(dataloader=dataloader_train, model=model, criterion=criterion, optimizer=optimizer, epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, total_batch=len(dataloader_train), debug_steps=config.REPORT_FREQ, accum_iter=config.TRAIN.ACCUM_ITER, + model_ema=model_ema, + mixup_fn=mixup_fn, amp=config.AMP, - ) + logger=logger) scheduler.step() logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Train Loss: {train_loss:.4f}, " + @@ -330,9 +417,10 @@ def main(): val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + @@ -346,6 +434,11 @@ def main(): paddle.save(optimizer.state_dict(), model_path + '.pdopt') logger.info(f"----- Save model: {model_path}.pdparams") logger.info(f"----- Save optim: {model_path}.pdopt") + if model_ema is not None: + model_ema_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}-EMA") + paddle.save(model_ema.state_dict(), model_ema_path + '.pdparams') + logger.info(f"----- Save ema model: {model_ema_path}.pdparams") if __name__ == "__main__": diff --git a/image_classification/CaiT/mixup.py b/image_classification/CaiT/mixup.py new file mode 100644 index 00000000..1d2db493 --- /dev/null +++ b/image_classification/CaiT/mixup.py @@ -0,0 +1,225 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""mixup and cutmix for batch data""" +import numpy as np +import paddle + + +def rand_bbox(image_shape, lam, count=None): + """ CutMix bbox by lam value + Generate 1 random bbox by value lam. lam is the cut size rate. + The cut_size is computed by sqrt(1-lam) * image_size. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + count: int, number of bbox to generate + """ + image_h, image_w = image_shape[-2:] + cut_rate = np.sqrt(1. - lam) + cut_h = int(cut_rate * image_h) + cut_w = int(cut_rate * image_w) + + # get random bbox center + cy = np.random.randint(0, image_h, size=count) + cx = np.random.randint(0, image_w, size=count) + + # get bbox coords + bbox_x1 = np.clip(cx - cut_w // 2, 0, image_w) + bbox_y1 = np.clip(cy - cut_h // 2, 0, image_h) + bbox_x2 = np.clip(cx + cut_w // 2, 0, image_w) + bbox_y2 = np.clip(cy + cut_h // 2, 0, image_h) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # while in pytorch, it will return [] tensor + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def rand_bbox_minmax(image_shape, minmax, count=None): + """ CutMix bbox by min and max value + Generate 1 random bbox by min and max percentage values. 
+ Minmax is a tuple/list of min and max percentage vlaues + applied to the image width and height. + + Args: + image_shape: tuple/list, image height and width + minmax: tuple/list, min and max percentage values of image size + count: int, number of bbox to generate + """ + assert len(minmax) == 2 + image_h, image_w = image_shape[-2:] + min_ratio = minmax[0] + max_ratio = minmax[1] + cut_h = np.random.randint(int(image_h * min_ratio), int(image_h * max_ratio), size=count) + cut_w = np.random.randint(int(image_w * min_ratio), int(image_w * max_ratio), size=count) + + bbox_x1 = np.random.randint(0, image_w - cut_w, size=count) + bbox_y1 = np.random.randint(0, image_h - cut_h, size=count) + bbox_x2 = bbox_x1 + cut_w + bbox_y2 = bbox_y1 + cut_h + + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def cutmix_generate_bbox_adjust_lam(image_shape, lam, minmax=None, correct_lam=True, count=None): + """Generate bbox and apply correction for lambda + If the mimmax is None, apply the standard cutmix by lam value, + If the minmax is set, apply the cutmix by min and max percentage values. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + minmax: tuple/list, min and max percentage values of image size + correct_lam: bool, if True, correct the lam value by the generated bbox + count: int, number of bbox to generate + """ + if minmax is not None: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox_minmax(image_shape, minmax, count) + else: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox(image_shape, lam, count) + + if correct_lam or minmax is not None: + image_h, image_w = image_shape[-2:] + bbox_area = (bbox_y2 - bbox_y1) * (bbox_x2 - bbox_x1) + lam = 1. - bbox_area / float(image_h * image_w) + return (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam + + +def one_hot(x, num_classes, on_value=1., off_value=0.): + """ Generate one-hot vector for label smoothing + Args: + x: tensor, contains label/class indices + num_classes: int, num of classes (len of the one-hot vector) + on_value: float, the vector value at label index, default=1. + off_value: float, the vector value at non-label indices, default=0. + Returns: + one_hot: tensor, tensor with on value at label index and off value + at non-label indices. + """ + x = x.reshape_([-1, 1]) + x_smoothed = paddle.full((x.shape[0], num_classes), fill_value=off_value) + for i in range(x.shape[0]): + x_smoothed[i, x[i]] = on_value + return x_smoothed + + +def mixup_one_hot(label, num_classes, lam=1., smoothing=0.): + """ mixup and label smoothing in batch + label smoothing is firstly applied, then + mixup is applied by mixing the bacth and its flip, + with a mixup rate. + + Args: + label: tensor, label tensor with shape [N], contains the class indices + num_classes: int, num of all classes + lam: float, mixup rate, default=1.0 + smoothing: float, label smoothing rate + """ + off_value = smoothing / num_classes + on_value = 1. 
- smoothing + off_value + y1 = one_hot(label, num_classes, on_value, off_value) + y2 = one_hot(label.flip(axis=[0]), num_classes, on_value, off_value) + return y2 * (1 - lam) + y1 * lam + + +class Mixup: + """Mixup class + Args: + mixup_alpha: float, mixup alpha for beta distribution, default=1.0, + cutmix_alpha: float, cutmix alpha for beta distribution, default=0.0, + cutmix_minmax: list/tuple, min and max value for cutmix ratio, default=None, + prob: float, if random prob < prob, do not use mixup, default=1.0, + switch_prob: float, prob of switching mixup and cutmix, default=0.5, + mode: string, mixup up, now only 'batch' is supported, default='batch', + correct_lam: bool, if True, apply correction of lam, default=True, + label_smoothing: float, label smoothing rate, default=0.1, + num_classes: int, num of classes, default=1000 + """ + def __init__(self, + mixup_alpha=1.0, + cutmix_alpha=0.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + correct_lam=True, + label_smoothing=0.1, + num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if cutmix_minmax is not None: + assert len(cutmix_minmax) == 2 + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam + assert mode == 'batch', 'Now only batch mode is supported!' + + def __call__(self, x, target): + assert x.shape[0] % 2 == 0, "Batch size should be even" + lam = self._mix_batch(x) + target = mixup_one_hot(target, self.num_classes, lam, self.label_smoothing) + return x, target + + def get_params(self): + """Decide to use cutmix or regular mixup by sampling and + sample lambda for mixup + """ + lam = 1. + use_cutmix = False + use_mixup = np.random.rand() < self.mix_prob + if use_mixup: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + alpha = self.cutmix_alpha if use_cutmix else self.mixup_alpha + lam_mix = np.random.beta(alpha, alpha) + elif self.mixup_alpha == 0. and self.cutmix_alpha > 0.: + use_cutmix=True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + elif self.mixup_alpha > 0. 
and self.cutmix_alpha == 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + else: + raise ValueError('mixup_alpha and cutmix_alpha cannot be all 0') + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_batch(self, x): + """mixup/cutmix by adding batch data and its flipped version""" + lam, use_cutmix = self.get_params() + if lam == 1.: + return lam + if use_cutmix: + (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam = cutmix_generate_bbox_adjust_lam( + x.shape, + lam, + minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # but in pytorch, it will return [] tensor without errors + if int(bbox_x1) != int(bbox_x2) and int(bbox_y1) != int(bbox_y2): + x[:, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] = x.flip(axis=[0])[ + :, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] + else: + x_flipped = x.flip(axis=[0]) + x_flipped = x_flipped * (1 - lam) + x.set_value(x * (lam) + x_flipped) + return lam diff --git a/image_classification/CaiT/model_ema.py b/image_classification/CaiT/model_ema.py new file mode 100644 index 00000000..8a636765 --- /dev/null +++ b/image_classification/CaiT/model_ema.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Implement the Exponential Model Averaging +This is paddle hack from: +https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py +""" + +import copy +from collections import OrderedDict +import paddle +import paddle.nn as nn + + +class ModelEma: + """Model Ema + A moving average is kept of model weights and buffers. + Note that for multiple gpu, ema must be defined after mode init, + but before DataParallel. 
+
+    Args:
+        model: nn.Layer, original model with learnable params
+        decay: float, decay rate for each update, default: 0.999
+    """
+    def __init__(self, model, decay=0.999):
+        self.module = copy.deepcopy(model)
+        self.module.eval()
+        self.decay = decay
+
+    @paddle.no_grad()
+    def _update(self, model, update_fn):
+        # update ema model parameters by model parameters
+        for (_, ema_param), (_, model_param) in zip(
+                self.module.named_parameters(), model.named_parameters()):
+            ema_param.set_value(copy.deepcopy(update_fn(ema_param, model_param)))
+
+        # update ema model buffers by model buffers
+        for (_, ema_buf), (_, model_buf) in zip(
+                self.module.named_buffers(), model.named_buffers()):
+            ema_buf.set_value(copy.deepcopy(update_fn(ema_buf, model_buf)))
+
+    def update(self, model):
+        self._update(model, update_fn=lambda e, m: self.decay * e + (1 - self.decay) * m)
+
+    def set(self, model):
+        self._update(model, update_fn=lambda e, m: m)
+
+    def state_dict(self):
+        return self.module.state_dict()
+
diff --git a/image_classification/CaiT/random_erasing.py b/image_classification/CaiT/random_erasing.py
new file mode 100644
index 00000000..31eea465
--- /dev/null
+++ b/image_classification/CaiT/random_erasing.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2021 PPViT Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Random Erasing for image tensor"""
+
+import random
+import math
+import paddle
+
+
+def _get_pixels(per_pixel, rand_color, patch_size, dtype="float32"):
+    if per_pixel:
+        return paddle.normal(shape=patch_size).astype(dtype)
+    if rand_color:
+        return paddle.normal(shape=(patch_size[0], 1, 1)).astype(dtype)
+    return paddle.zeros((patch_size[0], 1, 1)).astype(dtype)
+
+
+class RandomErasing(object):
+    """
+    Args:
+        prob: probability of performing random erasing
+        min_area: Minimum percentage of erased area wrt input image area
+        max_area: Maximum percentage of erased area wrt input image area
+        min_aspect: Minimum aspect ratio of erased area
+        max_aspect: Maximum aspect ratio of erased area
+        mode: pixel color mode, in ['const', 'rand', 'pixel']
+            'const' - erase block is constant valued 0 for all channels
+            'rand' - erase block is valued random color (same per-channel)
+            'pixel' - erase block is valued random color per pixel
+        min_count: Minimum # of erasing blocks per image.
+        max_count: Maximum # of erasing blocks per image.
Area per box is scaled by count + per-image count is randomly chosen between min_count to max_count + """ + def __init__(self, prob=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0): + self.prob = prob + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == "rand": + self.rand_color = True + elif mode == "pixel": + self.per_pixel = True + else: + assert not mode or mode == "const" + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.prob: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top+h, left:left+w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + + +#def main(): +# re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='rand') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='const') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='pixel') +# import PIL.Image as Image +# import numpy as np +# paddle.set_device('cpu') +# img = paddle.to_tensor(np.asarray(Image.open('./lenna.png'))).astype('float32') +# img = img / 255.0 +# img = paddle.transpose(img, [2, 0, 1]) +# new_img = re(img) +# new_img = new_img * 255.0 +# new_img = paddle.transpose(new_img, [1, 2, 0]) +# new_img = new_img.cpu().numpy() +# new_img = Image.fromarray(new_img.astype('uint8')) +# new_img.save('./res.png') +# +# +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/ConvMLP/augment.py b/image_classification/ConvMLP/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/ConvMLP/augment.py +++ b/image_classification/ConvMLP/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/ConvMLP/config.py b/image_classification/ConvMLP/config.py index 982be600..fee70c77 100644 --- a/image_classification/ConvMLP/config.py +++ b/image_classification/ConvMLP/config.py @@ -46,8 +46,9 @@ _C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None 
_C.MODEL.NUM_CLASSES = 1000 -_C.MODEL.DROPPATH = 0. -_C.MODEL.DROPOUT = 0. +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 # transformer settings _C.MODEL.MIXER = CN() @@ -63,13 +64,11 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 20 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.BASE_LR = 0.001 _C.TRAIN.WARMUP_START_LR = 5e-7 _C.TRAIN.END_LR = 5e-6 _C.TRAIN.GRAD_CLIP = 5.0 _C.TRAIN.ACCUM_ITER = 1 -_C.TRAIN.MODEL_EMA = True -_C.TRAIN.MODEL_EMA_DECAY = 0.99996 _C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() @@ -94,36 +93,23 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" _C.SAVE_FREQ = 1 # freq to save chpt _C.REPORT_FREQ = 50 # freq to logging info -_C.VALIDATE_FREQ = 10 # freq to do validation +_C.VALIDATE_FREQ = 20 # freq to do validation _C.SEED = 0 _C.EVAL = False # run evaluation only -_C.AMP = False +_C.AMP = False # mix precision training _C.LOCAL_RANK = 0 _C.NGPUS = -1 @@ -157,6 +143,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/ConvMLP/configs/convmlp_l.yaml b/image_classification/ConvMLP/configs/convmlp_l.yaml index a350db02..cb472824 100644 --- a/image_classification/ConvMLP/configs/convmlp_l.yaml +++ b/image_classification/ConvMLP/configs/convmlp_l.yaml @@ -10,7 +10,7 @@ MODEL: MLP_RATIOS: [3, 3, 3] CHANNELS: 96 N_CONV_BLOCKS: 3 - DROPPATH: 0.1 + DROP_PATH: 0.1 TRAIN: NUM_EPOCHS: 300 WARMUP_EPOCHS: 5 diff --git a/image_classification/ConvMLP/configs/convmlp_m.yaml b/image_classification/ConvMLP/configs/convmlp_m.yaml index cea2bd28..e47727c9 100644 --- a/image_classification/ConvMLP/configs/convmlp_m.yaml +++ b/image_classification/ConvMLP/configs/convmlp_m.yaml @@ -10,7 +10,7 @@ MODEL: MLP_RATIOS: [3, 3, 3] CHANNELS: 64 N_CONV_BLOCKS: 3 - DROPPATH: 0.1 + DROP_PATH: 0.1 TRAIN: NUM_EPOCHS: 300 WARMUP_EPOCHS: 5 diff --git a/image_classification/ConvMLP/configs/convmlp_s.yaml b/image_classification/ConvMLP/configs/convmlp_s.yaml index c0378578..d8be7da8 100644 --- a/image_classification/ConvMLP/configs/convmlp_s.yaml +++ b/image_classification/ConvMLP/configs/convmlp_s.yaml @@ -11,7 +11,7 @@ MODEL: MLP_RATIOS: [2, 2, 2] CHANNELS: 64 N_CONV_BLOCKS: 2 - DROPPATH: 0.1 + DROP_PATH: 0.1 TRAIN: NUM_EPOCHS: 300 WARMUP_EPOCHS: 
5 diff --git a/image_classification/ConvMLP/convmlp.py b/image_classification/ConvMLP/convmlp.py index 8e57683f..92b40004 100644 --- a/image_classification/ConvMLP/convmlp.py +++ b/image_classification/ConvMLP/convmlp.py @@ -311,7 +311,7 @@ def build_convmlp(config): n_conv_blocks=config.MODEL.MIXER.N_CONV_BLOCKS, classifier_head=True, num_classes=config.MODEL.NUM_CLASSES, - droppath=config.MODEL.DROPPATH, + droppath=config.MODEL.DROP_PATH, ) return model diff --git a/image_classification/ConvMLP/datasets.py b/image_classification/ConvMLP/datasets.py index b120fa00..304df9a3 100644 --- a/image_classification/ConvMLP/datasets.py +++ b/image_classification/ConvMLP/datasets.py @@ -19,6 +19,7 @@ import os import math +from PIL import Image from paddle.io import Dataset from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -27,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -99,9 +102,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/ConvMLP/droppath.py b/image_classification/ConvMLP/droppath.py index 08472aea..c8fe8048 100644 --- a/image_classification/ConvMLP/droppath.py +++ b/image_classification/ConvMLP/droppath.py @@ -16,10 +16,29 @@ Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth """ -import numpy as np import paddle import paddle.nn as nn +def drop_path(inputs, drop_prob=0., training=False): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if drop_prob == 0. or not training: + return inputs + keep_prob = 1 - drop_prob + keep_prob = paddle.to_tensor(keep_prob) + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + class DropPath(nn.Layer): """DropPath class""" @@ -27,36 +46,5 @@ def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob - def drop_path(self, inputs): - """drop path op - Args: - input: tensor with arbitrary shape - drop_prob: float number of drop path probability, default: 0.0 - training: bool, if current mode is training, default: False - Returns: - output: output tensor after drop path - """ - # if prob is 0 or eval mode, return original input - if self.drop_prob == 0. 
or not self.training: - return inputs - keep_prob = 1 - self.drop_prob - keep_prob = paddle.to_tensor(keep_prob, dtype='float32') - shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) - random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) - random_tensor = random_tensor.floor() # mask - output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation - return output - def forward(self, inputs): - return self.drop_path(inputs) - - -#def main(): -# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') -# dp = DropPath(0.5) -# for i in range(100): -# out = dp(tmp) -# print(out) -# -#if __name__ == "__main__": -# main() + return drop_path(inputs, self.drop_prob, self.training) diff --git a/image_classification/ConvMLP/losses.py b/image_classification/ConvMLP/losses.py index f67780a2..082467a3 100644 --- a/image_classification/ConvMLP/losses.py +++ b/image_classification/ConvMLP/losses.py @@ -119,3 +119,5 @@ def forward(self, inputs, outputs, targets): loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha return loss + + diff --git a/image_classification/ConvMLP/main_multi_gpu.py b/image_classification/ConvMLP/main_multi_gpu.py index 8c37f3f3..e91d5efd 100644 --- a/image_classification/ConvMLP/main_multi_gpu.py +++ b/image_classification/ConvMLP/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,14 +29,11 @@ from datasets import get_dataset from utils import AverageMeter from utils import WarmupCosineScheduler -from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss -from model_ema import ModelEma from convmlp import build_convmlp as build_model @@ -50,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -87,7 +85,6 @@ def train(dataloader, total_batch, debug_steps=100, accum_iter=1, - model_ema=None, mixup_fn=None, amp=False, local_logger=None, @@ -107,11 +104,9 @@ def train(dataloader, local_logger: logger for local process/gpu, default: None master_logger: logger for main process, default: None Returns: - train_loss_meter.avg: float, average loss on current process/gpu - train_acc_meter.avg: float, average top1 accuracy on current process/gpu - master_train_loss_meter.avg: float, average loss on all processes/gpus - master_train_acc_meter.avg: float, average top1 accuracy on all processes/gpus - train_time: float, training time + train_loss_meter.avg + train_acc_meter.avg + train_time """ model.train() train_loss_meter = AverageMeter() @@ -153,9 +148,6 @@ def train(dataloader, optimizer.step() optimizer.clear_grad() - if model_ema is not None and dist.get_rank() == 0: - model_ema.update(model) - pred = F.softmax(output) if mixup_fn: acc = paddle.metric.accuracy(pred, 
label_orig) @@ -322,10 +314,6 @@ def main_worker(*args): # STEP 1: Create model model = build_model(config) - # define model ema - model_ema = None - if not config.EVAL and config.TRAIN.MODEL_EMA and local_rank == 0: - model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY) model = paddle.DataParallel(model) # STEP 2: Create train and val dataloader @@ -435,8 +423,8 @@ def main_worker(*args): weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -468,13 +456,6 @@ def main_worker(*args): if local_rank == 0: master_logger.info( f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") - # load ema model - if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): - model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') - model_ema.module.set_state_dict(model_ema_state) - local_logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') - if local_rank == 0: - master_logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') # STEP 7: Validation (eval mode) if config.EVAL: @@ -519,7 +500,6 @@ def main_worker(*args): total_batch=total_batch_train, debug_steps=config.REPORT_FREQ, accum_iter=config.TRAIN.ACCUM_ITER, - model_ema=model_ema, mixup_fn=mixup_fn, amp=config.AMP, local_logger=local_logger, @@ -570,11 +550,6 @@ def main_worker(*args): paddle.save(optimizer.state_dict(), model_path + '.pdopt') master_logger.info(f"----- Save model: {model_path}.pdparams") master_logger.info(f"----- Save optim: {model_path}.pdopt") - if model_ema is not None: - model_ema_path = os.path.join( - config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}-EMA") - paddle.save(model_ema.state_dict(), model_ema_path + '.pdparams') - master_logger.info(f"----- Save ema model: {model_ema_path}.pdparams") def main(): diff --git a/image_classification/ConvMLP/main_single_gpu.py b/image_classification/ConvMLP/main_single_gpu.py index 8195845f..27e8de97 100644 --- a/image_classification/ConvMLP/main_single_gpu.py +++ b/image_classification/ConvMLP/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
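The AdamW hunk above comments out the apply_decay_param_fun hook instead of deleting the helper; get_exclude_from_weight_decay_fn is still defined in ConvMLP's utils.py. For reference, a minimal sketch of how that hook is wired into paddle.optimizer.AdamW when it is enabled (the toy model below is a placeholder, not code from this patch; 0.001 and 0.05 are the ConvMLP BASE_LR/WEIGHT_DECAY values):

    import paddle
    import paddle.nn as nn
    from utils import get_exclude_from_weight_decay_fn  # helper kept in utils.py

    model = nn.Linear(8, 8)  # placeholder model; the training code uses build_model(config)
    no_decay_fn = get_exclude_from_weight_decay_fn(
        ['absolute_pos_embed', 'relative_position_bias_table'])

    optimizer = paddle.optimizer.AdamW(
        learning_rate=0.001,                # TRAIN.BASE_LR
        parameters=model.parameters(),
        weight_decay=0.05,                  # TRAIN.WEIGHT_DECAY
        # the helper returns False for parameter names ending with an entry of the
        # exclude list, so those tensors are skipped by weight decay
        apply_decay_param_fun=no_decay_fn)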
@@ -35,8 +34,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss -from model_ema import ModelEma from convmlp import build_convmlp as build_model @@ -50,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -87,7 +85,6 @@ def train(dataloader, total_batch, debug_steps=100, accum_iter=1, - model_ema=None, mixup_fn=None, amp=False, logger=None): @@ -101,7 +98,6 @@ def train(dataloader, total_batch: int, total num of batches for one epoch debug_steps: int, num of iters to log info, default: 100 accum_iter: int, num of iters for accumulating gradients, default: 1 - model_ema: ModelEma, model moving average instance mixup_fn: Mixup, mixup instance, default: None amp: bool, if True, use mix precision training, default: False logger: logger for logging, default: None @@ -148,9 +144,6 @@ def train(dataloader, optimizer.step() optimizer.clear_grad() - if model_ema is not None: - model_ema.update(model) - pred = F.softmax(output) if mixup_fn: acc = paddle.metric.accuracy(pred, label_orig) @@ -244,10 +237,6 @@ def main(): # STEP 1: Create model model = build_model(config) - # define model ema - model_ema = None - if not config.EVAL and config.TRAIN.MODEL_EMA: - model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY) # STEP 2: Create train and val dataloader if not config.EVAL: @@ -256,7 +245,6 @@ def main(): dataset_val = get_dataset(config, mode='val') dataloader_val = get_dataloader(config, dataset_val, 'val', False) - # STEP 3: Define Mixup function mixup_fn = None if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: @@ -344,10 +332,7 @@ def main(): beta2=config.TRAIN.OPTIMIZER.BETAS[1], weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, - grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), - ) + grad_clip=clip) else: logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -366,16 +351,11 @@ def main(): assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( - f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") - # load ema model - if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): - model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') - model_ema.module.set_state_dict(model_ema_state) - logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') - + f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") + # STEP 7: Validation (eval mode) if config.EVAL: logger.info('----- Start Validating') @@ -406,7 +386,6 @@ def main(): total_batch=len(dataloader_train), debug_steps=config.REPORT_FREQ, 
accum_iter=config.TRAIN.ACCUM_ITER, - model_ema=model_ema, mixup_fn=mixup_fn, amp=config.AMP, logger=logger) @@ -438,11 +417,6 @@ def main(): paddle.save(optimizer.state_dict(), model_path + '.pdopt') logger.info(f"----- Save model: {model_path}.pdparams") logger.info(f"----- Save optim: {model_path}.pdopt") - if model_ema is not None: - model_ema_path = os.path.join( - config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}-EMA") - paddle.save(model_ema.state_dict(), model_ema_path + '.pdparams') - logger.info(f"----- Save ema model: {model_ema_path}.pdparams") if __name__ == "__main__": diff --git a/image_classification/ConvMLP/random_erasing.py b/image_classification/ConvMLP/random_erasing.py index 80d31dd8..31eea465 100644 --- a/image_classification/ConvMLP/random_erasing.py +++ b/image_classification/ConvMLP/random_erasing.py @@ -83,15 +83,15 @@ def _erase(self, img, chan, img_h, img_w, dtype): dtype=dtype) break - def __call__(self, inputs): - if len(inputs.shape) == 3: - self._erase(inputs, *inputs.shape, inputs.dtype) + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) else: - batch_size, chan, img_h, img_w = inputs.shape + batch_size, chan, img_h, img_w = input.shape batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 for i in range(batch_start, batch_size): - self._erase(inputs[i], chan, img_h, img_w, inputs.dtype) - return inputs + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input diff --git a/image_classification/ConvMLP/transforms.py b/image_classification/ConvMLP/transforms.py index 676fe1ff..5a046912 100644 --- a/image_classification/ConvMLP/transforms.py +++ b/image_classification/ConvMLP/transforms.py @@ -1,3 +1,4 @@ +import random import paddle import paddle.nn import paddle.vision.transforms as T diff --git a/image_classification/ConvMLP/utils.py b/image_classification/ConvMLP/utils.py index f5bdb636..44800527 100644 --- a/image_classification/ConvMLP/utils.py +++ b/image_classification/ConvMLP/utils.py @@ -1,120 +1,120 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""utils for ViT - -Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training -and WarmupCosineScheduler for training - -""" - -import math -from paddle.optimizer.lr import LRScheduler - - -class AverageMeter(): - """ Meter for monitoring losses""" - def __init__(self): - self.avg = 0 - self.sum = 0 - self.cnt = 0 - self.reset() - - def reset(self): - """reset all values to zeros""" - self.avg = 0 - self.sum = 0 - self.cnt = 0 - - def update(self, val, n=1): - """update avg by val and n, where val is the avg of n values""" - self.sum += val * n - self.cnt += n - self.avg = self.sum / self.cnt - - - -def get_exclude_from_weight_decay_fn(exclude_list=[]): - """ Set params with no weight decay during the training - - For certain params, e.g., positional encoding in ViT, weight decay - may not needed during the learning, this method is used to find - these params. - - Args: - exclude_list: a list of params names which need to exclude - from weight decay. - Returns: - exclude_from_weight_decay_fn: a function returns True if param - will be excluded from weight decay - """ - if len(exclude_list) == 0: - exclude_from_weight_decay_fn = None - else: - def exclude_fn(param): - for name in exclude_list: - if param.endswith(name): - return False - return True - exclude_from_weight_decay_fn = exclude_fn - return exclude_from_weight_decay_fn - - -class WarmupCosineScheduler(LRScheduler): - """Warmup Cosine Scheduler - - First apply linear warmup, then apply cosine decay schedule. - Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" - Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining - "total_epochs - warmup_epochs" - - Attributes: - learning_rate: the starting learning rate (without warmup), not used here! - warmup_start_lr: warmup starting learning rate - start_lr: the starting learning rate (without warmup) - end_lr: the ending learning rate after whole loop - warmup_epochs: # of epochs for warmup - total_epochs: # of total epochs (include warmup) - """ - def __init__(self, - learning_rate, - warmup_start_lr, - start_lr, - end_lr, - warmup_epochs, - total_epochs, - cycles=0.5, - last_epoch=-1, - verbose=False): - """init WarmupCosineScheduler """ - self.warmup_epochs = warmup_epochs - self.total_epochs = total_epochs - self.warmup_start_lr = warmup_start_lr - self.start_lr = start_lr - self.end_lr = end_lr - self.cycles = cycles - super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) - - def get_lr(self): - """ return lr value """ - if self.last_epoch < self.warmup_epochs: - val = (self.start_lr - self.warmup_start_lr) * float( - self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr - return val - - progress = float(self.last_epoch - self.warmup_epochs) / float( - max(1, self.total_epochs - self.warmup_epochs)) - val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) - val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) - return val +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/ConvMixer/augment.py b/image_classification/ConvMixer/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/ConvMixer/augment.py +++ b/image_classification/ConvMixer/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/ConvMixer/config.py b/image_classification/ConvMixer/config.py index 056b58a1..8e54481a 100644 --- a/image_classification/ConvMixer/config.py +++ b/image_classification/ConvMixer/config.py @@ -92,36 +92,23 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" _C.SAVE_FREQ = 1 # freq to save chpt _C.REPORT_FREQ = 50 # freq to logging info -_C.VALIDATE_FREQ = 10 # freq to do validation +_C.VALIDATE_FREQ = 20 # freq to do validation _C.SEED = 0 _C.EVAL = False # run evaluation only -_C.AMP = False +_C.AMP = False # mix precision training _C.LOCAL_RANK = 0 _C.NGPUS = -1 @@ -155,6 +142,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/ConvMixer/datasets.py b/image_classification/ConvMixer/datasets.py index b120fa00..304df9a3 100644 --- a/image_classification/ConvMixer/datasets.py +++ b/image_classification/ConvMixer/datasets.py @@ -19,6 +19,7 @@ import os import math +from PIL import Image from paddle.io import Dataset from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -27,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -99,9 +102,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = 
AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/ConvMixer/droppath.py b/image_classification/ConvMixer/droppath.py new file mode 100644 index 00000000..c8fe8048 --- /dev/null +++ b/image_classification/ConvMixer/droppath.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + +def drop_path(inputs, drop_prob=0., training=False): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if drop_prob == 0. or not training: + return inputs + keep_prob = 1 - drop_prob + keep_prob = paddle.to_tensor(keep_prob) + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, inputs): + return drop_path(inputs, self.drop_prob, self.training) diff --git a/image_classification/ConvMixer/losses.py b/image_classification/ConvMixer/losses.py index f67780a2..082467a3 100644 --- a/image_classification/ConvMixer/losses.py +++ b/image_classification/ConvMixer/losses.py @@ -119,3 +119,5 @@ def forward(self, inputs, outputs, targets): loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha return loss + + diff --git a/image_classification/ConvMixer/main_multi_gpu.py b/image_classification/ConvMixer/main_multi_gpu.py index 0df90fd1..91ad7c7a 100644 --- a/image_classification/ConvMixer/main_multi_gpu.py +++ b/image_classification/ConvMixer/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
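The new droppath.py above gives ConvMixer the same functional drop_path plus DropPath layer as the other models; dividing by keep_prob keeps the expected output unchanged when a sample's branch is dropped. A minimal usage sketch on a residual branch (the toy block and sizes here are assumptions for illustration, not code from this patch):

    import paddle
    import paddle.nn as nn
    from droppath import DropPath  # module added by this patch

    class ToyResidualBlock(nn.Layer):
        """Residual block whose branch is randomly dropped per sample during training."""
        def __init__(self, dim, drop_path_rate=0.1):
            super().__init__()
            self.fc = nn.Linear(dim, dim)
            self.drop_path = DropPath(drop_path_rate)

        def forward(self, x):
            return x + self.drop_path(self.fc(x))

    block = ToyResidualBlock(dim=16)
    block.train()                        # drop path is a no-op in eval mode
    out = block(paddle.randn([4, 16]))   # out.shape == [4, 16]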
@@ -29,13 +29,11 @@ from datasets import get_dataset from utils import AverageMeter from utils import WarmupCosineScheduler -from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from convmixer import build_convmixer as build_model @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -105,11 +104,9 @@ def train(dataloader, local_logger: logger for local process/gpu, default: None master_logger: logger for main process, default: None Returns: - train_loss_meter.avg: float, average loss on current process/gpu - train_acc_meter.avg: float, average top1 accuracy on current process/gpu - master_train_loss_meter.avg: float, average loss on all processes/gpus - master_train_acc_meter.avg: float, average top1 accuracy on all processes/gpus - train_time: float, training time + train_loss_meter.avg + train_acc_meter.avg + train_time """ model.train() train_loss_meter = AverageMeter() @@ -426,8 +423,8 @@ def main_worker(*args): weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") diff --git a/image_classification/ConvMixer/main_single_gpu.py b/image_classification/ConvMixer/main_single_gpu.py index 030dd784..92e246e2 100644 --- a/image_classification/ConvMixer/main_single_gpu.py +++ b/image_classification/ConvMixer/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
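Both ConvMixer entry points still import WarmupCosineScheduler from utils.py, so the schedule is linear warmup to the base LR followed by cosine decay to END_LR. A small illustrative sketch of the per-epoch values, plugging in the ConvMLP-style settings shown earlier in this patch (base LR 0.001, warmup start 5e-7, end LR 5e-6, 20 warmup epochs, 300 total epochs):

    from utils import WarmupCosineScheduler  # defined in utils.py of this patch

    scheduler = WarmupCosineScheduler(learning_rate=0.001,  # required by the base class, not used by get_lr
                                      warmup_start_lr=5e-7,
                                      start_lr=0.001,
                                      end_lr=5e-6,
                                      warmup_epochs=20,
                                      total_epochs=300)
    for epoch in range(300):
        print(epoch, scheduler.last_lr)  # rises to 0.001 over 20 epochs, then cosine-decays to 5e-6
        scheduler.step()                 # the training loop steps the scheduler once per epoch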
@@ -35,7 +34,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from convmixer import build_convmixer as build_model @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -333,10 +332,7 @@ def main(): beta2=config.TRAIN.OPTIMIZER.BETAS[1], weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, - grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), - ) + grad_clip=clip) else: logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -355,7 +351,7 @@ def main(): assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") diff --git a/image_classification/ConvMixer/random_erasing.py b/image_classification/ConvMixer/random_erasing.py index 80d31dd8..31eea465 100644 --- a/image_classification/ConvMixer/random_erasing.py +++ b/image_classification/ConvMixer/random_erasing.py @@ -83,15 +83,15 @@ def _erase(self, img, chan, img_h, img_w, dtype): dtype=dtype) break - def __call__(self, inputs): - if len(inputs.shape) == 3: - self._erase(inputs, *inputs.shape, inputs.dtype) + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) else: - batch_size, chan, img_h, img_w = inputs.shape + batch_size, chan, img_h, img_w = input.shape batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 for i in range(batch_start, batch_size): - self._erase(inputs[i], chan, img_h, img_w, inputs.dtype) - return inputs + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input diff --git a/image_classification/ConvMixer/transforms.py b/image_classification/ConvMixer/transforms.py index 676fe1ff..5a046912 100644 --- a/image_classification/ConvMixer/transforms.py +++ b/image_classification/ConvMixer/transforms.py @@ -1,3 +1,4 @@ +import random import paddle import paddle.nn import paddle.vision.transforms as T diff --git a/image_classification/ConvMixer/utils.py b/image_classification/ConvMixer/utils.py index 64d285a3..44800527 100644 --- a/image_classification/ConvMixer/utils.py +++ b/image_classification/ConvMixer/utils.py @@ -20,7 +20,6 @@ """ import math -import numpy as np from paddle.optimizer.lr import LRScheduler @@ -119,50 +118,3 @@ def get_lr(self): val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) return val - -class OneCycleLRScheduler(LRScheduler): - """One Cycle Learning Rate Scheduler - - The scheduler adjusts learning rate in 3 stages. 
- First apply warmup from "max_lr/div_factor" to "max_lr", - than decrease lr from "max_lr" to "max_lr/div_factor". - Note that the two stages are symmetric. - In the third stage, decrease lr from "max_lr/div_factor" to "min_lr" - - Attributes: - learning_rate: the starting learning rate (without warmup), not used here! - max_lr: the maximum learning rate during training - total_steps: the total number of steps in the cycle - div_factor: determines the initial learning rate - via initial_lr = max_lr/div_factor (Default: 20) - min_lr: the learning rate after stage 3 (Default: 0) - pct_start: determines the step when stage 2 begins - via warm_up_steps = total_steps * pct_start - (Default: 0.4) - - Noted: Scheduler with default value is the same as original code - from https://github.com/tmp-iclr/convmixer - """ - def __init__(self, - learning_rate, - max_lr, - total_steps, - div_factor=20, - min_lr=0, - pct_start=0.4, - last_epoch=-1, - verbose=False - ): - self.max_lr = max_lr - self.min_lr = min_lr - self.initial_lr = max_lr / div_factor - self.total_steps = total_steps - self.warm_up_steps = int(total_steps * pct_start) - super().__init__(learning_rate, last_epoch, verbose) - - - def get_lr(self): - learning_rate = np.interp(self.last_epoch, - [0, self.warm_up_steps, self.warm_up_steps * 2, self.total_steps], - [self.initial_lr, self.max_lr, self.initial_lr, self.min_lr]) - return learning_rate diff --git a/image_classification/CrossViT/augment.py b/image_classification/CrossViT/augment.py new file mode 100644 index 00000000..7a7f081c --- /dev/null +++ b/image_classification/CrossViT/augment.py @@ -0,0 +1,285 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Augmentation""" +""" Rand Augmentation """ +# reference: RandAugment: Practical automated data augmentation with a reduced search space +# https://arxiv.org/abs/1909.13719 + +""" Auto Augmentation """ +# reference: AutoAugment: Learning Augmentation Policies from Data +# https://arxiv.org/abs/1805.09501 + +import random +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def auto_augment_policy_original(): + """25 types of augment policies in original paper""" + policy = [ + [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + policy = [[SubPolicy(*args) for args in subpolicy] for subpolicy in policy] + return policy + + +def rand_augment_policy_original(magnitude_idx=9): + """ + 14 types of augment policies in original paper + Args: + magnitude_idx: M + """ + policy = [ + ('Posterize', 1, magnitude_idx), ('Rotate', 1, magnitude_idx), + ('Solarize', 1, magnitude_idx), ('AutoContrast', 1, magnitude_idx), + ('Equalize', 1, magnitude_idx), ('Contrast', 1, magnitude_idx), + ('Color', 1, magnitude_idx), ('Invert', 1, magnitude_idx), + ('Sharpness', 1, magnitude_idx), ('Brightness', 1, magnitude_idx), + ('ShearX', 1, magnitude_idx), ('ShearY', 1, magnitude_idx), + ('TranslateX', 1, magnitude_idx), ('TranslateY', 1, magnitude_idx), + ] + policy = [SubPolicy(*args) for args in policy] + return policy + + +class AutoAugment(): + """Auto Augment + Randomly choose a tuple of augment ops from a list of policy + Then apply the tuple of augment ops to input image + + Examples: + policy = auto_augment_policy_original() + augment = AutoAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy): + self.policy = policy + + def __call__(self, image, policy_idx=None): + if policy_idx is None: + policy_idx = random.randint(0, len(self.policy) - 1) + + sub_policy = self.policy[policy_idx] + for op in sub_policy: + image = op(image) + return image + + +class RandAugment(): + """Rand Augment + Randomly choose N augment ops from a list of K policies + Then apply the N ops to input image + + Examples: + policy = rand_augment_policy_original(magnitude_idx) + augment = RandAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy, num_layers=2): + """ + Args: + policy: list of SubPolicy + num_layers: int + """ + self.policy = policy + self.num_layers = num_layers + + def __call__(self, image): + selected_idx = 
np.random.choice(len(self.policy), self.num_layers) + + for policy_idx in selected_idx: + sub_policy = self.policy[policy_idx] + image = sub_policy(image) + return image + + +class SubPolicy: + """Subpolicy + Read augment name and magnitude, apply augment with probability + Args: + op_name: str, augment operation name + prob: float, if prob > random prob, apply augment + magnitude_idx: int, index of magnitude in preset magnitude ranges + """ + + def __init__(self, op_name, prob, magnitude_idx): + # ranges of operations' magnitude + ranges = { + 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'TranslateX': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'TranslateY': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'Rotate': np.linspace(0, 30, 10), # [-30, 30] (by random negative) + 'Color': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), # [0, 4] + 'Solarize': np.linspace(256, 0, 10), # [0, 256] + 'Contrast': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Sharpness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Brightness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'AutoContrast': [0] * 10, # no range + 'Equalize': [0] * 10, # no range + 'Invert': [0] * 10, # no range + } + + # augmentation operations + # Lambda is not pickleable for DDP + # image_ops = { + # 'ShearX': lambda image, magnitude: shear_x(image, magnitude), + # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), + # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), + # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), + # 'Rotate': lambda image, magnitude: rotate(image, magnitude), + # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), + # 'Invert': lambda image, magnitude: invert(image, magnitude), + # 'Equalize': lambda image, magnitude: equalize(image, magnitude), + # 'Solarize': lambda image, magnitude: solarize(image, magnitude), + # 'Posterize': lambda image, magnitude: posterize(image, magnitude), + # 'Contrast': lambda image, magnitude: contrast(image, magnitude), + # 'Color': lambda image, magnitude: color(image, magnitude), + # 'Brightness': lambda image, magnitude: brightness(image, magnitude), + # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), + # } + image_ops = { + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_relative, + 'TranslateY': translate_y_relative, + 'Rotate': rotate, + 'AutoContrast': auto_contrast, + 'Invert': invert, + 'Equalize': equalize, + 'Solarize': solarize, + 'Posterize': posterize, + 'Contrast': contrast, + 'Color': color, + 'Brightness': brightness, + 'Sharpness': sharpness, + } + + self.prob = prob + self.magnitude = ranges[op_name][magnitude_idx] + self.op = image_ops[op_name] + + def __call__(self, image): + if self.prob > random.random(): + image = self.op(image, self.magnitude) + return image + + +# PIL Image transforms +# https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.transform +def shear_x(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), fillcolor=fillcolor) + + +def shear_y(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * 
random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), fillcolor=fillcolor) + + +def translate_x_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), fillcolor=fillcolor) + + +def translate_x_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, magnitude, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude), fillcolor=fillcolor) + + +def rotate(image, magnitude): + rot = image.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new('RGBA', rot.size, (128,) * 4), + rot).convert(image.mode) + + +def auto_contrast(image, magnitude=None): + return ImageOps.autocontrast(image) + + +def invert(image, magnitude=None): + return ImageOps.invert(image) + + +def equalize(image, magnitude=None): + return ImageOps.equalize(image) + + +def solarize(image, magnitude): + return ImageOps.solarize(image, magnitude) + + +def posterize(image, magnitude): + return ImageOps.posterize(image, magnitude) + + +def contrast(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Contrast(image).enhance(1 + magnitude) + + +def color(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Color(image).enhance(1 + magnitude) + + +def brightness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Brightness(image).enhance(1 + magnitude) + + +def sharpness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Sharpness(image).enhance(1 + magnitude) + diff --git a/image_classification/CrossViT/config.py b/image_classification/CrossViT/config.py index b18287f0..133eb6f4 100644 --- a/image_classification/CrossViT/config.py +++ b/image_classification/CrossViT/config.py @@ -45,9 +45,9 @@ _C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None _C.MODEL.NUM_CLASSES = 1000 -_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPOUT = 0.0 _C.MODEL.DROPPATH = 0.1 -_C.MODEL.ATTENTION_DROPOUT = 0.1 +_C.MODEL.ATTENTION_DROPOUT = 0.0 # transformer settings _C.MODEL.TRANS = CN() @@ -71,13 +71,16 @@ _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 _C.TRAIN.NUM_EPOCHS = 300 -_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size -_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune -_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune -_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 -_C.TRAIN.END_LR = 5e-4 -_C.TRAIN.GRAD_CLIP = 1.0 -_C.TRAIN.ACCUM_ITER = 2 #1 +_C.TRAIN.WARMUP_EPOCHS = 30 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.004 +_C.TRAIN.WARMUP_START_LR = 1e-6 +_C.TRAIN.END_LR = 1e-5 +_C.TRAIN.GRAD_CLIP = None +_C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.MODEL_EMA = True +_C.TRAIN.MODEL_EMA_DECAY = 0.99996 
+_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -91,11 +94,33 @@ _C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW _C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 +# train augmentation +_C.TRAIN.MIXUP_ALPHA = 0.8 # mixup alpha, enabled if >0 +_C.TRAIN.CUTMIX_ALPHA = 1.0 # cutmix alpha, enabled if >0 +_C.TRAIN.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha +_C.TRAIN.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled +_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled +_C.TRAIN.MIXUP_MODE = 'batch' # how to apply mixup/cutmix params, per 'batch', 'pair' or 'elem' + +_C.TRAIN.SMOOTHING = 0.1 +_C.TRAIN.COLOR_JITTER = 0.4 # color jitter factor +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = True + +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 # random erase prob +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' # random erase mode +_C.TRAIN.RANDOM_ERASE_COUNT = 1 # random erase count +_C.TRAIN.RANDOM_ERASE_SPLIT = False + +_C.TRAIN.DISTILLATION_TYPE = 'hard' # hard, soft, none +_C.TRAIN.DISTILLATION_ALPHA = 0.5 +_C.TRAIN.DISTILLATION_TAU = 1.0 + # misc _C.SAVE = "./output" _C.TAG = "default" _C.SAVE_FREQ = 10 # freq to save chpt -_C.REPORT_FREQ = 100 # freq to logging info +_C.REPORT_FREQ = 1 # freq to logging info _C.VALIDATE_FREQ = 100 # freq to do validation _C.SEED = 0 _C.EVAL = False # run evaluation only diff --git a/image_classification/CrossViT/crossvit.py b/image_classification/CrossViT/crossvit.py index f5e6381d..f59976b3 100755 --- a/image_classification/CrossViT/crossvit.py +++ b/image_classification/CrossViT/crossvit.py @@ -9,17 +9,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Cross ViT Class""" + import paddle import paddle.nn as nn import paddle.nn.functional as F from functools import partial +from t2t import T2T, get_sinusoid_encoding from crossvit_utils import * class PatchEmbed(nn.Layer): """ Image to Patch Embedding """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, multi_conv=False): super().__init__() img_size = to_2tuple(img_size) @@ -52,7 +54,7 @@ def forward(self, x): B, C, H, W = x.shape # FIXME look at relaxing size constraints assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + f"Input image size ({H}*{W}) doesn't match ({self.img_size[0]}*{self.img_size[1]})." x = self.proj(x).flatten(2).transpose((0, 2, 1)) return x @@ -63,16 +65,24 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0. 
super().__init__() self.num_heads = num_heads head_dim = dim // num_heads - # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights self.scale = qk_scale or head_dim ** -0.5 - self.wq = nn.Linear(dim, dim) - self.wk = nn.Linear(dim, dim) - self.wv = nn.Linear(dim, dim) + w_attr_1, b_attr_1 = self._init_weights() + self.wq = nn.Linear(dim, dim, weight_attr=w_attr_1, bias_attr=b_attr_1) + w_attr_2, b_attr_2 = self._init_weights() + self.wk = nn.Linear(dim, dim, weight_attr=w_attr_2, bias_attr=b_attr_2) + w_attr_3, b_attr_3 = self._init_weights() + self.wv = nn.Linear(dim, dim, weight_attr=w_attr_3, bias_attr=b_attr_3) self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) + w_attr_4, b_attr_4 = self._init_weights() + self.proj = nn.Linear(dim, dim, weight_attr=w_attr_4, bias_attr=b_attr_4) self.proj_drop = nn.Dropout(proj_drop) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): B, N, C = x.shape q = self.wq(x[:, 0:1, :]).reshape([B, 1, self.num_heads, C // self.num_heads]).transpose( @@ -86,20 +96,33 @@ def forward(self, x): attn = F.softmax(attn, axis=-1) attn = self.attn_drop(attn) - x = (attn @ v).transpose((0, 2, 1, 3)).reshape([B, 1, C]) # (BH1N @ BHN(C/H)) -> BH1(C/H) -> B1H(C/H) -> B1C + # (BH1N @ BHN(C/H)) -> BH1(C/H) -> B1H(C/H) -> B1C + x = (attn @ v).transpose((0, 2, 1, 3)).reshape([B, 1, C]) x = self.proj(x) x = self.proj_drop(x) return x class CrossAttentionBlock(nn.Layer): - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, has_mlp=True): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + has_mlp=True): super(CrossAttentionBlock, self).__init__() - self.norm1 = norm_layer(dim) - self.attn = CrossAttention( - dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + w_attr_1, b_attr_1 = self._init_weights() + self.norm1 = nn.LayerNorm(dim, weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6) + self.attn = CrossAttention(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() self.has_mlp = has_mlp if has_mlp: @@ -107,6 +130,11 @@ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, dropout=drop) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): x = x[:, 0:1, :] + self.drop_path(self.attn(self.norm1(x))) if self.has_mlp: @@ -116,9 +144,17 @@ def forward(self, x): class MultiScaleBlock(nn.Layer): - - def __init__(self, dim, patches, depth, num_heads, mlp_ratio, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., - drop_path=[], act_layer=nn.GELU, norm_layer=nn.LayerNorm): + def __init__(self, + dim, + patches, + depth, + num_heads, + mlp_ratio, + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=[]): super().__init__() num_branches = len(dim) @@ -129,9 +165,14 @@ def __init__(self, dim, patches, depth, num_heads, mlp_ratio, qkv_bias=False, qk tmp = [] for i in range(depth[d]): tmp.append( - Block(dim=dim[d], num_heads=num_heads[d], mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias, + Block(dim=dim[d], + num_heads=num_heads[d], + mlp_ratio=mlp_ratio[d], + qkv_bias=qkv_bias, qk_scale=qk_scale, - dropout=drop, attention_dropout=attn_drop, droppath=drop_path[i])) + dropout=drop, + attention_dropout=attn_drop, + droppath=drop_path[i])) if len(tmp) != 0: self.blocks.append(nn.Sequential(*tmp)) @@ -143,7 +184,14 @@ def __init__(self, dim, patches, depth, num_heads, mlp_ratio, qkv_bias=False, qk if dim[d] == dim[(d + 1) % num_branches] and False: tmp = [Identity()] else: - tmp = [norm_layer(dim[d]), act_layer(), nn.Linear(dim[d], dim[(d + 1) % num_branches])] + w_attr_1, b_attr_1 = self._init_weights_norm() + w_attr_2, b_attr_2 = self._init_weights_linear() + tmp = [nn.LayerNorm(dim[d], weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6), + nn.GELU(), + nn.Linear(dim[d], + dim[(d + 1) % num_branches], + weight_attr=w_attr_2, + bias_attr=b_attr_2)] self.projs.append(nn.Sequential(*tmp)) self.fusion = nn.LayerList() @@ -152,17 +200,26 @@ def __init__(self, dim, patches, depth, num_heads, mlp_ratio, qkv_bias=False, qk nh = num_heads[d_] if depth[-1] == 0: # backward capability: self.fusion.append( - CrossAttentionBlock(dim=dim[d_], num_heads=nh, mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias, + CrossAttentionBlock(dim=dim[d_], + num_heads=nh, + mlp_ratio=mlp_ratio[d], + qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop, attn_drop=attn_drop, drop_path=drop_path[-1], norm_layer=norm_layer, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[-1], has_mlp=False)) else: tmp = [] for _ in range(depth[-1]): - tmp.append(CrossAttentionBlock(dim=dim[d_], num_heads=nh, mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias, + tmp.append(CrossAttentionBlock(dim=dim[d_], + num_heads=nh, + mlp_ratio=mlp_ratio[d], + qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop, attn_drop=attn_drop, drop_path=drop_path[-1], - norm_layer=norm_layer, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[-1], has_mlp=False)) self.fusion.append(nn.Sequential(*tmp)) @@ -171,10 +228,28 @@ def __init__(self, dim, patches, depth, num_heads, mlp_ratio, qkv_bias=False, qk if dim[(d + 1) % num_branches] == dim[d] and False: tmp = [Identity()] else: - tmp = [norm_layer(dim[(d + 1) % num_branches]), act_layer(), - nn.Linear(dim[(d + 1) % num_branches], dim[d])] + 
w_attr_1, b_attr_1 = self._init_weights_norm()
+                w_attr_2, b_attr_2 = self._init_weights_linear()
+                tmp = [nn.LayerNorm(dim[(d + 1) % num_branches],
+                                    weight_attr=w_attr_1,
+                                    bias_attr=b_attr_1),
+                       nn.GELU(),
+                       nn.Linear(dim[(d + 1) % num_branches],
+                                 dim[d],
+                                 weight_attr=w_attr_2,
+                                 bias_attr=b_attr_2)]
             self.revert_projs.append(nn.Sequential(*tmp))
 
+    def _init_weights_norm(self):
+        weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0))
+        bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0))
+        return weight_attr, bias_attr
+
+    def _init_weights_linear(self):
+        weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02))
+        bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0))
+        return weight_attr, bias_attr
+
     def forward(self, x):
         outs_b = [block(x_) for x_, block in zip(x, self.blocks)]
         # only take the cls token out
@@ -197,12 +272,22 @@ def _compute_num_patches(img_size, patches):
 class VisionTransformer(nn.Layer):
     """ Vision Transformer with support for patch or hybrid CNN input stage
     """
-
-    def __init__(self, img_size=(224, 224), patch_size=(8, 16), in_chans=3, num_classes=1000, embed_dim=(192, 384),
+    def __init__(self,
+                 img_size=(224, 224),
+                 patch_size=(8, 16),
+                 in_chans=3,
+                 num_classes=1000,
+                 embed_dim=(192, 384),
                  depth=([1, 3, 1], [1, 3, 1], [1, 3, 1]),
-                 num_heads=(6, 12), mlp_ratio=(2., 2., 4.), qkv_bias=False, qk_scale=None, drop_rate=0.,
+                 num_heads=(6, 12),
+                 mlp_ratio=(2., 2., 4.),
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
                  attn_drop_rate=0.,
-                 drop_path_rate=0., hybrid_backbone=None, norm_layer=partial(nn.LayerNorm,epsilon=1e-6), multi_conv=False):
+                 drop_path_rate=0.,
+                 hybrid_backbone=None,
+                 multi_conv=False):
         super().__init__()
 
         self.num_classes = num_classes
@@ -215,46 +300,83 @@ def __init__(self, img_size=(224, 224), patch_size=(8, 16), in_chans=3, num_clas
         self.patch_embed = nn.LayerList()
         if hybrid_backbone is None:
             self.pos_embed = nn.ParameterList(
-                [paddle.create_parameter(shape=[1, 1 + num_patches[i], embed_dim[i]], dtype='float32',default_initializer=nn.initializer.Constant(0.0)) for i in
-                 range(self.num_branches)])
+                [paddle.create_parameter(
+                    shape=[1, 1 + num_patches[i], embed_dim[i]],
+                    dtype='float32',
+                    default_initializer=nn.initializer.Constant(
+                        0.0)) for i in range(self.num_branches)])
+
             for im_s, p, d in zip(img_size, patch_size, embed_dim):
                 self.patch_embed.append(
-                    PatchEmbed(img_size=im_s, patch_size=p, in_chans=in_chans, embed_dim=d, multi_conv=multi_conv))
+                    PatchEmbed(img_size=im_s,
+                               patch_size=p,
+                               in_chans=in_chans,
+                               embed_dim=d,
+                               multi_conv=multi_conv))
         else:
             self.pos_embed = nn.ParameterList()
-            from .t2t import T2T, get_sinusoid_encoding
             tokens_type = 'transformer' if hybrid_backbone == 't2t' else 'performer'
             for idx, (im_s, p, d) in enumerate(zip(img_size, patch_size, embed_dim)):
-                self.patch_embed.append(T2T(im_s, tokens_type=tokens_type, patch_size=p, embed_dim=d))
+                self.patch_embed.append(
+                    T2T(im_s, tokens_type=tokens_type, patch_size=p, embed_dim=d))
                 self.pos_embed.append(
-                    paddle.to_tensor(data=get_sinusoid_encoding(n_position=1 + num_patches[idx], d_hid=embed_dim[idx]),
-                                     dtype='flaot32', stop_gradient=False))
+                    paddle.to_tensor(data=get_sinusoid_encoding(n_position=1 + num_patches[idx],
+                                                                d_hid=embed_dim[idx]),
+                                     dtype='float32',
+                                     stop_gradient=False))
             del self.pos_embed
-            self.pos_embed = nn.ParameterList([paddle.to_tensor(paddle.zeros(1, 1 + num_patches[i], embed_dim[i]),
-                                                                 dtype='float32',
stop_gradient=False) for i in - range(self.num_branches)]) + self.pos_embed = nn.ParameterList( + [paddle.to_tensor( + paddle.zeros(1, 1 + num_patches[i], embed_dim[i]), + dtype='float32', + stop_gradient=False) for i in range(self.num_branches)]) self.cls_token = nn.ParameterList( - [paddle.create_parameter(shape=[1, 1, embed_dim[i]], dtype='float32') for i in range(self.num_branches)]) + [paddle.create_parameter( + shape=[1, 1, embed_dim[i]], dtype='float32') for i in range(self.num_branches)]) self.pos_drop = nn.Dropout(p=drop_rate) total_depth = sum([sum(x[-2:]) for x in depth]) - dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, total_depth)] # stochastic depth decay rule + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, total_depth)] dpr_ptr = 0 self.blocks = nn.LayerList() for idx, block_cfg in enumerate(depth): curr_depth = max(block_cfg[:-1]) + block_cfg[-1] dpr_ = dpr[dpr_ptr:dpr_ptr + curr_depth] - blk = MultiScaleBlock(embed_dim, num_patches, block_cfg, num_heads=num_heads, mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, - drop_path=dpr_, norm_layer=norm_layer) + blk = MultiScaleBlock(embed_dim, + num_patches, + block_cfg, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr_) dpr_ptr += curr_depth self.blocks.append(blk) - - self.norm = nn.LayerList([norm_layer(embed_dim[i]) for i in range(self.num_branches)]) + + + w_attr_1, b_attr_1 = self._init_weights_norm() + w_attr_2, b_attr_2 = self._init_weights_linear() + self.norm = nn.LayerList([nn.LayerNorm(embed_dim[i], + weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6) for i in range(self.num_branches)]) self.head = nn.LayerList( - [nn.Linear(embed_dim[i], num_classes) if num_classes > 0 else Identity() for i in range(self.num_branches)]) + [nn.Linear(embed_dim[i], + num_classes, + weight_attr=w_attr_2, + bias_attr=b_attr_2) if num_classes > 0 else Identity() for i in range(self.num_branches)]) + + def _init_weights_norm(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + + def _init_weights_linear(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr def no_weight_decay(self): out = {'cls_token'} @@ -273,7 +395,10 @@ def forward_features(self, x): B, C, H, W = x.shape xs = [] for i in range(self.num_branches): - x_ = paddle.nn.functional.interpolate(x, size=(self.img_size[i], self.img_size[i]), mode='bicubic') if H != self.img_size[i] else x + x_ = paddle.nn.functional.interpolate( + x, size=(self.img_size[i], + self.img_size[i]), + mode='bicubic') if H != self.img_size[i] else x tmp = self.patch_embed[i](x_) cls_tokens = self.cls_token[i].expand([B, -1, -1]) # stole cls_tokens impl from Phil Wang, thanks # print(cls_tokens.shape,tmp.shape) @@ -299,8 +424,10 @@ def forward(self, x): ce_logits = paddle.mean(paddle.stack(ce_logits, axis=0), axis=0) return ce_logits + def build_crossvit(config, **kwargs): model = VisionTransformer(img_size=config.MODEL.TRANS.IMG_SIZE, + num_classes=config.MODEL.NUM_CLASSES, patch_size=config.MODEL.TRANS.PATCH_SIZE, embed_dim=config.MODEL.TRANS.EMBED_DIM, depth=config.MODEL.TRANS.DEPTH, @@ -308,6 +435,8 @@ def 
build_crossvit(config, **kwargs): mlp_ratio=config.MODEL.TRANS.MLP_RATIO, qkv_bias=config.MODEL.TRANS.QKV_BIAS, multi_conv=config.MODEL.TRANS.MULTI_CONV, + drop_rate=config.MODEL.DROPOUT, + attn_drop_rate=config.MODEL.ATTENTION_DROPOUT, + drop_path_rate=config.MODEL.DROPPATH, **kwargs) return model - diff --git a/image_classification/CrossViT/crossvit_utils.py b/image_classification/CrossViT/crossvit_utils.py index de583554..2064ddfe 100755 --- a/image_classification/CrossViT/crossvit_utils.py +++ b/image_classification/CrossViT/crossvit_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/image_classification/CrossViT/datasets.py b/image_classification/CrossViT/datasets.py index 761dd61a..984e1fcf 100644 --- a/image_classification/CrossViT/datasets.py +++ b/image_classification/CrossViT/datasets.py @@ -19,8 +19,19 @@ import os import math -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from PIL import Image +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from augment import auto_augment_policy_original +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from transforms import RandomHorizontalFlip +from random_erasing import RandomErasing class ImageNet2012Dataset(Dataset): @@ -61,7 +72,7 @@ def __len__(self): return len(self.label_list) def __getitem__(self, index): - data = image_load(self.img_path_list[index]).convert('RGB') + data = Image.open(self.img_path_list[index]).convert('RGB') data = self.transform(data) label = self.label_list[index] @@ -80,13 +91,36 @@ def get_train_transforms(config): Returns: transforms_train: training transforms """ - - transforms_train = transforms.Compose([ + aug_op_list = [] + # random crop and resize + aug_op_list.append( transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) + scale=(0.05, 1.0))) + # auto_augment / color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, + std=config.DATA.IMAGENET_STD)) + # random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + + transforms_train = transforms.Compose(aug_op_list) return transforms_train @@ -106,7 +140,7 @@ def get_val_transforms(config): scale_size = 
int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) transforms_val = transforms.Compose([ - transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.Resize(scale_size, 'bicubic'), transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), transforms.ToTensor(), transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), @@ -124,6 +158,7 @@ def get_dataset(config, mode='train'): Returns: dataset: dataset object """ + assert mode in ['train', 'val'] if config.DATA.DATASET == "cifar10": if mode == 'train': diff --git a/image_classification/CrossViT/losses.py b/image_classification/CrossViT/losses.py new file mode 100644 index 00000000..04377eac --- /dev/null +++ b/image_classification/CrossViT/losses.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Implement Loss functions """ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class LabelSmoothingCrossEntropyLoss(nn.Layer): + """ cross entropy loss for label smoothing + Args: + smoothing: float, label smoothing rate + x: tensor, predictions (default is before softmax) with shape [N, num_classes] as default + target: tensor, target label with shape [N] as default + weight: tensor, optional, a manual rescaling weight given to each class + reduction: str, optional, indicate how to average the loss by batch_size, + default is ``'mean'``, the candicates are ``'none'`` | ``'mean'`` | ``'sum'`` + axis: int, optional, the index of dimension to perform softmax calculations, + default is ``-1``, if `axis` is not -1 -> the shape of x and target may not be default + use_softmax: bool, optional, if `use_softmax` is ``False``, ``x`` should be after softmax, + default is ``True``, the candicates are ``True`` | ``False`` + name: str, optional, the name of the operator, default is ``None``, + for more information, please refer to :ref:`api_guide_Name`. 
+ Return: + loss: float, cross entropy loss value + """ + def __init__(self, + smoothing=0.1, + weight=None, + reduction='mean', + axis=-1, + use_softmax=True, + name=None): + super().__init__() + assert 0 <= smoothing < 1.0 + self.smoothing = smoothing + self.weight = weight + self.reduction = reduction + self.axis = axis + self.use_softmax = use_softmax + self.name = name + + def forward(self, x, target): + target = paddle.nn.functional.one_hot(target, num_classes=x.shape[1]) + target = paddle.nn.functional.label_smooth(target, epsilon=self.smoothing) + loss = paddle.nn.functional.cross_entropy( + x, + target, + weight=self.weight, + reduction=self.reduction, + soft_label=True, + axis=self.axis, + use_softmax=self.use_softmax, + name=self.name) + return loss + + +class SoftTargetCrossEntropyLoss(nn.Layer): + """ cross entropy loss for soft target + Args: + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, soft target with shape [N, num_classes] + Returns: + loss: float, the mean loss value + """ + def __init__(self): + super().__init__() + + def forward(self, x, target): + loss = paddle.sum(-target * F.log_softmax(x, axis=-1), axis=-1) + return loss.mean() + + +class DistillationLoss(nn.Layer): + """Distillation loss function + This layer includes the orginal loss (criterion) and a extra + distillation loss (criterion), which computes the loss with + different type options, between current model and + a teacher model as its supervision. + + Args: + base_criterion: nn.Layer, the original criterion + teacher_model: nn.Layer, the teacher model as supervision + distillation_type: str, one of ['none', 'soft', 'hard'] + alpha: float, ratio of base loss (* (1-alpha)) + and distillation loss( * alpha) + tao: float, temperature in distillation + """ + def __init__(self, + base_criterion, + teacher_model, + distillation_type, + alpha, + tau): + super().__init__() + assert distillation_type in ['none', 'soft', 'hard'] + self.base_criterion = base_criterion + self.teacher_model = teacher_model + self.type = distillation_type + self.alpha = alpha + self.tau = tau + + def forward(self, inputs, outputs, targets): + """ + Args: + inputs: tensor, the orginal model inputs + outputs: tensor, the outputs of the model + outputds_kd: tensor, the distillation outputs of the model, + this is usually obtained by a separate branch + in the last layer of the model + targets: tensor, the labels for the base criterion + """ + outputs, outputs_kd = outputs[0], outputs[1] + base_loss = self.base_criterion(outputs, targets) + if self.type == 'none': + return base_loss + + with paddle.no_grad(): + teacher_outputs = self.teacher_model(inputs) + + if self.type == 'soft': + distillation_loss = F.kl_div( + F.log_softmax(outputs_kd / self.tau, axis=1), + F.log_softmax(teacher_outputs / self.tau, axis=1), + reduction='sum') * (self.tau * self.tau) / outputs_kd.numel() + elif self.type == 'hard': + distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(axis=1)) + + loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha + return loss + + diff --git a/image_classification/CrossViT/main_multi_gpu.py b/image_classification/CrossViT/main_multi_gpu.py index 989f5b49..c46acc2c 100644 --- a/image_classification/CrossViT/main_multi_gpu.py +++ b/image_classification/CrossViT/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
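# A hedged usage sketch (not part of the patch) for the criteria defined in the new
# losses.py above; tensor shapes follow its docstrings and the variable names here
# are illustrative only.
import paddle
import paddle.nn.functional as F

logits = paddle.randn([4, 1000])                    # fake predictions [N, num_classes]
labels = paddle.randint(0, 1000, [4])               # integer class ids [N]
smooth_loss = LabelSmoothingCrossEntropyLoss(smoothing=0.1)(logits, labels)
soft_targets = F.one_hot(labels, 1000)              # e.g. what Mixup/CutMix would emit
soft_loss = SoftTargetCrossEntropyLoss()(logits, soft_targets)
print(float(smooth_loss), float(soft_loss))         # both reduce to scalar losses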
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,53 +27,54 @@ import paddle.distributed as dist from datasets import get_dataloader from datasets import get_dataset -from crossvit import build_crossvit as build_model from utils import AverageMeter from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from model_ema import ModelEma +from crossvit import build_crossvit as build_model -parser = argparse.ArgumentParser('ViT') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -arguments = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, arguments) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('CrossViT') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + 
# different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -81,28 +82,43 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + model_ema=None, + mixup_fn=None, + amp=False, + local_logger=None, + master_logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch debug_steps: int, num of iters to log info, default: 100 accum_iter: int, num of iters for accumulating gradients, default: 1 + model_ema: ModelEma, model moving average instance + mixup_fn: Mixup, mixup instance, default: None amp: bool, if True, use mix precision training, default: False + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: - train_loss_meter.avg - train_acc_meter.avg - train_time + train_loss_meter.avg: float, average loss on current process/gpu + train_acc_meter.avg: float, average top1 accuracy on current process/gpu + master_train_loss_meter.avg: float, average loss on all processes/gpus + master_train_acc_meter.avg: float, average top1 accuracy on all processes/gpus + train_time: float, training time """ model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + master_train_loss_meter = AverageMeter() + master_train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() @@ -110,14 +126,17 @@ def train(dataloader, for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() @@ -127,7 +146,6 @@ def train(dataloader, #NOTE: division may be needed depending on the loss function # Here no division is needed: # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' - # #loss = loss / accum_iter loss.backward() @@ -135,42 +153,86 @@ def train(dataloader, optimizer.step() optimizer.clear_grad() + if model_ema is not None and dist.get_rank() == 0: + model_ema.update(model) + pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) - batch_size = image.shape[0] - train_loss_meter.update(loss.numpy()[0], batch_size) - train_acc_meter.update(acc.numpy()[0], batch_size) + batch_size = paddle.to_tensor(image.shape[0]) - if batch_id % debug_steps == 0: - logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {train_loss_meter.avg:.4f}, " + - f"Avg Acc: {train_acc_meter.avg:.4f}") + # sync from other gpus for overall loss and acc + 
master_loss = loss.clone() + master_acc = acc.clone() + master_batch_size = batch_size.clone() + dist.all_reduce(master_loss) + dist.all_reduce(master_acc) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc = master_acc / dist.get_world_size() + master_train_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_train_acc_meter.update(master_acc.numpy()[0], master_batch_size.numpy()[0]) - train_time = time.time() - time_st - return train_loss_meter.avg, train_acc_meter.avg, train_time + train_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + train_acc_meter.update(acc.numpy()[0], batch_size.numpy()[0]) + if batch_id % debug_steps == 0: + if local_logger: + local_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_train_loss_meter.avg:.4f}, " + + f"Avg Acc: {master_train_acc_meter.avg:.4f}") -def validate(dataloader, model, criterion, total_batch, debug_steps=100): + train_time = time.time() - time_st + return (train_loss_meter.avg, + train_acc_meter.avg, + master_train_loss_meter.avg, + master_train_acc_meter.avg, + train_time) + + +def validate(dataloader, + model, + criterion, + total_batch, + debug_steps=100, + local_logger=None, + master_logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + debug_steps: int, num of iters to log info, default: 100 + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + master_val_loss_meter.avg: float, average loss on all processes/gpus + master_val_acc1_meter.avg: float, average top1 accuracy on all processes/gpus + master_val_acc5_meter.avg: float, average top5 accuracy on all processes/gpus + val_time: float, validation time """ model.eval() val_loss_meter = AverageMeter() val_acc1_meter = AverageMeter() val_acc5_meter = AverageMeter() + master_val_loss_meter = AverageMeter() + master_val_acc1_meter = AverageMeter() + master_val_acc5_meter = AverageMeter() time_st = time.time() with paddle.no_grad(): @@ -185,63 +247,145 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) - dist.all_reduce(loss) - dist.all_reduce(acc1) - dist.all_reduce(acc5) - loss = loss / dist.get_world_size() - acc1 = acc1 / dist.get_world_size() - acc5 = acc5 / dist.get_world_size() - batch_size = paddle.to_tensor(image.shape[0]) - dist.all_reduce(batch_size) + + master_loss = loss.clone() + master_acc1 = acc1.clone() + master_acc5 = acc5.clone() + master_batch_size = batch_size.clone() + + dist.all_reduce(master_loss) + dist.all_reduce(master_acc1) + dist.all_reduce(master_acc5) + 
dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc1 = master_acc1 / dist.get_world_size() + master_acc5 = master_acc5 / dist.get_world_size() + + master_val_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc1_meter.update(master_acc1.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc5_meter.update(master_acc5.numpy()[0], master_batch_size.numpy()[0]) val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) if batch_id % debug_steps == 0: - logger.info( - f"Val Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {val_loss_meter.avg:.4f}, " + - f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + - f"Avg Acc@5: {val_acc5_meter.avg:.4f}") - + if local_logger: + local_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {master_val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {master_val_acc5_meter.avg:.4f}") val_time = time.time() - time_st - return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + return (val_loss_meter.avg, + val_acc1_meter.avg, + val_acc5_meter.avg, + master_val_loss_meter.avg, + master_val_acc1_meter.avg, + master_val_acc5_meter.avg, + val_time) def main_worker(*args): - # 0. Preparation + # STEP 0: Preparation + config = args[0] dist.init_parallel_env() last_epoch = config.TRAIN.LAST_EPOCH - world_size = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + world_size = dist.get_world_size() + local_rank = dist.get_rank() seed = config.SEED + local_rank paddle.seed(seed) np.random.seed(seed) random.seed(seed) - # 1. Create model + # logger for each process/gpu + local_logger = get_logger( + filename=os.path.join(config.SAVE, 'log_{}.txt'.format(local_rank)), + logger_name='local_logger') + # overall logger + if local_rank == 0: + master_logger = get_logger( + filename=os.path.join(config.SAVE, 'log.txt'), + logger_name='master_logger') + master_logger.info(f'\n{config}') + else: + master_logger = None + local_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + if local_rank == 0: + master_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + + # STEP 1: Create model model = build_model(config) + # define model ema + model_ema = None + if not config.EVAL and config.TRAIN.MODEL_EMA and local_rank == 0: + model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY) model = paddle.DataParallel(model) - # 2. 
Create train and val dataloader + + # STEP 2: Create train and val dataloader dataset_train, dataset_val = args[1], args[2] # Create training dataloader if not config.EVAL: dataloader_train = get_dataloader(config, dataset_train, 'train', True) total_batch_train = len(dataloader_train) - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + local_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') if local_rank == 0: - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + master_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') # Create validation dataloader dataloader_val = get_dataloader(config, dataset_val, 'test', True) total_batch_val = len(dataloader_val) - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + local_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') if local_rank == 0: - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. Define optimizer and lr_scheduler + master_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -263,7 +407,9 @@ def main_worker(*args): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + local_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise 
NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") if config.TRAIN.OPTIMIZER.NAME == "SGD": @@ -290,80 +436,132 @@ def main_worker(*args): weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip, - #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 5. Load pretrained model / load resumt model and optimizer states + # STEP 6: Load pretrained model / load resumt model and optimizer states if config.MODEL.PRETRAINED: if (config.MODEL.PRETRAINED).endswith('.pdparams'): raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) - logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + local_logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + if local_rank == 0: + master_logger.info( + f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) - logger.info( - f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + local_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + if local_rank == 0: + master_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + # load ema model + if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): + model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') + model_ema.module.set_state_dict(model_ema_state) + local_logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') + if local_rank == 0: + master_logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') - # 6. 
Validation + # STEP 7: Validation (eval mode) if config.EVAL: - logger.info('----- Start Validating') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info('----- Start Validating') + if local_rank == 0: + master_logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") return - # 6. Start training and validation - logging.info(f"Start training from epoch {last_epoch+1}.") + # STEP 8: Start training and validation (train mode) + local_logger.info(f"Start training from epoch {last_epoch+1}.") + if local_rank == 0: + master_logger.info(f"Start training from epoch {last_epoch+1}.") for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") - train_loss, train_acc, train_time = train(dataloader=dataloader_train, - model=model, - criterion=criterion, - optimizer=optimizer, - epoch=epoch, - total_batch=total_batch_train, - debug_steps=config.REPORT_FREQ, - accum_iter=config.TRAIN.ACCUM_ITER, - amp=config.AMP) + local_logger.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + if local_rank == 0: + master_logger.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, avg_loss, avg_acc, train_time = train( + dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + model_ema=model_ema, + mixup_fn=mixup_fn, + amp=config.AMP, + local_logger=local_logger, + master_logger=master_logger) + scheduler.step() - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Train Loss: {train_loss:.4f}, " + - f"Train Acc: {train_acc:.4f}, " + - f"time: {train_time:.2f}") + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {avg_loss:.4f}, " + + f"Train Acc: {avg_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: - logger.info(f'----- Validation after Epoch: {epoch}') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info(f'----- Validation after Epoch: {epoch}') + if local_rank == 0: + master_logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") # model save if local_rank == 0: if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: @@ -371,18 +569,38 @@ def main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - logger.info(f"----- Save model: {model_path}.pdparams") - logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") + if model_ema is not None: + model_ema_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}-EMA") + paddle.save(model_ema.state_dict(), model_ema_path + '.pdparams') + master_logger.info(f"----- Save ema model: {model_ema_path}.pdparams") def main(): + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + + # set output folder + if not config.EVAL: + config.SAVE = 
'{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S'))
+    else:
+        config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S'))
+
+    if not os.path.exists(config.SAVE):
+        os.makedirs(config.SAVE, exist_ok=True)
+
+    # get dataset and start DDP
+    if not config.EVAL:
+        dataset_train = get_dataset(config, mode='train')
+    else:
+        dataset_train = None
+    dataset_val = get_dataset(config, mode='val')
     config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS
-    dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS)
+    dist.spawn(main_worker, args=(config, dataset_train, dataset_val, ), nprocs=config.NGPUS)


 if __name__ == "__main__":
diff --git a/image_classification/CrossViT/main_single_gpu.py b/image_classification/CrossViT/main_single_gpu.py
index 0866dbda..6638172a 100644
--- a/image_classification/CrossViT/main_single_gpu.py
+++ b/image_classification/CrossViT/main_single_gpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
+# Copyright (c) 2021 PPViT Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 import os
 import time
 import logging
+import copy
 import argparse
 import random
 import numpy as np
@@ -29,52 +30,52 @@
 from crossvit import build_crossvit as build_model
 from utils import AverageMeter
 from utils import WarmupCosineScheduler
+from utils import get_exclude_from_weight_decay_fn
 from config import get_config
 from config import update_config
-
-
-parser = argparse.ArgumentParser('CrossViT')
-parser.add_argument('-cfg', type=str, default=None)
-parser.add_argument('-dataset', type=str, default=None)
-parser.add_argument('-batch_size', type=int, default=None)
-parser.add_argument('-image_size', type=int, default=None)
-parser.add_argument('-data_path', type=str, default=None)
-parser.add_argument('-output', type=str, default=None)
-parser.add_argument('-ngpus', type=int, default=None)
-parser.add_argument('-pretrained', type=str, default=None)
-parser.add_argument('-resume', type=str, default=None)
-parser.add_argument('-last_epoch', type=int, default=None)
-parser.add_argument('-eval', action='store_true')
-parser.add_argument('-amp', action='store_true')
-args = parser.parse_args()
-
-
-log_format = "%(asctime)s %(message)s"
-logging.basicConfig(stream=sys.stdout, level=logging.INFO,
-                    format=log_format, datefmt="%m%d %I:%M:%S %p")
-
-# get default config
-config = get_config()
-# update config by arguments
-config = update_config(config, args)
-
-# set output folder
-if not config.EVAL:
-    config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S'))
-else:
-    config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S'))
-
-#config.freeze()
-
-if not os.path.exists(config.SAVE):
-    os.makedirs(config.SAVE, exist_ok=True)
-
-# set logging format
-logger = logging.getLogger()
-fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt'))
-fh.setFormatter(logging.Formatter(log_format))
-logger.addHandler(fh)
-logger.info(f'config= {config}')
+from mixup import Mixup
+from losses import LabelSmoothingCrossEntropyLoss
+from losses import SoftTargetCrossEntropyLoss
+from model_ema import ModelEma
+from crossvit import build_crossvit as build_model
+
+
+def get_arguments():
+    """return arguments, this will overwrite the config after loading yaml file"""
+    parser = argparse.ArgumentParser('CrossViT')
+
parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -82,28 +83,37 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + model_ema=None, + mixup_fn=None, + amp=False, + logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info - accum_iter: int, num of iters for accumulating gradients - amp: bool, if True, use mix precision training + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + accum_iter: int, num of iters for accumulating gradients, default: 1 + model_ema: ModelEma, model moving average instance + mixup_fn: Mixup, mixup instance, default: None + amp: bool, if True, use mix precision training, default: False + logger: logger for logging, default: None Returns: - train_loss_meter.avg - train_acc_meter.avg - train_time + train_loss_meter.avg: float, average loss on current process/gpu + train_acc_meter.avg: float, average top1 accuracy on current process/gpu + train_time: float, training time """ model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() @@ -112,20 +122,22 @@ def train(dataloader, for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - - else: - output = model(image) + else: # full precision training + output = model(image) # output[0]: class_token, output[1]: 
distill_token loss = criterion(output, label) #NOTE: division may be needed depending on the loss function # Here no division is needed: @@ -137,16 +149,23 @@ def train(dataloader, optimizer.step() optimizer.clear_grad() + if model_ema is not None: + model_ema.update(model) + + # average of output and kd_output, like model eval mode pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) batch_size = image.shape[0] train_loss_meter.update(loss.numpy()[0], batch_size) train_acc_meter.update(acc.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + f"Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {train_loss_meter.avg:.4f}, " + f"Avg Acc: {train_acc_meter.avg:.4f}") @@ -155,19 +174,20 @@ def train(dataloader, return train_loss_meter.avg, train_acc_meter.avg, train_time -def validate(dataloader, model, criterion, total_batch, debug_steps=100): +def validate(dataloader, model, criterion, total_batch, debug_steps=100, logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + logger: logger for logging, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + val_time: float, valitaion time """ model.eval() val_loss_meter = AverageMeter() @@ -192,7 +212,7 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): val_acc1_meter.update(acc1.numpy()[0], batch_size) val_acc5_meter.update(acc5.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( f"Val Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {val_loss_meter.avg:.4f}, " + @@ -204,25 +224,82 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): def main(): - # 0. Preparation + # STEP 0: Preparation + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + # set output folder + if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) last_epoch = config.TRAIN.LAST_EPOCH seed = config.SEED paddle.seed(seed) np.random.seed(seed) random.seed(seed) - #paddle.set_device('gpu:0') - # 1. Create model + logger = get_logger(filename=os.path.join(config.SAVE, 'log.txt')) + logger.info(f'\n{config}') + + # STEP 1: Create model model = build_model(config) - # 2. 
Create train and val dataloader + # define model ema + model_ema = None + if not config.EVAL and config.TRAIN.MODEL_EMA: + model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY) + + # STEP 2: Create train and val dataloader if not config.EVAL: dataset_train = get_dataset(config, mode='train') dataloader_train = get_dataloader(config, dataset_train, 'train', False) dataset_val = get_dataset(config, mode='val') dataloader_val = get_dataloader(config, dataset_val, 'val', False) - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. Define lr_scheduler + + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from official code) + + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -244,9 +321,9 @@ def main(): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") - # 5. 
Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": if config.TRAIN.GRAD_CLIP: clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) @@ -266,18 +343,24 @@ def main(): optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, - weight_decay=config.TRAIN.WEIGHT_DECAY, beta1=config.TRAIN.OPTIMIZER.BETAS[0], beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, - grad_clip=clip) + grad_clip=clip, + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), + ) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 6. Load pretrained model or load resume model and optimizer states + + # STEP 6: Load pretrained model or load resume model and optimizer states if config.MODEL.PRETRAINED: - assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') - model_state = paddle.load(config.MODEL.PRETRAINED + '.pdparams') + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") @@ -289,36 +372,47 @@ def main(): opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( - f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") - # 7. Validation + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + # load ema model + if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): + model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') + model_ema.module.set_state_dict(model_ema_state) + logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') + + # STEP 7: Validation (eval mode) if config.EVAL: logger.info('----- Start Validating') val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + f"Validation Acc@5: {val_acc5:.4f}, " + f"time: {val_time:.2f}") return - # 8. Start training and validation - logging.info(f"Start training from epoch {last_epoch + 1}.") - for epoch in range(last_epoch + 1, config.TRAIN.NUM_EPOCHS + 1): + + # STEP 8: Start training and validation (train mode) + logger.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + logger.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") train_loss, train_acc, train_time = train(dataloader=dataloader_train, model=model, criterion=criterion, optimizer=optimizer, epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, total_batch=len(dataloader_train), debug_steps=config.REPORT_FREQ, accum_iter=config.TRAIN.ACCUM_ITER, + model_ema=model_ema, + mixup_fn=mixup_fn, amp=config.AMP, - ) + logger=logger) scheduler.step() logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Train Loss: {train_loss:.4f}, " + @@ -330,9 +424,10 @@ def main(): val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + @@ -346,6 +441,11 @@ def main(): paddle.save(optimizer.state_dict(), model_path + '.pdopt') logger.info(f"----- Save model: {model_path}.pdparams") logger.info(f"----- Save optim: {model_path}.pdopt") + if model_ema is not None: + model_ema_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}-EMA") + paddle.save(model_ema.state_dict(), model_ema_path + '.pdparams') + logger.info(f"----- Save ema model: {model_ema_path}.pdparams") if __name__ == "__main__": diff --git a/image_classification/CrossViT/mixup.py b/image_classification/CrossViT/mixup.py new file mode 100644 index 00000000..1d2db493 --- /dev/null +++ b/image_classification/CrossViT/mixup.py @@ -0,0 +1,225 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""mixup and cutmix for batch data""" +import numpy as np +import paddle + + +def rand_bbox(image_shape, lam, count=None): + """ CutMix bbox by lam value + Generate 1 random bbox by value lam. lam is the cut size rate. + The cut_size is computed by sqrt(1-lam) * image_size. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + count: int, number of bbox to generate + """ + image_h, image_w = image_shape[-2:] + cut_rate = np.sqrt(1. - lam) + cut_h = int(cut_rate * image_h) + cut_w = int(cut_rate * image_w) + + # get random bbox center + cy = np.random.randint(0, image_h, size=count) + cx = np.random.randint(0, image_w, size=count) + + # get bbox coords + bbox_x1 = np.clip(cx - cut_w // 2, 0, image_w) + bbox_y1 = np.clip(cy - cut_h // 2, 0, image_h) + bbox_x2 = np.clip(cx + cut_w // 2, 0, image_w) + bbox_y2 = np.clip(cy + cut_h // 2, 0, image_h) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # while in pytorch, it will return [] tensor + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def rand_bbox_minmax(image_shape, minmax, count=None): + """ CutMix bbox by min and max value + Generate 1 random bbox by min and max percentage values. 
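As a reference for the rand_bbox helper above, a small standalone example (not part of the patch) showing the clipped CutMix box and how the effective lambda is recomputed from its area, as cutmix_generate_bbox_adjust_lam below does when correct_lam=True; the numbers are illustrative only.

import numpy as np

np.random.seed(0)
lam = 0.7                             # sampled mixing ratio
image_shape = (4, 3, 224, 224)        # N, C, H, W
x1, y1, x2, y2 = rand_bbox(image_shape, lam)

# the box may be clipped at the image border, so the erased area can be smaller
# than (1 - lam) * H * W; recompute lambda from the actual box area
box_area = (y2 - y1) * (x2 - x1)
lam_corrected = 1.0 - box_area / float(224 * 224)
print(x1, y1, x2, y2, lam_corrected)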
+ Minmax is a tuple/list of min and max percentage vlaues + applied to the image width and height. + + Args: + image_shape: tuple/list, image height and width + minmax: tuple/list, min and max percentage values of image size + count: int, number of bbox to generate + """ + assert len(minmax) == 2 + image_h, image_w = image_shape[-2:] + min_ratio = minmax[0] + max_ratio = minmax[1] + cut_h = np.random.randint(int(image_h * min_ratio), int(image_h * max_ratio), size=count) + cut_w = np.random.randint(int(image_w * min_ratio), int(image_w * max_ratio), size=count) + + bbox_x1 = np.random.randint(0, image_w - cut_w, size=count) + bbox_y1 = np.random.randint(0, image_h - cut_h, size=count) + bbox_x2 = bbox_x1 + cut_w + bbox_y2 = bbox_y1 + cut_h + + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def cutmix_generate_bbox_adjust_lam(image_shape, lam, minmax=None, correct_lam=True, count=None): + """Generate bbox and apply correction for lambda + If the mimmax is None, apply the standard cutmix by lam value, + If the minmax is set, apply the cutmix by min and max percentage values. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + minmax: tuple/list, min and max percentage values of image size + correct_lam: bool, if True, correct the lam value by the generated bbox + count: int, number of bbox to generate + """ + if minmax is not None: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox_minmax(image_shape, minmax, count) + else: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox(image_shape, lam, count) + + if correct_lam or minmax is not None: + image_h, image_w = image_shape[-2:] + bbox_area = (bbox_y2 - bbox_y1) * (bbox_x2 - bbox_x1) + lam = 1. - bbox_area / float(image_h * image_w) + return (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam + + +def one_hot(x, num_classes, on_value=1., off_value=0.): + """ Generate one-hot vector for label smoothing + Args: + x: tensor, contains label/class indices + num_classes: int, num of classes (len of the one-hot vector) + on_value: float, the vector value at label index, default=1. + off_value: float, the vector value at non-label indices, default=0. + Returns: + one_hot: tensor, tensor with on value at label index and off value + at non-label indices. + """ + x = x.reshape_([-1, 1]) + x_smoothed = paddle.full((x.shape[0], num_classes), fill_value=off_value) + for i in range(x.shape[0]): + x_smoothed[i, x[i]] = on_value + return x_smoothed + + +def mixup_one_hot(label, num_classes, lam=1., smoothing=0.): + """ mixup and label smoothing in batch + label smoothing is firstly applied, then + mixup is applied by mixing the bacth and its flip, + with a mixup rate. + + Args: + label: tensor, label tensor with shape [N], contains the class indices + num_classes: int, num of all classes + lam: float, mixup rate, default=1.0 + smoothing: float, label smoothing rate + """ + off_value = smoothing / num_classes + on_value = 1. 
- smoothing + off_value + y1 = one_hot(label, num_classes, on_value, off_value) + y2 = one_hot(label.flip(axis=[0]), num_classes, on_value, off_value) + return y2 * (1 - lam) + y1 * lam + + +class Mixup: + """Mixup class + Args: + mixup_alpha: float, mixup alpha for beta distribution, default=1.0, + cutmix_alpha: float, cutmix alpha for beta distribution, default=0.0, + cutmix_minmax: list/tuple, min and max value for cutmix ratio, default=None, + prob: float, if random prob < prob, do not use mixup, default=1.0, + switch_prob: float, prob of switching mixup and cutmix, default=0.5, + mode: string, mixup up, now only 'batch' is supported, default='batch', + correct_lam: bool, if True, apply correction of lam, default=True, + label_smoothing: float, label smoothing rate, default=0.1, + num_classes: int, num of classes, default=1000 + """ + def __init__(self, + mixup_alpha=1.0, + cutmix_alpha=0.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + correct_lam=True, + label_smoothing=0.1, + num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if cutmix_minmax is not None: + assert len(cutmix_minmax) == 2 + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam + assert mode == 'batch', 'Now only batch mode is supported!' + + def __call__(self, x, target): + assert x.shape[0] % 2 == 0, "Batch size should be even" + lam = self._mix_batch(x) + target = mixup_one_hot(target, self.num_classes, lam, self.label_smoothing) + return x, target + + def get_params(self): + """Decide to use cutmix or regular mixup by sampling and + sample lambda for mixup + """ + lam = 1. + use_cutmix = False + use_mixup = np.random.rand() < self.mix_prob + if use_mixup: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + alpha = self.cutmix_alpha if use_cutmix else self.mixup_alpha + lam_mix = np.random.beta(alpha, alpha) + elif self.mixup_alpha == 0. and self.cutmix_alpha > 0.: + use_cutmix=True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + elif self.mixup_alpha > 0. 
and self.cutmix_alpha == 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + else: + raise ValueError('mixup_alpha and cutmix_alpha cannot be all 0') + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_batch(self, x): + """mixup/cutmix by adding batch data and its flipped version""" + lam, use_cutmix = self.get_params() + if lam == 1.: + return lam + if use_cutmix: + (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam = cutmix_generate_bbox_adjust_lam( + x.shape, + lam, + minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # but in pytorch, it will return [] tensor without errors + if int(bbox_x1) != int(bbox_x2) and int(bbox_y1) != int(bbox_y2): + x[:, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] = x.flip(axis=[0])[ + :, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] + else: + x_flipped = x.flip(axis=[0]) + x_flipped = x_flipped * (1 - lam) + x.set_value(x * (lam) + x_flipped) + return lam diff --git a/image_classification/CrossViT/model_ema.py b/image_classification/CrossViT/model_ema.py new file mode 100644 index 00000000..8a636765 --- /dev/null +++ b/image_classification/CrossViT/model_ema.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Implement the Exponential Model Averaging +This is paddle hack from: +https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py +""" + +import copy +from collections import OrderedDict +import paddle +import paddle.nn as nn + + +class ModelEma: + """Model Ema + A moving average is kept of model weights and buffers. + Note that for multiple gpu, ema must be defined after mode init, + but before DataParallel. 
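A hypothetical usage sketch (not part of the patch) of the Mixup class defined above, mirroring the mixup_fn(image, label_orig) call in the trainer; batch size and number of classes are arbitrary.

import paddle

mixup_fn = Mixup(mixup_alpha=0.8,
                 cutmix_alpha=1.0,
                 prob=1.0,
                 switch_prob=0.5,
                 label_smoothing=0.1,
                 num_classes=1000)

images = paddle.randn([8, 3, 224, 224])         # batch size must be even
labels = paddle.randint(0, 1000, [8])
images, soft_labels = mixup_fn(images, labels)  # soft_labels: [8, 1000] mixed one-hot
# the soft targets are consumed by SoftTargetCrossEntropyLoss, not nn.CrossEntropyLoss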
+ + Args: + model: nn.Layer, original modela with learnable params + decay: float, decay rate for each update, default: 0.999 + """ + def __init__(self, model, decay=0.999): + self.module = copy.deepcopy(model) + self.module.eval() + self.decay = decay + + @paddle.no_grad() + def _update(self, model, update_fn): + # update ema model parameters by model parameters + for (_, ema_param), (_, model_param) in zip( + self.module.named_parameters(), model.named_parameters()): + ema_param.set_value(copy.deepcopy(update_fn(ema_param, model_param))) + + # update ema model buffers by model buffers + for (_, ema_buf), (_, model_buf) in zip( + self.module.named_buffers(), model.named_buffers()): + ema_buf.set_value(copy.deepcopy(update_fn(ema_buf, model_buf))) + + def update(self, model): + self._update(model, update_fn=lambda e, m: self.decay * e + (1 - self.decay) * m) + + def set(self, model): + self._update(model, update_fn=lambda e, m: m) + + def state_dict(self): + return self.module.state_dict() + diff --git a/image_classification/CrossViT/demo.py b/image_classification/CrossViT/port_weights/demo.py similarity index 100% rename from image_classification/CrossViT/demo.py rename to image_classification/CrossViT/port_weights/demo.py diff --git a/image_classification/CrossViT/random_erasing.py b/image_classification/CrossViT/random_erasing.py new file mode 100644 index 00000000..31eea465 --- /dev/null +++ b/image_classification/CrossViT/random_erasing.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Random Erasing for image tensor""" + +import random +import math +import paddle + + +def _get_pixels(per_pixel, rand_color, patch_size, dtype="float32"): + if per_pixel: + return paddle.normal(shape=patch_size).astype(dtype) + if rand_color: + return paddle.normal(shape=(patch_size[0], 1, 1)).astype(dtype) + return paddle.zeros((patch_size[0], 1, 1)).astype(dtype) + + +class RandomErasing(object): + """ + Args: + prob: probability of performing random erasing + min_area: Minimum percentage of erased area wrt input image area + max_area: Maximum percentage of erased area wrt input image area + min_aspect: Minimum aspect ratio of earsed area + max_aspect: Maximum aspect ratio of earsed area + mode: pixel color mode, in ['const', 'rand', 'pixel'] + 'const' - erase block is constant valued 0 for all channels + 'rand' - erase block is valued random color (same per-channel) + 'pixel' - erase block is vauled random color per pixel + min_count: Minimum # of ereasing blocks per image. + max_count: Maximum # of ereasing blocks per image. 
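A simplified sketch (not part of the patch) of the intended ModelEma workflow, reusing build_model, config, criterion, optimizer and dataloader_train from the trainer code above; everything else is illustrative.

import paddle

# create the EMA copy after the model is built, but before any DataParallel wrapping
model = build_model(config)
model_ema = ModelEma(model, decay=config.TRAIN.MODEL_EMA_DECAY)

for images, labels in dataloader_train:          # bare-bones training loop
    loss = criterion(model(images), labels)
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    model_ema.update(model)                      # ema = decay * ema + (1 - decay) * model

# evaluate or checkpoint the smoothed weights
paddle.save(model_ema.state_dict(), 'model-EMA.pdparams')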
Area per box is scaled by count + per-image count is randomly chosen between min_count to max_count + """ + def __init__(self, prob=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0): + self.prob = prob + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == "rand": + self.rand_color = True + elif mode == "pixel": + self.per_pixel = True + else: + assert not mode or mode == "const" + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.prob: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top+h, left:left+w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + + +#def main(): +# re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='rand') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='const') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='pixel') +# import PIL.Image as Image +# import numpy as np +# paddle.set_device('cpu') +# img = paddle.to_tensor(np.asarray(Image.open('./lenna.png'))).astype('float32') +# img = img / 255.0 +# img = paddle.transpose(img, [2, 0, 1]) +# new_img = re(img) +# new_img = new_img * 255.0 +# new_img = paddle.transpose(new_img, [1, 2, 0]) +# new_img = new_img.cpu().numpy() +# new_img = Image.fromarray(new_img.astype('uint8')) +# new_img.save('./res.png') +# +# +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/CrossViT/run_eval_tiny_224.sh b/image_classification/CrossViT/run_eval_tiny_224.sh index 67ac07e3..c4f211c3 100644 --- a/image_classification/CrossViT/run_eval_tiny_224.sh +++ b/image_classification/CrossViT/run_eval_tiny_224.sh @@ -1,5 +1,5 @@ -CUDA_VISIBLE_DEVICES=0 \ -python main_single_gpu.py \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ -cfg='./configs/crossvit_tiny_224.yaml' \ -dataset='imagenet2012' \ -batch_size=128 \ diff --git a/image_classification/CrossViT/run_train_multi_tiny_224.sh b/image_classification/CrossViT/run_train_multi_tiny_224.sh index 76665314..c97a88b7 100644 --- a/image_classification/CrossViT/run_train_multi_tiny_224.sh +++ b/image_classification/CrossViT/run_train_multi_tiny_224.sh @@ -1,7 +1,7 @@ -CUDA_VISIBLE_DEVICES=0,1,2,3 \ +CUDA_VISIBLE_DEVICES=4,5,6,7 \ python main_multi_gpu.py \ -cfg='./configs/crossvit_tiny_224.yaml' \ 
-dataset='imagenet2012' \ -batch_size=8 \ -data_path='/dataset/imagenet' \ --amp +#-amp diff --git a/image_classification/CrossViT/t2t.py b/image_classification/CrossViT/t2t.py index 11d69b32..516abfa5 100755 --- a/image_classification/CrossViT/t2t.py +++ b/image_classification/CrossViT/t2t.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,8 +16,6 @@ import paddle import paddle.nn as nn from crossvit_utils import DropPath, Identity, to_2tuple -import paddlenlp - def get_sinusoid_encoding(n_position, d_hid): ''' Sinusoid position encoding table ''' @@ -37,18 +35,24 @@ def __init__(self, dim, in_dim, head_cnt=1, kernel_ratio=0.5, dp1=0.1, dp2=0.1): # def __init__(self, dim, in_dim, head_cnt=1, kernel_ratio=0.5, dp1=0.0, dp2=0.0): super().__init__() self.emb = in_dim * head_cnt # we use 1, so it is no need here - self.kqv = nn.Linear(dim, 3 * self.emb) + w_attr_1, b_attr_1 = self._init_weights() + self.kqv = nn.Linear(dim, 3 * self.emb, weight_attr=w_attr_1, bias_attr=b_attr_1) self.dp = nn.Dropout(dp1) - self.proj = nn.Linear(self.emb, self.emb) + w_attr_2, b_attr_2 = self._init_weights() + self.proj = nn.Linear(self.emb, self.emb, weight_attr=w_attr_2, bias_attr=b_attr_2) self.head_cnt = head_cnt - self.norm1 = nn.LayerNorm(dim) - self.norm2 = nn.LayerNorm(self.emb) + w_attr_3, b_attr_3 = self._init_weights_norm() + w_attr_4, b_attr_4 = self._init_weights_norm() + self.norm1 = nn.LayerNorm(dim, weight_attr=w_attr_3, bias_attr=b_attr_3) + self.norm2 = nn.LayerNorm(self.emb, weight_attr=w_attr_4, bias_attr=b_attr_4) self.epsilon = 1e-8 # for stable in division + w_attr_5, b_attr_5 = self._init_weights() + w_attr_6, b_attr_6 = self._init_weights() self.mlp = nn.Sequential( - nn.Linear(self.emb, 1 * self.emb), + nn.Linear(self.emb, 1 * self.emb, weight_attr=w_attr_5, bias_attr=b_attr_5), nn.GELU(), - nn.Linear(1 * self.emb, self.emb), + nn.Linear(1 * self.emb, self.emb, weight_attr=w_attr_6, bias_attr=b_attr_6), nn.Dropout(dp2), ) @@ -57,18 +61,32 @@ def __init__(self, dim, in_dim, head_cnt=1, kernel_ratio=0.5, dp1=0.1, dp2=0.1): # todo wait implement # self.w = nn.Parameter(nn.init.orthogonal_(self.w) * math.sqrt(self.m), requires_grad=False) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + + def _init_weights_norm(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def prm_exp(self, x): xd = ((x * x).sum(dim=-1, keepdim=True)).repeat(1, 1, self.m) / 2 - wtx = paddlenlp.ops.einsum('bti,mi->btm', x.float(), self.w) + wtx = paddle.matmul(x.float(), self.w, transpose_y=True) + #wtx = paddlenlp.ops.einsum('bti,mi->btm', x.float(), self.w) return paddle.exp(wtx - xd) / math.sqrt(self.m) def single_attn(self, x): k, q, v = paddle.split(self.kqv(x), self.emb, axis=-1) kp, qp = self.prm_exp(k), self.prm_exp(q) - D = paddlenlp.ops.einsum('bti,bi->bt', qp, kp.sum(dim=1)).unsqueeze(dim=2) - kptv = paddlenlp.ops.einsum('bin,bim->bnm', v.float(), kp) # (B, emb, m) - y = paddlenlp.ops.einsum('bti,bni->btn', qp, kptv) / (D.repeat(1, 1, self.emb) + 
self.epsilon) + D = paddle.matmul(qp, kp.sum(dim=1)).unsqueeze(dim=2) + #D = paddlenlp.ops.einsum('bti,bi->bt', qp, kp.sum(dim=1)).unsqueeze(dim=2) + kptv = paddle.matmul(v.float(), kp, transpose_x=True) + #kptv = paddlenlp.ops.einsum('bin,bim->bnm', v.float(), kp) # (B, emb, m) + y = paddle.matmul(qp, kptv, transpose_y=True) / (D.repeat(1, 1, self.emb) + self.epsilon) + #y = paddlenlp.ops.einsum('bti,bni->btn', qp, kptv) / (D.repeat(1, 1, self.emb) + self.epsilon) # skip connection y = v + self.dp(self.proj(y)) @@ -85,11 +103,18 @@ def __init__(self, in_features, hidden_features=None, out_features=None, act_lay super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) + w_attr_1, b_attr_1 = self._init_weights() + self.fc1 = nn.Linear(in_features, hidden_features, weight_attr=w_attr_1, bias_attr=b_attr_1) self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) + w_attr_2, b_attr_2 = self._init_weights() + self.fc2 = nn.Linear(hidden_features, out_features, weight_attr=w_attr_2, bias_attr=b_attr_2) self.drop = nn.Dropout(drop) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): x = self.fc1(x) x = self.act(x) @@ -107,11 +132,20 @@ def __init__(self, dim, num_heads=8, in_dim=None, qkv_bias=False, qk_scale=None, head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 - self.qkv = nn.Linear(dim, in_dim * 3) + w_attr_1, b_attr_1 = self._init_weights() + self.qkv = nn.Linear(dim, in_dim * 3, weight_attr=w_attr_1, bias_attr=b_attr_1) + self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(in_dim, in_dim) + w_attr_2, b_attr_2 = self._init_weights() + self.proj = nn.Linear(in_dim, in_dim, weight_attr=w_attr_2, bias_attr=b_attr_2) + self.proj_drop = nn.Dropout(proj_drop) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): B, N, C = x.shape @@ -137,7 +171,8 @@ class Token_transformer(nn.Layer): def __init__(self, dim, in_dim, num_heads, mlp_ratio=1., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() - self.norm1 = norm_layer(dim) + w_attr_1, b_attr_1 = self._init_weights_norm() + self.norm1 = norm_layer(dim, weight_attr=w_attr_1, bias_attr=b_attr_1) self.attn = Attention(dim, in_dim=in_dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() @@ -145,6 +180,11 @@ def __init__(self, dim, in_dim, num_heads, mlp_ratio=1., qkv_bias=False, qk_scal self.mlp = Mlp(in_features=in_dim, hidden_features=int(in_dim * mlp_ratio), out_features=in_dim, act_layer=act_layer, drop=drop) + def _init_weights_norm(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): x = self.attn(self.norm1(x)) x = x + self.drop_path(self.mlp(self.norm2(x))) @@ -179,19 +219,32 @@ def __init__(self, img_size=224, patch_size=16, tokens_type='transformer', in_ch mlp_ratio=1.0) self.attention2 = Token_transformer(dim=token_dim * (kernel_size[1][0] ** 2), in_dim=token_dim, num_heads=1, mlp_ratio=1.0) - self.project = nn.Linear(token_dim * (kernel_size[2][0] ** 2), embed_dim) + w_attr_1, b_attr_1 = self._init_weights() + self.project = nn.Linear(token_dim * (kernel_size[2][0] ** 2), + embed_dim, + weight_attr=w_attr_1, + bias_attr=b_attr_1) elif tokens_type == 'performer': self.attention1 = Token_performer(dim=in_chans * (kernel_size[0][0] ** 2), in_dim=token_dim, kernel_ratio=0.5) self.attention2 = Token_performer(dim=token_dim * (kernel_size[1][0] ** 2), in_dim=token_dim, kernel_ratio=0.5) - self.project = nn.Linear(token_dim * (kernel_size[2][0] ** 2), embed_dim) + w_attr_1, b_attr_1 = self._init_weights() + self.project = nn.Linear(token_dim * (kernel_size[2][0] ** 2), + embed_dim, + weight_attr=w_attr_1, + bias_attr=b_attr_1) self.num_patches = (img_size // (kernel_size[0][1] * kernel_size[1][1] * kernel_size[2][1])) * (img_size // ( kernel_size[0][1] * kernel_size[1][1] * kernel_size[2][ 1])) # there are 3 sfot split, stride are 4,2,2 seperately + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): # step0: soft split x = self.soft_split0(x).transpose(1, 2) @@ -244,11 +297,20 @@ def __init__(self, img_size=224, patch_size=16, tokens_type='transformer', in_ch mlp_ratio=1.0) self.attention2 = Token_transformer(dim=token_dim * (kernel_size[1][0] ** 2), in_dim=token_dim, num_heads=1, mlp_ratio=1.0) - self.project = nn.Linear(token_dim * (kernel_size[2][0] ** 2), embed_dim) + w_attr_1, b_attr_1 = self._init_weights() + self.project = nn.Linear(token_dim * (kernel_size[2][0] ** 2), + embed_dim, + weight_attr=w_attr_1, + bias_attr=b_attr_1) self.num_patches = (img_size // (kernel_size[0][1] * kernel_size[1][1] * kernel_size[2][1])) * (img_size // ( kernel_size[0][1] * kernel_size[1][1] * kernel_size[2][1])) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): # step0: soft split x = self.soft_split0(x).transpose(1, 2) diff --git a/image_classification/CrossViT/transforms.py b/image_classification/CrossViT/transforms.py new file mode 100644 index 00000000..5a046912 --- /dev/null +++ b/image_classification/CrossViT/transforms.py @@ -0,0 +1,14 @@ +import random +import paddle +import paddle.nn +import paddle.vision.transforms as T + + +class RandomHorizontalFlip(): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, image): + if random.random() < self.p: + return T.hflip(image) + 
return image diff --git a/image_classification/CvT/augment.py b/image_classification/CvT/augment.py index 3d27120d..7a7f081c 100644 --- a/image_classification/CvT/augment.py +++ b/image_classification/CvT/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy @@ -282,3 +282,4 @@ def brightness(image, magnitude): def sharpness(image, magnitude): magnitude = magnitude * random.choice([-1, 1]) # random negative return ImageEnhance.Sharpness(image).enhance(1 + magnitude) + diff --git a/image_classification/CvT/config.py b/image_classification/CvT/config.py index 71965789..ae43eb57 100644 --- a/image_classification/CvT/config.py +++ b/image_classification/CvT/config.py @@ -65,12 +65,14 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 5 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.BASE_LR = 0.02 _C.TRAIN.WARMUP_START_LR = 2e-6 _C.TRAIN.END_LR = 2e-5 _C.TRAIN.GRAD_CLIP = None _C.TRAIN.ACCUM_ITER = 1 -_C.TRAIN.LINEAR_SCALED_LR = 512 +_C.TRAIN.MODEL_EMA = False +_C.TRAIN.MODEL_EMA_DECAY = 0.99992 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -93,28 +95,15 @@ _C.TRAIN.MIXUP_MODE = 'batch' _C.TRAIN.SMOOTHING = 0.1 -_C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.COLOR_JITTER = 0.4 # color jitter factor +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False -_C.TRAIN.RANDOM_ERASE_PROB = 0.25 -_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' -_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 # random erase prob +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' # random erase mode +_C.TRAIN.RANDOM_ERASE_COUNT = 1 # random erase count _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" diff --git a/image_classification/CvT/cvt.py b/image_classification/CvT/cvt.py index 784b04b5..5d4c0915 100644 --- a/image_classification/CvT/cvt.py +++ b/image_classification/CvT/cvt.py @@ -67,9 +67,9 @@ def __init__(self, def _init_weights(self): weight_attr = paddle.ParamAttr( - initializer=nn.initializer.XavierUniform()) + initializer=nn.initializer.TruncatedNormal(std=.02)) bias_attr = paddle.ParamAttr( - initializer=nn.initializer.Normal(std=1e-6)) + initializer=nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -180,15 +180,24 @@ def __init__(self, ) # init parameters of q,k,v - self.proj_q = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) 
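For reference, a self-contained sketch (not part of the patch) of the weight_attr/bias_attr initialization pattern these hunks apply across t2t.py and cvt.py: truncated-normal weights and zero biases passed explicitly to nn.Linear; the layer size is arbitrary.

import paddle
import paddle.nn as nn

def _init_weights():
    # truncated-normal weights (std=0.02), constant-zero bias
    weight_attr = paddle.ParamAttr(initializer=nn.initializer.TruncatedNormal(std=.02))
    bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0))
    return weight_attr, bias_attr

w_attr, b_attr = _init_weights()
proj = nn.Linear(64, 64, weight_attr=w_attr, bias_attr=b_attr)
print(float(proj.weight.std()))   # roughly 0.02
print(float(proj.bias.sum()))     # 0.0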
- self.proj_k = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) - self.proj_v = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) + w_attr_1, b_attr_1 = self._init_weights() + w_attr_2, b_attr_2 = self._init_weights() + w_attr_3, b_attr_3 = self._init_weights() + self.proj_q = nn.Linear(dim_in, dim_out, weight_attr=w_attr_1, bias_attr=b_attr_1 if qkv_bias else False) + self.proj_k = nn.Linear(dim_in, dim_out, weight_attr=w_attr_2, bias_attr=b_attr_2 if qkv_bias else False) + self.proj_v = nn.Linear(dim_in, dim_out, weight_attr=w_attr_3, bias_attr=b_attr_3 if qkv_bias else False) # init project other parameters self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim_out, dim_out) + w_attr_4, b_attr_4 = self._init_weights() + self.proj = nn.Linear(dim_out, dim_out, weight_attr=w_attr_4, bias_attr=b_attr_4) self.proj_drop = nn.Dropout(proj_drop) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def _build_projection(self, dim_in, dim_out, diff --git a/image_classification/CvT/datasets.py b/image_classification/CvT/datasets.py index 833904ed..ec2f82ed 100644 --- a/image_classification/CvT/datasets.py +++ b/image_classification/CvT/datasets.py @@ -28,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from random_erasing import RandomErasing @@ -69,7 +71,7 @@ def __len__(self): return len(self.label_list) def __getitem__(self, index): - data = image_load(self.img_path_list[index]).convert('RGB') + data = Image.open(self.img_path_list[index]).convert('RGB') data = self.transform(data) label = self.label_list[index] @@ -99,10 +101,14 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: - jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) - # STEP3: other ops + jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD)) diff --git a/image_classification/CvT/main_multi_gpu.py b/image_classification/CvT/main_multi_gpu.py index 22822b56..d1eef8cf 100644 --- a/image_classification/CvT/main_multi_gpu.py +++ b/image_classification/CvT/main_multi_gpu.py @@ -324,9 +324,9 @@ def main_worker(*args): if not config.EVAL: dataloader_train = get_dataloader(config, dataset_train, 'train', True) total_batch_train = len(dataloader_train) - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + local_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') if local_rank == 0: - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + master_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') # Create validation dataloader dataloader_val = get_dataloader(config, dataset_val, 'test', True) total_batch_val = len(dataloader_val) diff --git a/image_classification/CvT/run_train_multi.sh 
b/image_classification/CvT/run_train_multi.sh index fb76898b..3f3aef29 100644 --- a/image_classification/CvT/run_train_multi.sh +++ b/image_classification/CvT/run_train_multi.sh @@ -4,4 +4,4 @@ python main_multi_gpu.py \ -dataset='imagenet2012' \ -batch_size=16 \ -data_path='/dataset/imagenet' \ - -amp \ No newline at end of file + #-amp diff --git a/image_classification/CycleMLP/augment.py b/image_classification/CycleMLP/augment.py index aade134e..7a7f081c 100644 --- a/image_classification/CycleMLP/augment.py +++ b/image_classification/CycleMLP/augment.py @@ -28,37 +28,37 @@ def auto_augment_policy_original(): """25 types of augment policies in original paper""" policy = [ - [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], - [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], - [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], - [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], - [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], - [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], - [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], - [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], - [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], - [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], - [('Rotate', 0.8, 8), ('Color', 0.4, 0)], - [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], - [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], - [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], - [('Color', 0.6, 4), ('Contrast', 1.0, 8)], - [('Rotate', 0.8, 8), ('Color', 1.0, 2)], - [('Color', 0.8, 8), ('Solarize', 0.8, 7)], - [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], - [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], - [('Color', 0.4, 0), ('Equalize', 0.6, 3)], - [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], - [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], - [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], - [('Color', 0.6, 4), ('Contrast', 1.0, 8)], - [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], ] policy = [[SubPolicy(*args) for args in subpolicy] for subpolicy in policy] return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -87,12 +87,13 @@ class AutoAugment(): augment = AutoAugment(policy) transformed_image = augment(image) """ + def __init__(self, policy): self.policy = policy - + def __call__(self, image, policy_idx=None): if policy_idx is None: - policy_idx = random.randint(0, 
len(self.policy)-1) + policy_idx = random.randint(0, len(self.policy) - 1) sub_policy = self.policy[policy_idx] for op in sub_policy: @@ -111,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy @@ -137,58 +138,59 @@ class SubPolicy: prob: float, if prob > random prob, apply augment magnitude_idx: int, index of magnitude in preset magnitude ranges """ + def __init__(self, op_name, prob, magnitude_idx): # ranges of operations' magnitude ranges = { - 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) - 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) - 'TranslateX': np.linspace(0, 150 / 331, 10), #[-0.45, 0.45] (by random negative) - 'TranslateY': np.linspace(0, 150 / 331, 10), #[-0.45, 0.45] (by random negative) - 'Rotate': np.linspace(0, 30, 10), #[-30, 30] (by random negative) - 'Color': np.linspace(0, 0.9, 10), #[-0.9, 0.9] (by random negative) - 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), #[0, 4] - 'Solarize': np.linspace(256, 0, 10), #[0, 256] - 'Contrast': np.linspace(0, 0.9, 10), #[-0.9, 0.9] (by random negative) - 'Sharpness': np.linspace(0, 0.9, 10), #[-0.9, 0.9] (by random negative) - 'Brightness': np.linspace(0, 0.9, 10), #[-0.9, 0.9] (by random negative) - 'AutoContrast': [0] * 10, # no range - 'Equalize': [0] * 10, # no range - 'Invert': [0] * 10, # no range + 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'TranslateX': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'TranslateY': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'Rotate': np.linspace(0, 30, 10), # [-30, 30] (by random negative) + 'Color': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), # [0, 4] + 'Solarize': np.linspace(256, 0, 10), # [0, 256] + 'Contrast': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Sharpness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Brightness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'AutoContrast': [0] * 10, # no range + 'Equalize': [0] * 10, # no range + 'Invert': [0] * 10, # no range } - - # augmentation operations + + # augmentation operations # Lambda is not pickleable for DDP - #image_ops = { - # 'ShearX': lambda image, magnitude: shear_x(image, magnitude), - # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), - # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), - # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), - # 'Rotate': lambda image, magnitude: rotate(image, magnitude), - # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), - # 'Invert': lambda image, magnitude: invert(image, magnitude), - # 'Equalize': lambda image, magnitude: equalize(image, magnitude), - # 'Solarize': lambda image, magnitude: solarize(image, magnitude), - # 'Posterize': lambda image, magnitude: posterize(image, magnitude), - # 'Contrast': lambda image, magnitude: contrast(image, magnitude), - # 'Color': lambda image, magnitude: color(image, magnitude), - # 'Brightness': lambda image, magnitude: brightness(image, magnitude), - # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), - #} + # image_ops = { + # 'ShearX': lambda image, magnitude: 
shear_x(image, magnitude), + # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), + # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), + # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), + # 'Rotate': lambda image, magnitude: rotate(image, magnitude), + # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), + # 'Invert': lambda image, magnitude: invert(image, magnitude), + # 'Equalize': lambda image, magnitude: equalize(image, magnitude), + # 'Solarize': lambda image, magnitude: solarize(image, magnitude), + # 'Posterize': lambda image, magnitude: posterize(image, magnitude), + # 'Contrast': lambda image, magnitude: contrast(image, magnitude), + # 'Color': lambda image, magnitude: color(image, magnitude), + # 'Brightness': lambda image, magnitude: brightness(image, magnitude), + # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), + # } image_ops = { - 'ShearX': shear_x, - 'ShearY': shear_y, - 'TranslateX': translate_x_relative, - 'TranslateY': translate_y_relative, - 'Rotate': rotate, - 'AutoContrast': auto_contrast, - 'Invert': invert, - 'Equalize': equalize, - 'Solarize': solarize, - 'Posterize': posterize, - 'Contrast': contrast, - 'Color': color, - 'Brightness': brightness, - 'Sharpness': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_relative, + 'TranslateY': translate_y_relative, + 'Rotate': rotate, + 'AutoContrast': auto_contrast, + 'Invert': invert, + 'Equalize': equalize, + 'Solarize': solarize, + 'Posterize': posterize, + 'Contrast': contrast, + 'Color': color, + 'Brightness': brightness, + 'Sharpness': sharpness, } self.prob = prob @@ -204,41 +206,41 @@ def __call__(self, image): # PIL Image transforms # https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.transform def shear_x(image, magnitude, fillcolor=(128, 128, 128)): - factor = magnitude * random.choice([-1, 1]) # random negative + factor = magnitude * random.choice([-1, 1]) # random negative return image.transform(image.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), fillcolor=fillcolor) def shear_y(image, magnitude, fillcolor=(128, 128, 128)): - factor = magnitude * random.choice([-1, 1]) # random negative + factor = magnitude * random.choice([-1, 1]) # random negative return image.transform(image.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), fillcolor=fillcolor) def translate_x_relative(image, magnitude, fillcolor=(128, 128, 128)): pixels = magnitude * image.size[0] - pixels = pixels * random.choice([-1, 1]) # random negative + pixels = pixels * random.choice([-1, 1]) # random negative return image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), fillcolor=fillcolor) def translate_y_relative(image, magnitude, fillcolor=(128, 128, 128)): pixels = magnitude * image.size[0] - pixels = pixels * random.choice([-1, 1]) # random negative + pixels = pixels * random.choice([-1, 1]) # random negative return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), fillcolor=fillcolor) def translate_x_absolute(image, magnitude, fillcolor=(128, 128, 128)): - magnitude = magnitude * random.choice([-1, 1]) # random negative + magnitude = magnitude * random.choice([-1, 1]) # random negative return image.transform(image.size, Image.AFFINE, (1, 0, magnitude, 0, 1, 0), fillcolor=fillcolor) def translate_y_absolute(image, magnitude, fillcolor=(128, 128, 128)): - magnitude = magnitude * random.choice([-1, 1]) # random negative + magnitude = magnitude * random.choice([-1, 1]) 
# random negative return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude), fillcolor=fillcolor) def rotate(image, magnitude): rot = image.convert("RGBA").rotate(magnitude) return Image.composite(rot, - Image.new('RGBA', rot.size, (128, ) * 4), + Image.new('RGBA', rot.size, (128,) * 4), rot).convert(image.mode) @@ -263,21 +265,21 @@ def posterize(image, magnitude): def contrast(image, magnitude): - magnitude = magnitude * random.choice([-1, 1]) # random negative + magnitude = magnitude * random.choice([-1, 1]) # random negative return ImageEnhance.Contrast(image).enhance(1 + magnitude) def color(image, magnitude): - magnitude = magnitude * random.choice([-1, 1]) # random negative + magnitude = magnitude * random.choice([-1, 1]) # random negative return ImageEnhance.Color(image).enhance(1 + magnitude) def brightness(image, magnitude): - magnitude = magnitude * random.choice([-1, 1]) # random negative + magnitude = magnitude * random.choice([-1, 1]) # random negative return ImageEnhance.Brightness(image).enhance(1 + magnitude) def sharpness(image, magnitude): - magnitude = magnitude * random.choice([-1, 1]) # random negative + magnitude = magnitude * random.choice([-1, 1]) # random negative return ImageEnhance.Sharpness(image).enhance(1 + magnitude) diff --git a/image_classification/CycleMLP/config.py b/image_classification/CycleMLP/config.py index 66a12636..99754c68 100644 --- a/image_classification/CycleMLP/config.py +++ b/image_classification/CycleMLP/config.py @@ -46,6 +46,9 @@ _C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None _C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 # transformer settings _C.MODEL.MIXER = CN() @@ -60,11 +63,12 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 20 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.BASE_LR = 0.001 _C.TRAIN.WARMUP_START_LR = 5e-7 _C.TRAIN.END_LR = 5e-6 _C.TRAIN.GRAD_CLIP = 5.0 _C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -88,36 +92,23 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" -_C.SAVE_FREQ = 1 # freq to save chpt +_C.SAVE_FREQ = 20 # freq to save chpt _C.REPORT_FREQ = 50 # freq to logging info -_C.VALIDATE_FREQ = 10 # freq to do validation +_C.VALIDATE_FREQ = 20 # freq to do validation _C.SEED = 0 _C.EVAL = False # run evaluation only -_C.AMP = False +_C.AMP = False # mix precision training _C.LOCAL_RANK = 0 _C.NGPUS = -1 @@ -151,6 
+142,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/CycleMLP/datasets.py b/image_classification/CycleMLP/datasets.py index 8a1eac06..304df9a3 100644 --- a/image_classification/CycleMLP/datasets.py +++ b/image_classification/CycleMLP/datasets.py @@ -1,185 +1,222 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Dataset related classes and methods for ViT training and validation -Cifar10, Cifar100 and ImageNet2012 are supported -""" - -import os -import math -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load - -class ImageNet2012Dataset(Dataset): - """Build ImageNet2012 dataset - - This class gets train/val imagenet datasets, which loads transfomed data and labels. - - Attributes: - file_folder: path where imagenet images are stored - transform: preprocessing ops to apply on image - img_path_list: list of full path of images in whole dataset - label_list: list of labels of whole dataset - """ - - def __init__(self, file_folder, mode="train", transform=None): - """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" - super(ImageNet2012Dataset, self).__init__() - assert mode in ["train", "val"] - self.file_folder = file_folder - self.transform = transform - self.img_path_list = [] - self.label_list = [] - - if mode == "train": - self.list_file = os.path.join(self.file_folder, "train_list.txt") - else: - self.list_file = os.path.join(self.file_folder, "val_list.txt") - - with open(self.list_file, 'r') as infile: - for line in infile: - img_path = line.strip().split()[0] - img_label = int(line.strip().split()[1]) - self.img_path_list.append(os.path.join(self.file_folder, img_path)) - self.label_list.append(img_label) - print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') - - def __len__(self): - return len(self.label_list) - - def __getitem__(self, index): - data = image_load(self.img_path_list[index]).convert('RGB') - data = self.transform(data) - label = self.label_list[index] - - return data, label - - -def get_train_transforms(config): - """ Get training transforms - - For training, a RandomResizedCrop is applied, then normalization is applied with - [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] 
- Outputs is converted to tensor - - Args: - config: configs contains IMAGE_SIZE, see config.py for details - Returns: - transforms_train: training transforms - """ - - transforms_train = transforms.Compose([ - transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) - return transforms_train - - -def get_val_transforms(config): - """ Get training transforms - - For validation, image is first Resize then CenterCrop to image_size. - Then normalization is applied with [0.5, 0.5, 0.5] mean and std. - The input pixel values must be rescaled to [0, 1.] - Outputs is converted to tensor - - Args: - config: configs contains IMAGE_SIZE, see config.py for details - Returns: - transforms_train: training transforms - """ - - scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) - transforms_val = transforms.Compose([ - transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image - transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) - return transforms_val - - -def get_dataset(config, mode='train'): - """ Get dataset from config and mode (train/val) - - Returns the related dataset object according to configs and mode(train/val) - - Args: - config: configs contains dataset related settings. see config.py for details - Returns: - dataset: dataset object - """ - assert mode in ['train', 'val'] - if config.DATA.DATASET == "cifar10": - if mode == 'train': - dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) - else: - mode = 'test' - dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) - elif config.DATA.DATASET == "cifar100": - if mode == 'train': - dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) - else: - mode = 'test' - dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) - elif config.DATA.DATASET == "imagenet2012": - if mode == 'train': - dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, - mode=mode, - transform=get_train_transforms(config)) - else: - dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, - mode=mode, - transform=get_val_transforms(config)) - else: - raise NotImplementedError( - "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") - return dataset - - -def get_dataloader(config, dataset, mode='train', multi_process=False): - """Get dataloader with config, dataset, mode as input, allows multiGPU settings. - - Multi-GPU loader is implements as distributedBatchSampler. - - Args: - config: see config.py for details - dataset: paddle.io.dataset object - mode: train/val - multi_process: if True, use DistributedBatchSampler to support multi-processing - Returns: - dataloader: paddle.io.DataLoader object. 
- """ - - if mode == 'train': - batch_size = config.DATA.BATCH_SIZE - else: - batch_size = config.DATA.BATCH_SIZE_EVAL - - if multi_process is True: - sampler = DistributedBatchSampler(dataset, - batch_size=batch_size, - shuffle=(mode == 'train')) - dataloader = DataLoader(dataset, - batch_sampler=sampler, - num_workers=config.DATA.NUM_WORKERS) - else: - dataloader = DataLoader(dataset, - batch_size=batch_size, - num_workers=config.DATA.NUM_WORKERS, - shuffle=(mode == 'train')) - return dataloader +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from PIL import Image +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from augment import auto_augment_policy_original +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from transforms import RandomHorizontalFlip +from random_erasing import RandomErasing + + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. + + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + aug_op_list = [] + # STEP1: random crop and resize + aug_op_list.append( + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0), interpolation='bicubic')) + # STEP2: auto_augment or color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # STEP3: other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, + std=config.DATA.IMAGENET_STD)) + # STEP4: random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + # Final: compose transforms and return + transforms_train = transforms.Compose(aug_op_list) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, interpolation='bicubic'), + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. 
see config.py for details + Returns: + dataset: dataset object + """ + + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. + """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/CycleMLP/droppath.py b/image_classification/CycleMLP/droppath.py index 65e0a782..c8fe8048 100644 --- a/image_classification/CycleMLP/droppath.py +++ b/image_classification/CycleMLP/droppath.py @@ -1,49 +1,50 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth -""" - -import paddle -import paddle.nn as nn - -def drop_path(inputs, drop_prob=0., training=False): - """drop path op - Args: - input: tensor with arbitrary shape - drop_prob: float number of drop path probability, default: 0.0 - training: bool, if current mode is training, default: False - Returns: - output: output tensor after drop path - """ - # if prob is 0 or eval mode, return original input - if drop_prob == 0. 
or not training: - return inputs - keep_prob = 1 - drop_prob - shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) - random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) - random_tensor = random_tensor.floor() # mask - output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation - return output - - -class DropPath(nn.Layer): - """DropPath class""" - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, inputs): - return drop_path(inputs, self.drop_prob, self.training) +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + +def drop_path(inputs, drop_prob=0., training=False): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if drop_prob == 0. or not training: + return inputs + keep_prob = 1 - drop_prob + keep_prob = paddle.to_tensor(keep_prob) + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, inputs): + return drop_path(inputs, self.drop_prob, self.training) diff --git a/image_classification/CycleMLP/main_multi_gpu.py b/image_classification/CycleMLP/main_multi_gpu.py index 99df740a..6e9cb2a3 100644 --- a/image_classification/CycleMLP/main_multi_gpu.py +++ b/image_classification/CycleMLP/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
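The droppath.py rewrite above keeps the stochastic-depth behaviour but now wraps keep_prob in a tensor before calling inputs.divide(keep_prob). A minimal standalone sketch of the same op (function and variable names here are illustrative, not part of the patch) shows why that division is there: the surviving samples are rescaled so the expected output matches the input.

import paddle

def drop_path_ref(x, drop_prob=0.1, training=True):
    # reference stochastic-depth op mirroring the patched drop_path above
    if drop_prob == 0. or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)    # one Bernoulli mask entry per sample
    mask = (keep_prob + paddle.rand(shape, dtype=x.dtype)).floor()
    return x.divide(keep_prob) * mask              # rescale kept samples to preserve the expectation

x = paddle.ones([1024, 8])
print(float(drop_path_ref(x, drop_prob=0.25).mean()))   # close to 1.0: zeroed rows are offset by the 1/0.75 rescale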
@@ -29,19 +29,17 @@ from datasets import get_dataset from utils import AverageMeter from utils import WarmupCosineScheduler -from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from cyclemlp import build_cyclemlp as build_model def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('Swin') + parser = argparse.ArgumentParser('CycleMLP') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -105,11 +104,9 @@ def train(dataloader, local_logger: logger for local process/gpu, default: None master_logger: logger for main process, default: None Returns: - train_loss_meter.avg: float, average loss on current process/gpu - train_acc_meter.avg: float, average top1 accuracy on current process/gpu - master_train_loss_meter.avg: float, average loss on all processes/gpus - master_train_acc_meter.avg: float, average top1 accuracy on all processes/gpus - train_time: float, training time + train_loss_meter.avg + train_acc_meter.avg + train_time """ model.train() train_loss_meter = AverageMeter() @@ -132,7 +129,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -358,22 +355,23 @@ def main_worker(*args): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR - config.TRAIN.BASE_LR = linear_scaled_lr 
- config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -425,8 +423,8 @@ def main_worker(*args): weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -447,9 +445,9 @@ def main_worker(*args): f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) diff --git a/image_classification/CycleMLP/main_single_gpu.py b/image_classification/CycleMLP/main_single_gpu.py index 609b3be4..36b55c1c 100644 --- a/image_classification/CycleMLP/main_single_gpu.py +++ b/image_classification/CycleMLP/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
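The learning-rate change above (and the matching single-GPU block below) only rescales the schedule when TRAIN.LINEAR_SCALED_LR is set; with the new default of None the values from config.py are used as-is. The arithmetic being applied is sketched here with a hypothetical batch size, GPU count and reference value; only the three base rates are taken from the CycleMLP config above.

base_lr, warmup_start_lr, end_lr = 1e-3, 5e-7, 5e-6   # CycleMLP defaults from config.py above
batch_size, world_size, accum_iter = 128, 8, 1        # hypothetical run
linear_scaled_lr = 512                                # hypothetical reference batch size; None disables scaling

if linear_scaled_lr is not None:
    scale = (batch_size * world_size) / linear_scaled_lr
    if accum_iter > 1:
        scale *= accum_iter
    base_lr, warmup_start_lr, end_lr = (v * scale for v in (base_lr, warmup_start_lr, end_lr))

print(base_lr)   # 0.002: an effective batch of 1024 doubles the rate relative to the 512 reference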
@@ -35,13 +34,12 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from cyclemlp import build_cyclemlp as build_model def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('Swin') + parser = argparse.ArgumentParser('CycleMLP') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -126,7 +125,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -269,19 +268,23 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -291,8 +294,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, 
T_max=config.TRAIN.NUM_EPOCHS, @@ -330,10 +332,7 @@ def main(): beta2=config.TRAIN.OPTIMIZER.BETAS[1], weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, - grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), - ) + grad_clip=clip) else: logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -348,11 +347,11 @@ def main(): logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") diff --git a/image_classification/CycleMLP/transforms.py b/image_classification/CycleMLP/transforms.py index 676fe1ff..5a046912 100644 --- a/image_classification/CycleMLP/transforms.py +++ b/image_classification/CycleMLP/transforms.py @@ -1,3 +1,4 @@ +import random import paddle import paddle.nn import paddle.vision.transforms as T diff --git a/image_classification/CycleMLP/utils.py b/image_classification/CycleMLP/utils.py index f5bdb636..44800527 100644 --- a/image_classification/CycleMLP/utils.py +++ b/image_classification/CycleMLP/utils.py @@ -1,120 +1,120 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""utils for ViT - -Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training -and WarmupCosineScheduler for training - -""" - -import math -from paddle.optimizer.lr import LRScheduler - - -class AverageMeter(): - """ Meter for monitoring losses""" - def __init__(self): - self.avg = 0 - self.sum = 0 - self.cnt = 0 - self.reset() - - def reset(self): - """reset all values to zeros""" - self.avg = 0 - self.sum = 0 - self.cnt = 0 - - def update(self, val, n=1): - """update avg by val and n, where val is the avg of n values""" - self.sum += val * n - self.cnt += n - self.avg = self.sum / self.cnt - - - -def get_exclude_from_weight_decay_fn(exclude_list=[]): - """ Set params with no weight decay during the training - - For certain params, e.g., positional encoding in ViT, weight decay - may not needed during the learning, this method is used to find - these params. - - Args: - exclude_list: a list of params names which need to exclude - from weight decay. 
- Returns: - exclude_from_weight_decay_fn: a function returns True if param - will be excluded from weight decay - """ - if len(exclude_list) == 0: - exclude_from_weight_decay_fn = None - else: - def exclude_fn(param): - for name in exclude_list: - if param.endswith(name): - return False - return True - exclude_from_weight_decay_fn = exclude_fn - return exclude_from_weight_decay_fn - - -class WarmupCosineScheduler(LRScheduler): - """Warmup Cosine Scheduler - - First apply linear warmup, then apply cosine decay schedule. - Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" - Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining - "total_epochs - warmup_epochs" - - Attributes: - learning_rate: the starting learning rate (without warmup), not used here! - warmup_start_lr: warmup starting learning rate - start_lr: the starting learning rate (without warmup) - end_lr: the ending learning rate after whole loop - warmup_epochs: # of epochs for warmup - total_epochs: # of total epochs (include warmup) - """ - def __init__(self, - learning_rate, - warmup_start_lr, - start_lr, - end_lr, - warmup_epochs, - total_epochs, - cycles=0.5, - last_epoch=-1, - verbose=False): - """init WarmupCosineScheduler """ - self.warmup_epochs = warmup_epochs - self.total_epochs = total_epochs - self.warmup_start_lr = warmup_start_lr - self.start_lr = start_lr - self.end_lr = end_lr - self.cycles = cycles - super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) - - def get_lr(self): - """ return lr value """ - if self.last_epoch < self.warmup_epochs: - val = (self.start_lr - self.warmup_start_lr) * float( - self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr - return val - - progress = float(self.last_epoch - self.warmup_epochs) / float( - max(1, self.total_epochs - self.warmup_epochs)) - val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) - val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) - return val +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""utils for ViT + +Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training +and WarmupCosineScheduler for training + +""" + +import math +from paddle.optimizer.lr import LRScheduler + + +class AverageMeter(): + """ Meter for monitoring losses""" + def __init__(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + self.reset() + + def reset(self): + """reset all values to zeros""" + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + """update avg by val and n, where val is the avg of n values""" + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + + +def get_exclude_from_weight_decay_fn(exclude_list=[]): + """ Set params with no weight decay during the training + + For certain params, e.g., positional encoding in ViT, weight decay + may not needed during the learning, this method is used to find + these params. + + Args: + exclude_list: a list of params names which need to exclude + from weight decay. + Returns: + exclude_from_weight_decay_fn: a function returns True if param + will be excluded from weight decay + """ + if len(exclude_list) == 0: + exclude_from_weight_decay_fn = None + else: + def exclude_fn(param): + for name in exclude_list: + if param.endswith(name): + return False + return True + exclude_from_weight_decay_fn = exclude_fn + return exclude_from_weight_decay_fn + + +class WarmupCosineScheduler(LRScheduler): + """Warmup Cosine Scheduler + + First apply linear warmup, then apply cosine decay schedule. + Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs" + Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining + "total_epochs - warmup_epochs" + + Attributes: + learning_rate: the starting learning rate (without warmup), not used here! + warmup_start_lr: warmup starting learning rate + start_lr: the starting learning rate (without warmup) + end_lr: the ending learning rate after whole loop + warmup_epochs: # of epochs for warmup + total_epochs: # of total epochs (include warmup) + """ + def __init__(self, + learning_rate, + warmup_start_lr, + start_lr, + end_lr, + warmup_epochs, + total_epochs, + cycles=0.5, + last_epoch=-1, + verbose=False): + """init WarmupCosineScheduler """ + self.warmup_epochs = warmup_epochs + self.total_epochs = total_epochs + self.warmup_start_lr = warmup_start_lr + self.start_lr = start_lr + self.end_lr = end_lr + self.cycles = cycles + super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + """ return lr value """ + if self.last_epoch < self.warmup_epochs: + val = (self.start_lr - self.warmup_start_lr) * float( + self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr + return val + + progress = float(self.last_epoch - self.warmup_epochs) / float( + max(1, self.total_epochs - self.warmup_epochs)) + val = max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr) + return val diff --git a/image_classification/DeiT/augment.py b/image_classification/DeiT/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/DeiT/augment.py +++ b/image_classification/DeiT/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/DeiT/config.py b/image_classification/DeiT/config.py index 4bc970a7..4b023f60 100644 --- a/image_classification/DeiT/config.py +++ b/image_classification/DeiT/config.py @@ -63,15 +63,16 @@ _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 _C.TRAIN.NUM_EPOCHS = 300 -_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WARMUP_EPOCHS = 5 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.BASE_LR = 0.0005 _C.TRAIN.WARMUP_START_LR = 1e-6 _C.TRAIN.END_LR = 1e-5 _C.TRAIN.GRAD_CLIP = None -_C.TRAIN.ACCUM_ITER = 2 +_C.TRAIN.ACCUM_ITER = 1 _C.TRAIN.MODEL_EMA = True _C.TRAIN.MODEL_EMA_DECAY = 0.99992 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -86,20 +87,21 @@ _C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 # train augmentation -_C.TRAIN.MIXUP_ALPHA = 0.8 -_C.TRAIN.CUTMIX_ALPHA = 1.0 -_C.TRAIN.CUTMIX_MINMAX = None -_C.TRAIN.MIXUP_PROB = 1.0 -_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 -_C.TRAIN.MIXUP_MODE = 'batch' +_C.TRAIN.MIXUP_ALPHA = 0.8 # mixup alpha, enabled if >0 +_C.TRAIN.CUTMIX_ALPHA = 1.0 # cutmix alpha, enabled if >0 +_C.TRAIN.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha +_C.TRAIN.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled +_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled +_C.TRAIN.MIXUP_MODE = 'batch' # how to apply mixup/cutmix params, per 'batch', 'pair' or 'elem' _C.TRAIN.SMOOTHING = 0.1 -_C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.COLOR_JITTER = 0.4 # color jitter factor +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = True -_C.TRAIN.RANDOM_ERASE_PROB = 0.25 -_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' -_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 # random erase prob +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' # random erase mode +_C.TRAIN.RANDOM_ERASE_COUNT = 1 # random erase count _C.TRAIN.RANDOM_ERASE_SPLIT = False _C.TRAIN.DISTILLATION_TYPE = 'hard' # hard, soft, none @@ -107,22 +109,6 @@ _C.TRAIN.DISTILLATION_TAU = 1.0 _C.TRAIN.TEACHER_MODEL = './regnety_160' # no ext is needed -_C.TRAIN.MODEL_EMA = True -_C.TRAIN.MODEL_EMA_DECAY = 0.99996 - -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of 
switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' # misc _C.SAVE = "./output" _C.TAG = "default" @@ -185,8 +171,6 @@ def update_config(config, args): config.AMP = False else: config.AMP = True - if args.teacher_model: - config.TRAIN.TEACHER_MODEL = args.teacher_model #config.freeze() return config diff --git a/image_classification/DeiT/configs/deit_base_patch16_224.yaml b/image_classification/DeiT/configs/deit_base_patch16_224.yaml index dd0f608d..28220114 100644 --- a/image_classification/DeiT/configs/deit_base_patch16_224.yaml +++ b/image_classification/DeiT/configs/deit_base_patch16_224.yaml @@ -14,10 +14,11 @@ MODEL: TRAIN: NUM_EPOCHS: 300 WARMUP_EPOCHS: 5 - WEIGHT_DECAY: 0.3 - BASE_LR: 0.003 + WEIGHT_DECAY: 0.05 + BASE_LR: 0.0005 WARMUP_START_LR: 1e-6 END_LR: 5e-4 ACCUM_ITER: 2 + LINEAR_SCALED_LR: 512 diff --git a/image_classification/DeiT/configs/deit_small_patch16_224.yaml b/image_classification/DeiT/configs/deit_small_patch16_224.yaml index c9a4745d..8aa973c8 100644 --- a/image_classification/DeiT/configs/deit_small_patch16_224.yaml +++ b/image_classification/DeiT/configs/deit_small_patch16_224.yaml @@ -14,10 +14,11 @@ MODEL: TRAIN: NUM_EPOCHS: 300 WARMUP_EPOCHS: 5 - WEIGHT_DECAY: 0.3 - BASE_LR: 0.003 + WEIGHT_DECAY: 0.05 + BASE_LR: 0.0005 WARMUP_START_LR: 1e-6 END_LR: 5e-4 ACCUM_ITER: 2 + LINEAR_SCALED_LR: 512 diff --git a/image_classification/DeiT/configs/deit_tiny_patch16_224.yaml b/image_classification/DeiT/configs/deit_tiny_patch16_224.yaml index 2d862360..272d33b6 100644 --- a/image_classification/DeiT/configs/deit_tiny_patch16_224.yaml +++ b/image_classification/DeiT/configs/deit_tiny_patch16_224.yaml @@ -19,5 +19,6 @@ TRAIN: WARMUP_START_LR: 1e-6 END_LR: 1e-5 ACCUM_ITER: 1 + LINEAR_SCALED_LR: 512 diff --git a/image_classification/DeiT/datasets.py b/image_classification/DeiT/datasets.py index d40efa77..984e1fcf 100644 --- a/image_classification/DeiT/datasets.py +++ b/image_classification/DeiT/datasets.py @@ -20,12 +20,20 @@ import os import math from PIL import Image -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from transforms import RandomHorizontalFlip from random_erasing import RandomErasing + class ImageNet2012Dataset(Dataset): """Build ImageNet2012 dataset @@ -93,9 +101,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/DeiT/deit.py b/image_classification/DeiT/deit.py index 2b0bb81e..1508f62a 100644 --- a/image_classification/DeiT/deit.py +++ 
b/image_classification/DeiT/deit.py @@ -101,8 +101,8 @@ def __init__(self, in_features, hidden_features, dropout=0.): self.dropout = nn.Dropout(dropout) def _init_weights(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierUniform()) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(std=1e-6)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -140,12 +140,25 @@ def __init__(self, self.dim_head = dim // num_heads self.scale = qk_scale or self.dim_head ** -0.5 - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + w_attr_1, b_attr_1 = self._init_weights() + self.qkv = nn.Linear(dim, + dim * 3, + weight_attr=w_attr_1, + bias_attr=b_attr_1 if qkv_bias else False) self.attn_dropout = nn.Dropout(attention_dropout) self.softmax = nn.Softmax(axis=-1) - self.proj = nn.Linear(dim, dim) + w_attr_2, b_attr_2 = self._init_weights() + self.proj = nn.Linear(dim, + dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2) self.proj_dropout = nn.Dropout(dropout) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def transpose_multihead(self, x): new_shape = x.shape[:-1] + [self.num_heads, self.dim_head] x = x.reshape(new_shape) @@ -196,17 +209,30 @@ def __init__(self, attention_dropout=0, droppath=0.): super().__init__() - self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) + w_attr_1, b_attr_1 = self._init_weights() + self.norm1 = nn.LayerNorm(dim, + weight_attr=w_attr_1, + bias_attr=b_attr_1, + epsilon=1e-6) self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attention_dropout=attention_dropout) self.drop_path = DropPath(droppath) if droppath > 0. 
else Identity() - self.norm2 = nn.LayerNorm(dim, epsilon=1e-6) + w_attr_2, b_attr_2 = self._init_weights() + self.norm2 = nn.LayerNorm(dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2, + epsilon=1e-6) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio)) + def _init_weights(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + def forward(self, x): h = x x = self.norm1(x) @@ -268,10 +294,32 @@ def __init__(self, qkv_bias=qkv_bias, attention_dropout=attention_dropout, droppath=droppath)) for _ in range(depth)]) - self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + w_attr_1, b_attr_1 = self._init_weights_norm() + self.norm = nn.LayerNorm(embed_dim, + weight_attr=w_attr_1, + bias_attr=b_attr_1, + epsilon=1e-6) + + w_attr_2, b_attr_2 = self._init_weights_linear() + self.head = nn.Linear(embed_dim, + num_classes, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + w_attr_3, b_attr_3 = self._init_weights_linear() + self.head_distill = nn.Linear(embed_dim, + num_classes, + weight_attr=w_attr_3, + bias_attr=b_attr_3) + + def _init_weights_linear(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr - self.head = nn.Linear(embed_dim, num_classes) - self.head_distill = nn.Linear(embed_dim, num_classes) + def _init_weights_norm(self): + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) + return weight_attr, bias_attr def forward_features(self, x): x = self.patch_embed(x) diff --git a/image_classification/DeiT/main_single_gpu.py b/image_classification/DeiT/main_single_gpu.py index 029ba8a1..6db48969 100644 --- a/image_classification/DeiT/main_single_gpu.py +++ b/image_classification/DeiT/main_single_gpu.py @@ -306,19 +306,23 @@ def main(): # STEP 6: Define optimizer and lr_scheduler # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = 
linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -388,12 +392,12 @@ def main(): assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") # load ema model - if model_ema is not None and os.path.isfidile(config.MODEL.RESUME + '-EMA.pdparams'): + if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') model_ema.module.set_state_dict(model_ema_state) logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') diff --git a/image_classification/DeiT/transforms.py b/image_classification/DeiT/transforms.py new file mode 100644 index 00000000..5a046912 --- /dev/null +++ b/image_classification/DeiT/transforms.py @@ -0,0 +1,14 @@ +import random +import paddle +import paddle.nn +import paddle.vision.transforms as T + + +class RandomHorizontalFlip(): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, image): + if random.random() < self.p: + return T.hflip(image) + return image diff --git a/image_classification/FF_Only/augment.py b/image_classification/FF_Only/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/FF_Only/augment.py +++ b/image_classification/FF_Only/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/FF_Only/config.py b/image_classification/FF_Only/config.py index 7cf44397..47ceef42 100644 --- a/image_classification/FF_Only/config.py +++ b/image_classification/FF_Only/config.py @@ -46,6 +46,9 @@ _C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None _C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 # transformer settings _C.MODEL.MIXER = CN() @@ -62,6 +65,7 @@ _C.TRAIN.END_LR = 5e-6 _C.TRAIN.GRAD_CLIP = 5.0 _C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -85,33 +89,20 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 
-_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" -_C.SAVE_FREQ = 1 # freq to save chpt +_C.SAVE_FREQ = 20 # freq to save chpt _C.REPORT_FREQ = 50 # freq to logging info -_C.VALIDATE_FREQ = 10 # freq to do validation +_C.VALIDATE_FREQ = 20 # freq to do validation _C.SEED = 0 _C.EVAL = False # run evaluation only _C.AMP = False @@ -148,6 +139,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/FF_Only/datasets.py b/image_classification/FF_Only/datasets.py index 064faebe..304df9a3 100644 --- a/image_classification/FF_Only/datasets.py +++ b/image_classification/FF_Only/datasets.py @@ -19,6 +19,7 @@ import os import math +from PIL import Image from paddle.io import Dataset from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -27,9 +28,12 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing + class ImageNet2012Dataset(Dataset): """Build ImageNet2012 dataset @@ -98,9 +102,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/FF_Only/droppath.py b/image_classification/FF_Only/droppath.py index d7ecf00c..c8fe8048 100644 --- a/image_classification/FF_Only/droppath.py +++ b/image_classification/FF_Only/droppath.py @@ -16,10 +16,29 @@ Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth """ -import numpy as np import paddle import paddle.nn as nn +def drop_path(inputs, drop_prob=0., training=False): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if drop_prob == 0. 
or not training: + return inputs + keep_prob = 1 - drop_prob + keep_prob = paddle.to_tensor(keep_prob) + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + class DropPath(nn.Layer): """DropPath class""" @@ -27,35 +46,5 @@ def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob - def drop_path(self, inputs): - """drop path op - Args: - input: tensor with arbitrary shape - drop_prob: float number of drop path probability, default: 0.0 - training: bool, if current mode is training, default: False - Returns: - output: output tensor after drop path - """ - # if prob is 0 or eval mode, return original input - if self.drop_prob == 0. or not self.training: - return inputs - keep_prob = 1 - self.drop_prob - keep_prob = paddle.to_tensor(keep_prob, dtype='float32') - shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) - random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) - random_tensor = random_tensor.floor() # mask - output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation - return output - def forward(self, inputs): - return self.drop_path(inputs) - - -#def main(): -# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') -# dp = DropPath(0.5) -# out = dp(tmp) -# print(out) -# -#if __name__ == "__main__": -# main() + return drop_path(inputs, self.drop_prob, self.training) diff --git a/image_classification/FF_Only/main_multi_gpu.py b/image_classification/FF_Only/main_multi_gpu.py index 25252ad5..489688a3 100644 --- a/image_classification/FF_Only/main_multi_gpu.py +++ b/image_classification/FF_Only/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""FF_only training/validation using multiple GPU """ +"""FFOnly training/validation using multiple GPU """ import sys import os @@ -29,19 +29,17 @@ from datasets import get_dataset from utils import AverageMeter from utils import WarmupCosineScheduler -from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from ffonly import build_ffonly as build_model def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('Swin') + parser = argparse.ArgumentParser('FFOnly') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -105,11 +104,9 @@ def train(dataloader, local_logger: logger for local process/gpu, default: None master_logger: logger for main process, default: None Returns: - train_loss_meter.avg: float, average loss on current process/gpu - train_acc_meter.avg: float, average top1 accuracy on current process/gpu - master_train_loss_meter.avg: float, average loss on all processes/gpus - master_train_acc_meter.avg: float, average top1 accuracy on all processes/gpus - train_time: float, training time + train_loss_meter.avg + train_acc_meter.avg + train_time """ model.train() train_loss_meter = AverageMeter() @@ -132,7 +129,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -358,22 +355,23 @@ def main_worker(*args): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + 
config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -425,8 +423,8 @@ def main_worker(*args): weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -447,9 +445,9 @@ def main_worker(*args): f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) diff --git a/image_classification/FF_Only/main_single_gpu.py b/image_classification/FF_Only/main_single_gpu.py index d904ca23..4a9cbd27 100644 --- a/image_classification/FF_Only/main_single_gpu.py +++ b/image_classification/FF_Only/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""FF_only training/validation using single GPU """ +"""FFOnly training/validation using single GPU """ import sys import os @@ -35,13 +34,12 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from ffonly import build_ffonly as build_model def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('Swin') + parser = argparse.ArgumentParser('FFOnly') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -126,7 +125,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -269,19 +268,23 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -291,8 +294,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif 
config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, T_max=config.TRAIN.NUM_EPOCHS, @@ -330,10 +332,7 @@ def main(): beta2=config.TRAIN.OPTIMIZER.BETAS[1], weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, - grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), - ) + grad_clip=clip) else: logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -348,11 +347,11 @@ def main(): logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") diff --git a/image_classification/FF_Only/transforms.py b/image_classification/FF_Only/transforms.py index 676fe1ff..5a046912 100644 --- a/image_classification/FF_Only/transforms.py +++ b/image_classification/FF_Only/transforms.py @@ -1,3 +1,4 @@ +import random import paddle import paddle.nn import paddle.vision.transforms as T diff --git a/image_classification/FF_Only/utils.py b/image_classification/FF_Only/utils.py index da7c5169..44800527 100644 --- a/image_classification/FF_Only/utils.py +++ b/image_classification/FF_Only/utils.py @@ -21,30 +21,6 @@ import math from paddle.optimizer.lr import LRScheduler -import numpy as np - - -class MyPrint(): - """" Print tensor and its shape, used for debug """ - def __init__(self): - self.cnt = 0 - def myprint(self, prefix, var, cnt=None, save=None): - """print tensor and its shape, optionly save to npy - Args: - prefix: str, print info in 1st and last lines - var: Tensor, tensor needs to print - cnt: int, if self.cnt is exceed this value, print will stop - save: str, file name (should end with .npy) to save the tensor, if None no save - """ - if cnt is None or self.cnt < cnt: - print(f'------------ {prefix} ---------------') - print(var.shape, var) - print(f'------------ END {prefix} ---------------') - if save is not None: - var = var.numpy() - with open(save,'wb') as ofile: - np.save(ofile, var) - self.cnt += 1 class AverageMeter(): @@ -53,7 +29,6 @@ def __init__(self): self.avg = 0 self.sum = 0 self.cnt = 0 - self.val = 0 self.reset() def reset(self): @@ -61,11 +36,9 @@ def reset(self): self.avg = 0 self.sum = 0 self.cnt = 0 - self.val = 0 def update(self, val, n=1): """update avg by val and n, where val is the avg of n values""" - self.val = val self.sum += val * n self.cnt += n self.avg = self.sum / self.cnt diff --git a/image_classification/Focal_Transformer/augment.py b/image_classification/Focal_Transformer/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/Focal_Transformer/augment.py +++ b/image_classification/Focal_Transformer/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def 
rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/Focal_Transformer/config.py b/image_classification/Focal_Transformer/config.py index 16b5e920..2bb5e081 100644 --- a/image_classification/Focal_Transformer/config.py +++ b/image_classification/Focal_Transformer/config.py @@ -96,9 +96,9 @@ _C.TRAIN.BASE_LR = 5e-4 _C.TRAIN.WARMUP_START_LR = 5e-7 _C.TRAIN.END_LR = 5e-6 -_C.TRAIN.GRAD_CLIP = 5.0 # Clip gradient norm -_C.TRAIN.ACCUM_ITER = 1 # Gradient accumulation steps - +_C.TRAIN.GRAD_CLIP = 5.0 +_C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None # LR scheduler _C.TRAIN.LR_SCHEDULER = CN() @@ -128,7 +128,8 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' # How to apply mixup/cutmix params. Per "batch", "pair", or "elem" @@ -179,8 +180,8 @@ def update_config(config, args): Return: config: updated config """ - _update_config_from_file(config, args.cfg) - + if args.cfg: + _update_config_from_file(config, args.cfg) config.defrost() # merge from specific arguments if args.dataset: @@ -193,6 +194,8 @@ def update_config(config, args): config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path + if args.output is not None: + config.SAVE = args.output if args.ngpus: config.NGPUS = args.ngpus if args.eval: @@ -234,4 +237,4 @@ def get_config(cfg_file=None): config = _C.clone() if cfg_file: _update_config_from_file(config, cfg_file) - return config \ No newline at end of file + return config diff --git a/image_classification/Focal_Transformer/datasets.py b/image_classification/Focal_Transformer/datasets.py index 93625837..cc793941 100644 --- a/image_classification/Focal_Transformer/datasets.py +++ b/image_classification/Focal_Transformer/datasets.py @@ -25,7 +25,9 @@ from paddle.vision import datasets from paddle.vision import image_load from augment import auto_augment_policy_original -from augment import AutoAugment +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -94,6 +96,10 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 aug_op_list.append(transforms.ColorJitter(*jitter)) diff --git a/image_classification/Focal_Transformer/main_multi_gpu.py b/image_classification/Focal_Transformer/main_multi_gpu.py index 23edfab2..1429c427 100644 --- a/image_classification/Focal_Transformer/main_multi_gpu.py +++ b/image_classification/Focal_Transformer/main_multi_gpu.py @@ -35,7 +35,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from focal_transformer import 
build_focal as build_model @@ -142,7 +141,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -368,22 +367,23 @@ def main_worker(*args): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -457,9 +457,9 @@ def main_worker(*args): f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) @@ -590,4 +590,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/image_classification/Focal_Transformer/main_single_gpu.py b/image_classification/Focal_Transformer/main_single_gpu.py index 31be3bed..10d47f16 100644 --- a/image_classification/Focal_Transformer/main_single_gpu.py +++ b/image_classification/Focal_Transformer/main_single_gpu.py @@ -34,7 +34,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from focal_transformer 
import build_focal as build_model @@ -135,7 +134,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -278,19 +277,23 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -300,8 +303,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, T_max=config.TRAIN.NUM_EPOCHS, @@ -357,9 +359,9 @@ def main(): logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) @@ -430,4 +432,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/image_classification/Focal_Transformer/run_train_multi.sh b/image_classification/Focal_Transformer/run_train_multi.sh index 41599866..a65a1943 100644 --- 
a/image_classification/Focal_Transformer/run_train_multi.sh +++ b/image_classification/Focal_Transformer/run_train_multi.sh @@ -1,9 +1,9 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python main_single_gpu.py \ +python main_multi_gpu.py \ -cfg='./configs/focal_tiny_patch4_window7_224.yaml' \ -dataset='imagenet2012' \ -num_classes=1000 \ -batch_size=4 \ -image_size=224 \ -data_path='/dataset/imagenet' \ - -output='./output' \ No newline at end of file + -output='./output' diff --git a/image_classification/HVT/augment.py b/image_classification/HVT/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/HVT/augment.py +++ b/image_classification/HVT/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/HVT/config.py b/image_classification/HVT/config.py index 56d622f2..9c9dacf9 100644 --- a/image_classification/HVT/config.py +++ b/image_classification/HVT/config.py @@ -67,14 +67,14 @@ _C.TRAIN.LAST_EPOCH = 0 _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 20 -_C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.WEIGHT_DECAY = 0.025 +_C.TRAIN.BASE_LR = 0.0005 _C.TRAIN.WARMUP_START_LR = 1e-6 _C.TRAIN.END_LR = 1e-5 _C.TRAIN.GRAD_CLIP = None -_C.TRAIN.ACCUM_ITER = 2 -_C.TRAIN.MODEL_EMA = True -_C.TRAIN.MODEL_EMA_DECAY = 0.99992 +_C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.MODEL_EMA = False +_C.TRAIN.MODEL_EMA_DECAY = 0.99996 _C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() @@ -100,33 +100,13 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 _C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -_C.TRAIN.DISTILLATION_TYPE = 'hard' # hard, soft, none -_C.TRAIN.DISTILLATION_ALPHA = 0.5 -_C.TRAIN.DISTILLATION_TAU = 1.0 -_C.TRAIN.TEACHER_MODEL = './regnety_160' # no ext is needed - -_C.TRAIN.MODEL_EMA = True -_C.TRAIN.MODEL_EMA_DECAY = 0.99996 - -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' # misc _C.SAVE = "./output" _C.TAG = "default" @@ -169,8 +149,12 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path + if args.output is not None: + config.SAVE = args.output if args.ngpus: config.NGPUS = args.ngpus if args.eval: @@ -188,6 +172,7 @@ def 
update_config(config, args): else: config.AMP = True + #config.freeze() return config diff --git a/image_classification/HVT/datasets.py b/image_classification/HVT/datasets.py index 8ac5d364..18448892 100644 --- a/image_classification/HVT/datasets.py +++ b/image_classification/HVT/datasets.py @@ -25,8 +25,11 @@ from paddle.io import DistributedBatchSampler from paddle.vision import transforms from paddle.vision import datasets +from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from random_erasing import RandomErasing @@ -68,7 +71,7 @@ def __len__(self): return len(self.label_list) def __getitem__(self, index): - data = Image.open(self.img_path_list[index]).convert('RGB') + data = image_load(self.img_path_list[index]).convert('RGB') data = self.transform(data) label = self.label_list[index] @@ -89,7 +92,6 @@ def get_train_transforms(config): """ aug_op_list = [] # STEP1: random crop and resize - aug_op_list.append(RandomHorizontalFlip(0.5)) aug_op_list.append( transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), scale=(0.05, 1.0), interpolation='bicubic')) @@ -98,6 +100,10 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 aug_op_list.append(transforms.ColorJitter(*jitter)) diff --git a/image_classification/HVT/main_multi_gpu.py b/image_classification/HVT/main_multi_gpu.py index 84a0beb5..e2c04793 100644 --- a/image_classification/HVT/main_multi_gpu.py +++ b/image_classification/HVT/main_multi_gpu.py @@ -47,7 +47,9 @@ def get_arguments(): parser.add_argument('-batch_size', type=int, default=None) parser.add_argument('-image_size', type=int, default=None) parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -154,7 +156,7 @@ def train(dataloader, model_ema.update(model) # average of output and kd_output, like model eval mode - pred = F.softmax((output[0] + output[1]) / 2) + pred = F.softmax(output) if mixup_fn: acc = paddle.metric.accuracy(pred, label_orig) else: diff --git a/image_classification/HVT/main_single_gpu.py b/image_classification/HVT/main_single_gpu.py index 8c79f823..02b91ddd 100644 --- a/image_classification/HVT/main_single_gpu.py +++ b/image_classification/HVT/main_single_gpu.py @@ -18,6 +18,7 @@ import os import time import logging +import copy import argparse import random import numpy as np @@ -45,7 +46,9 @@ def get_arguments(): parser.add_argument('-batch_size', type=int, default=None) parser.add_argument('-image_size', type=int, default=None) parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', 
type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -147,7 +150,7 @@ def train(dataloader, model_ema.update(model) # average of output and kd_output, like model eval mode - pred = F.softmax((output[0] + output[1]) / 2) + pred = F.softmax(output) if mixup_fn: acc = paddle.metric.accuracy(pred, label_orig) else: @@ -278,19 +281,23 @@ def main(): # STEP 5: Define optimizer and lr_scheduler # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -358,12 +365,12 @@ def main(): assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") # load ema model - if model_ema is not None and os.path.isfidile(config.MODEL.RESUME + '-EMA.pdparams'): + if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') model_ema.module.set_state_dict(model_ema_state) logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') diff --git a/image_classification/HaloNet/augment.py b/image_classification/HaloNet/augment.py new file mode 100644 index 00000000..7a7f081c --- /dev/null +++ b/image_classification/HaloNet/augment.py @@ -0,0 +1,285 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Augmentation""" +""" Rand Augmentation """ +# reference: RandAugment: Practical automated data augmentation with a reduced search space +# https://arxiv.org/abs/1909.13719 + +""" Auto Augmentation """ +# reference: AutoAugment: Learning Augmentation Policies from Data +# https://arxiv.org/abs/1805.09501 + +import random +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def auto_augment_policy_original(): + """25 types of augment policies in original paper""" + policy = [ + [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + policy = [[SubPolicy(*args) for args in subpolicy] for subpolicy in policy] + return policy + + +def rand_augment_policy_original(magnitude_idx=9): + """ + 14 types of augment policies in original paper + Args: + magnitude_idx: M + """ + policy = [ + ('Posterize', 1, magnitude_idx), ('Rotate', 1, magnitude_idx), + ('Solarize', 1, magnitude_idx), ('AutoContrast', 1, magnitude_idx), + ('Equalize', 1, magnitude_idx), ('Contrast', 1, magnitude_idx), + ('Color', 1, magnitude_idx), ('Invert', 1, magnitude_idx), + ('Sharpness', 1, magnitude_idx), ('Brightness', 1, magnitude_idx), + ('ShearX', 1, magnitude_idx), ('ShearY', 1, magnitude_idx), + ('TranslateX', 1, magnitude_idx), ('TranslateY', 1, magnitude_idx), + ] + policy = [SubPolicy(*args) for args in policy] + return policy + + +class AutoAugment(): + """Auto Augment + Randomly choose a tuple of augment ops from a list of policy + Then apply the tuple of augment ops to input image + + Examples: + policy = auto_augment_policy_original() + augment = AutoAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy): + self.policy = policy + + def __call__(self, image, policy_idx=None): + if policy_idx is None: + policy_idx = random.randint(0, len(self.policy) - 1) + + sub_policy = self.policy[policy_idx] + for op in sub_policy: + image = op(image) + return image + + +class RandAugment(): + """Rand Augment + Randomly choose N augment ops from a list of K 
policies + Then apply the N ops to input image + + Examples: + policy = rand_augment_policy_original(magnitude_idx) + augment = RandAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy, num_layers=2): + """ + Args: + policy: list of SubPolicy + num_layers: int + """ + self.policy = policy + self.num_layers = num_layers + + def __call__(self, image): + selected_idx = np.random.choice(len(self.policy), self.num_layers) + + for policy_idx in selected_idx: + sub_policy = self.policy[policy_idx] + image = sub_policy(image) + return image + + +class SubPolicy: + """Subpolicy + Read augment name and magnitude, apply augment with probability + Args: + op_name: str, augment operation name + prob: float, if prob > random prob, apply augment + magnitude_idx: int, index of magnitude in preset magnitude ranges + """ + + def __init__(self, op_name, prob, magnitude_idx): + # ranges of operations' magnitude + ranges = { + 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'TranslateX': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'TranslateY': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'Rotate': np.linspace(0, 30, 10), # [-30, 30] (by random negative) + 'Color': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), # [0, 4] + 'Solarize': np.linspace(256, 0, 10), # [0, 256] + 'Contrast': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Sharpness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Brightness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'AutoContrast': [0] * 10, # no range + 'Equalize': [0] * 10, # no range + 'Invert': [0] * 10, # no range + } + + # augmentation operations + # Lambda is not pickleable for DDP + # image_ops = { + # 'ShearX': lambda image, magnitude: shear_x(image, magnitude), + # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), + # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), + # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), + # 'Rotate': lambda image, magnitude: rotate(image, magnitude), + # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), + # 'Invert': lambda image, magnitude: invert(image, magnitude), + # 'Equalize': lambda image, magnitude: equalize(image, magnitude), + # 'Solarize': lambda image, magnitude: solarize(image, magnitude), + # 'Posterize': lambda image, magnitude: posterize(image, magnitude), + # 'Contrast': lambda image, magnitude: contrast(image, magnitude), + # 'Color': lambda image, magnitude: color(image, magnitude), + # 'Brightness': lambda image, magnitude: brightness(image, magnitude), + # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), + # } + image_ops = { + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_relative, + 'TranslateY': translate_y_relative, + 'Rotate': rotate, + 'AutoContrast': auto_contrast, + 'Invert': invert, + 'Equalize': equalize, + 'Solarize': solarize, + 'Posterize': posterize, + 'Contrast': contrast, + 'Color': color, + 'Brightness': brightness, + 'Sharpness': sharpness, + } + + self.prob = prob + self.magnitude = ranges[op_name][magnitude_idx] + self.op = image_ops[op_name] + + def __call__(self, image): + if self.prob > random.random(): + image = self.op(image, self.magnitude) + return image + + +# PIL Image 
transforms +# https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.transform +def shear_x(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), fillcolor=fillcolor) + + +def shear_y(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), fillcolor=fillcolor) + + +def translate_x_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), fillcolor=fillcolor) + + +def translate_x_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, magnitude, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude), fillcolor=fillcolor) + + +def rotate(image, magnitude): + rot = image.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new('RGBA', rot.size, (128,) * 4), + rot).convert(image.mode) + + +def auto_contrast(image, magnitude=None): + return ImageOps.autocontrast(image) + + +def invert(image, magnitude=None): + return ImageOps.invert(image) + + +def equalize(image, magnitude=None): + return ImageOps.equalize(image) + + +def solarize(image, magnitude): + return ImageOps.solarize(image, magnitude) + + +def posterize(image, magnitude): + return ImageOps.posterize(image, magnitude) + + +def contrast(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Contrast(image).enhance(1 + magnitude) + + +def color(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Color(image).enhance(1 + magnitude) + + +def brightness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Brightness(image).enhance(1 + magnitude) + + +def sharpness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Sharpness(image).enhance(1 + magnitude) + diff --git a/image_classification/HaloNet/config.py b/image_classification/HaloNet/config.py index 28f0f8e1..f01d9058 100755 --- a/image_classification/HaloNet/config.py +++ b/image_classification/HaloNet/config.py @@ -65,10 +65,10 @@ # training settings _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 -_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.NUM_EPOCHS = 150 _C.TRAIN.WARMUP_EPOCHS = 3 -_C.TRAIN.WEIGHT_DECAY = 0.01 -_C.TRAIN.BASE_LR = 0.002 +_C.TRAIN.WEIGHT_DECAY = 0.00008 +_C.TRAIN.BASE_LR = 0.1 _C.TRAIN.WARMUP_START_LR = 0.0002 _C.TRAIN.END_LR = 0.0002 _C.TRAIN.GRAD_CLIP = None @@ -99,26 +99,14 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False 
#'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = True _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' # misc _C.SAVE = "./output" diff --git a/image_classification/HaloNet/configs/halonet_26t_256.yaml b/image_classification/HaloNet/configs/halonet_26t_256.yaml index 9cd329bd..104afed4 100755 --- a/image_classification/HaloNet/configs/halonet_26t_256.yaml +++ b/image_classification/HaloNet/configs/halonet_26t_256.yaml @@ -4,7 +4,7 @@ DATA: MODEL: TYPE: halo NAME: halonet_26t - PRETRAINED: halonet_26t_256 + #PRETRAINED: halonet_26t_256 ACT: relu BLOCK_SIZE: 8 HALO_SIZE: 2 diff --git a/image_classification/HaloNet/configs/halonet_50ts_256.yaml b/image_classification/HaloNet/configs/halonet_50ts_256.yaml index 96678ccf..acfcbf80 100755 --- a/image_classification/HaloNet/configs/halonet_50ts_256.yaml +++ b/image_classification/HaloNet/configs/halonet_50ts_256.yaml @@ -7,7 +7,7 @@ MODEL: ACT: silu BLOCK_SIZE: 8 HALO_SIZE: 3 - PRETRAINED: halonet_50ts_256 + #PRETRAINED: halonet_50ts_256 STAGE1_BLOCK: ['bottle','bottle','bottle'] STAGE2_BLOCK: ['bottle','bottle','bottle','attn'] STAGE3_BLOCK: ['bottle','attn','bottle','attn','bottle','attn'] diff --git a/image_classification/HaloNet/datasets.py b/image_classification/HaloNet/datasets.py index 761dd61a..1752a66d 100755 --- a/image_classification/HaloNet/datasets.py +++ b/image_classification/HaloNet/datasets.py @@ -19,8 +19,18 @@ import os import math -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from PIL import Image +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from augment import auto_augment_policy_original +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from random_erasing import RandomErasing class ImageNet2012Dataset(Dataset): @@ -81,12 +91,36 @@ def get_train_transforms(config): transforms_train: training transforms """ - transforms_train = transforms.Compose([ + aug_op_list = [] + # STEP1: random crop and resize + aug_op_list.append( transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) + scale=(0.05, 1.0), interpolation='bicubic')) + # STEP2: auto_augment or color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = 
rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # STEP3: other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, + std=config.DATA.IMAGENET_STD)) + # STEP4: random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + # Final: compose transforms and return + transforms_train = transforms.Compose(aug_op_list) return transforms_train @@ -106,7 +140,7 @@ def get_val_transforms(config): scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) transforms_val = transforms.Compose([ - transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.Resize(scale_size, interpolation='bicubic'), transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), transforms.ToTensor(), transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), @@ -124,6 +158,7 @@ def get_dataset(config, mode='train'): Returns: dataset: dataset object """ + assert mode in ['train', 'val'] if config.DATA.DATASET == "cifar10": if mode == 'train': diff --git a/image_classification/HaloNet/losses.py b/image_classification/HaloNet/losses.py index f67780a2..082467a3 100644 --- a/image_classification/HaloNet/losses.py +++ b/image_classification/HaloNet/losses.py @@ -119,3 +119,5 @@ def forward(self, inputs, outputs, targets): loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha return loss + + diff --git a/image_classification/HaloNet/random_erasing.py b/image_classification/HaloNet/random_erasing.py new file mode 100644 index 00000000..31eea465 --- /dev/null +++ b/image_classification/HaloNet/random_erasing.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Random Erasing for image tensor""" + +import random +import math +import paddle + + +def _get_pixels(per_pixel, rand_color, patch_size, dtype="float32"): + if per_pixel: + return paddle.normal(shape=patch_size).astype(dtype) + if rand_color: + return paddle.normal(shape=(patch_size[0], 1, 1)).astype(dtype) + return paddle.zeros((patch_size[0], 1, 1)).astype(dtype) + + +class RandomErasing(object): + """ + Args: + prob: probability of performing random erasing + min_area: Minimum percentage of erased area wrt input image area + max_area: Maximum percentage of erased area wrt input image area + min_aspect: Minimum aspect ratio of earsed area + max_aspect: Maximum aspect ratio of earsed area + mode: pixel color mode, in ['const', 'rand', 'pixel'] + 'const' - erase block is constant valued 0 for all channels + 'rand' - erase block is valued random color (same per-channel) + 'pixel' - erase block is vauled random color per pixel + min_count: Minimum # of ereasing blocks per image. + max_count: Maximum # of ereasing blocks per image. Area per box is scaled by count + per-image count is randomly chosen between min_count to max_count + """ + def __init__(self, prob=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0): + self.prob = prob + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == "rand": + self.rand_color = True + elif mode == "pixel": + self.per_pixel = True + else: + assert not mode or mode == "const" + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.prob: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top+h, left:left+w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + + +#def main(): +# re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='rand') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='const') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='pixel') +# import PIL.Image as Image +# import numpy as np +# paddle.set_device('cpu') +# img = paddle.to_tensor(np.asarray(Image.open('./lenna.png'))).astype('float32') +# img = img / 255.0 +# img = paddle.transpose(img, [2, 0, 1]) +# new_img = re(img) +# new_img = new_img * 255.0 +# new_img = paddle.transpose(new_img, [1, 2, 0]) +# new_img = new_img.cpu().numpy() +# new_img = 
Image.fromarray(new_img.astype('uint8')) +# new_img.save('./res.png') +# +# +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/HaloNet/run_train_multi.sh b/image_classification/HaloNet/run_train_multi.sh index 987ef814..54dfa85c 100755 --- a/image_classification/HaloNet/run_train_multi.sh +++ b/image_classification/HaloNet/run_train_multi.sh @@ -4,4 +4,4 @@ python main_multi_gpu.py \ -dataset='imagenet2012' \ -batch_size=16 \ -data_path='/dataset/imagenet' \ - -amp +# -amp diff --git a/image_classification/MAE/augment.py b/image_classification/MAE/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/MAE/augment.py +++ b/image_classification/MAE/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/MAE/datasets.py b/image_classification/MAE/datasets.py index 0877da54..1d6c17d3 100644 --- a/image_classification/MAE/datasets.py +++ b/image_classification/MAE/datasets.py @@ -19,12 +19,20 @@ import os import math -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from PIL import Image +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from augment import auto_augment_policy_original +from augment import AutoAugment from augment import rand_augment_policy_original from augment import RandAugment from masking_generator import RandomMaskingGenerator - +from transforms import RandomHorizontalFlip +from random_erasing import RandomErasing class ImageNet2012Dataset(Dataset): """Build ImageNet2012 dataset @@ -100,17 +108,34 @@ def get_train_transforms(config): """ aug_op_list = [] - aug_op_list.append(transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0))) - # use RandAug (9, 0.5) only during finetuning - if not config.MODEL.MAE_PRETRAIN: - if config.TRAIN.RAND_AUGMENT: - policy = rand_augment_policy_original(config.TRAIN.RAND_AUGMENT_MAGNITUDE) - rand_augment = RandAugment(policy, config.TRAIN.RAND_AUGMENT_LAYERS) - aug_op_list.append(rand_augment) + # STEP1: random crop and resize + aug_op_list.append( + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0), interpolation='bicubic')) + # STEP2: auto_augment or color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD)) + # STEP4: random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = 
RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + # Final: compose transforms and return transforms_train = transforms.Compose(aug_op_list) if config.MODEL.MAE_PRETRAIN: diff --git a/image_classification/MAE/nohup.out b/image_classification/MAE/nohup.out new file mode 100644 index 00000000..6e00dda7 --- /dev/null +++ b/image_classification/MAE/nohup.out @@ -0,0 +1,9507 @@ +Traceback (most recent call last): + File "main_multi_gpu_pretrain.py", line 24, in + import paddle + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/__init__.py", line 25, in + from .fluid import monkey_patch_variable + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/__init__.py", line 45, in + from . import dataset + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dataset.py", line 19, in + from ..utils import deprecated + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/utils/__init__.py", line 26, in + from . import download # noqa: F401 + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/utils/download.py", line 23, in + import requests + File "/opt/conda/envs/py36/lib/python3.6/site-packages/requests/__init__.py", line 112, in + from . import utils + File "/opt/conda/envs/py36/lib/python3.6/site-packages/requests/utils.py", line 24, in + from . import certs + File "", line 971, in _find_and_load + File "", line 955, in _find_and_load_unlocked + File "", line 665, in _load_unlocked + File "", line 674, in exec_module + File "", line 764, in get_code + File "", line 833, in get_data +KeyboardInterrupt +merging config from ./configs/vit_base_patch16_224_pretrain_dec1.yaml +----- Imagenet2012 image train list len = 1281167 +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:30053', '127.0.0.1:54949', '127.0.0.1:41862', '127.0.0.1:28777', '127.0.0.1:55177', '127.0.0.1:18423', '127.0.0.1:46681'] +I1219 16:59:41.631045 23562 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:30053 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:54949', '127.0.0.1:41862', '127.0.0.1:28777', '127.0.0.1:55177', '127.0.0.1:18423', '127.0.0.1:46681'] +I1219 16:59:44.247634 23580 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:54949 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:41862', '127.0.0.1:28777', '127.0.0.1:55177', '127.0.0.1:18423', '127.0.0.1:46681'] +I1219 16:59:46.636570 23595 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:41862 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:28777', '127.0.0.1:55177', '127.0.0.1:18423', '127.0.0.1:46681'] +I1219 16:59:48.816335 23610 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:28777 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:55177', '127.0.0.1:18423', '127.0.0.1:46681'] +I1219 16:59:51.517431 23627 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:55177 successful. +I1219 16:59:53.801396 23642 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:18423 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:46681'] +I1219 16:59:56.182962 23659 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:46681 successful. 
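Before the pretraining log continues, the train-time pipeline assembled by get_train_transforms in the MAE/datasets.py hunk above can be exercised on its own. The following is a minimal sketch, not the repository's code: the concrete values (224 crop, 0.4 color jitter, the standard ImageNet mean/std) and the file name are placeholder assumptions standing in for the config.DATA.* and config.TRAIN.* fields.

# Minimal sketch of the augmentation pipeline built in get_train_transforms above.
# The numbers (224, 0.4 jitter, ImageNet mean/std) and 'example.jpg' are
# placeholder assumptions for the config values.
from PIL import Image
from paddle.vision import transforms

aug_op_list = [
    transforms.RandomResizedCrop((224, 224), scale=(0.05, 1.0), interpolation='bicubic'),
    transforms.ColorJitter(0.4, 0.4, 0.4),   # used when auto/rand augment are disabled
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]
transform_train = transforms.Compose(aug_op_list)

img = Image.open('example.jpg').convert('RGB')   # any RGB image
x = transform_train(img)                         # CHW float32 tensor, normalized
print(x.shape)                                   # [3, 224, 224]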
+I1219 16:59:56.935767 23580 nccl_context.cc:74] init nccl context nranks: 8 local rank: 2 gpu id: 2 ring id: 0 +I1219 16:59:56.935765 23562 nccl_context.cc:74] init nccl context nranks: 8 local rank: 1 gpu id: 1 ring id: 0 +I1219 16:59:56.935781 23627 nccl_context.cc:74] init nccl context nranks: 8 local rank: 5 gpu id: 5 ring id: 0 +I1219 16:59:56.935775 23595 nccl_context.cc:74] init nccl context nranks: 8 local rank: 3 gpu id: 3 ring id: 0 +I1219 16:59:56.935791 23642 nccl_context.cc:74] init nccl context nranks: 8 local rank: 6 gpu id: 6 ring id: 0 +I1219 16:59:56.935806 23610 nccl_context.cc:74] init nccl context nranks: 8 local rank: 4 gpu id: 4 ring id: 0 +I1219 16:59:56.935818 23659 nccl_context.cc:74] init nccl context nranks: 8 local rank: 7 gpu id: 7 ring id: 0 +I1219 16:59:56.935837 23545 nccl_context.cc:74] init nccl context nranks: 8 local rank: 0 gpu id: 0 ring id: 0 +W1219 17:00:00.904070 23545 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:00:00.904078 23562 device_context.cc:447] Please NOTE: device: 1, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:00:00.904153 23595 device_context.cc:447] Please NOTE: device: 3, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:00:00.904173 23610 device_context.cc:447] Please NOTE: device: 4, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:00:00.904186 23659 device_context.cc:447] Please NOTE: device: 7, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:00:00.904246 23642 device_context.cc:447] Please NOTE: device: 6, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:00:00.904264 23627 device_context.cc:447] Please NOTE: device: 5, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:00:00.906248 23580 device_context.cc:447] Please NOTE: device: 2, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:00:00.957355 23562 device_context.cc:465] device: 1, cuDNN Version: 7.6. +W1219 17:00:00.957355 23659 device_context.cc:465] device: 7, cuDNN Version: 7.6. +W1219 17:00:00.957358 23595 device_context.cc:465] device: 3, cuDNN Version: 7.6. +W1219 17:00:00.957360 23545 device_context.cc:465] device: 0, cuDNN Version: 7.6. +W1219 17:00:00.957374 23610 device_context.cc:465] device: 4, cuDNN Version: 7.6. +W1219 17:00:00.957383 23642 device_context.cc:465] device: 6, cuDNN Version: 7.6. +W1219 17:00:00.957394 23580 device_context.cc:465] device: 2, cuDNN Version: 7.6. +W1219 17:00:00.957394 23627 device_context.cc:465] device: 5, cuDNN Version: 7.6. 
+INFO:local_logger:----- world_size = 8, local_rank = 6 +INFO:local_logger:----- world_size = 8, local_rank = 3 +INFO:master_logger: +AMP: False +BASE: [''] +DATA: + BATCH_SIZE: 256 + BATCH_SIZE_EVAL: 8 + CROP_PCT: 0.875 + DATASET: imagenet2012 + DATA_PATH: /dataset/imagenet + IMAGE_SIZE: 224 + NUM_WORKERS: 4 +EVAL: False +LOCAL_RANK: 0 +MODEL: + ATTENTION_DROPOUT: 0.1 + DROPOUT: 0.1 + DROPPATH: 0.0 + MAE_PRETRAIN: True + NAME: vit_base_patch16_224_dec1 + NUM_CLASSES: 1000 + PRETRAINED: None + RESUME: None + TRANS: + DECODER: + DEPTH: 1 + EMBED_DIM: 512 + NUM_HEADS: 8 + ENCODER: + DEPTH: 12 + EMBED_DIM: 768 + NUM_HEADS: 12 + MASK_RATIO: 0.75 + MLP_RATIO: 4.0 + PATCH_SIZE: 16 + QKV_BIAS: True + TYPE: MAE +NGPUS: 8 +REPORT_FREQ: 100 +SAVE: ./output/train-20211219-16-59-32 +SAVE_FREQ: 1 +SEED: 0 +TAG: default +TRAIN: + ACCUM_ITER: 2 + BASE_LR: 0.00015 + CUTMIX_ALPHA: 1.0 + CUTMIX_MINMAX: None + END_LR: 0.0005 + GRAD_CLIP: 1 + LAST_EPOCH: 0 + LINEAR_SCALED_LR: None + LR_SCHEDULER: + DECAY_EPOCHS: 30 + DECAY_RATE: 0.1 + MILESTONES: 30, 60, 90 + NAME: warmupcosine + MIXUP_ALPHA: 0.8 + MIXUP_MODE: batch + MIXUP_PROB: 1.0 + MIXUP_SWITCH_PROB: 0.5 + NORMALIZE_TARGET: True + NUM_EPOCHS: 800 + OPTIMIZER: + BETAS: (0.9, 0.95) + EPS: 1e-08 + MOMENTUM: 0.9 + NAME: AdamW + RAND_AUGMENT: False + RAND_AUGMENT_LAYERS: 9 + RAND_AUGMENT_MAGNITUDE: 5 + SMOOTHING: 0.1 + WARMUP_EPOCHS: 40 + WARMUP_START_LR: 1e-06 + WEIGHT_DECAY: 0.05 +VALIDATE_FREQ: 100 +INFO:local_logger:----- world_size = 8, local_rank = 0 +INFO:master_logger:----- world_size = 8, local_rank = 0 +INFO:local_logger:----- world_size = 8, local_rank = 7 +INFO:local_logger:----- world_size = 8, local_rank = 5 +INFO:local_logger:----- world_size = 8, local_rank = 1 +INFO:local_logger:----- world_size = 8, local_rank = 2 +INFO:local_logger:----- world_size = 8, local_rank = 4 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:master_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:master_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:master_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +ERROR: Unexpected BUS error encountered in DataLoader worker. 
This might be caused by insufficient shared memory (shm), please check whether use_shared_memory is set and storage space in /dev/shm is enough +ERROR: Unexpected BUS error encountered in DataLoader worker. 
This might be caused by insufficient shared memory (shm), please check whether use_shared_memory is set and storage space in /dev/shm is enough +ERROR: Unexpected BUS error encountered in DataLoader worker. This might be caused by insufficient shared memory (shm), please check whether use_shared_memory is set and storage space in /dev/shm is enough +ERROR: Unexpected BUS error encountered in DataLoader worker. This might be caused by insufficient shared memory (shm), please check whether use_shared_memory is set and storage space in /dev/shm is enough +ERROR: Unexpected BUS error encountered in DataLoader worker. This might be caused by insufficient shared memory (shm), please check whether use_shared_memory is set and storage space in /dev/shm is enough +ERROR: Unexpected BUS error encountered in DataLoader worker. This might be caused by insufficient shared memory (shm), please check whether use_shared_memory is set and storage space in /dev/shm is enough +Exception in thread Thread-1: +Traceback (most recent call last): + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 583, in _get_data + data = self._data_queue.get(timeout=self._timeout) + File "/opt/conda/envs/py36/lib/python3.6/multiprocessing/queues.py", line 105, in get + raise Empty +queue.Empty + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/opt/conda/envs/py36/lib/python3.6/threading.py", line 916, in _bootstrap_inner + self.run() + File "/opt/conda/envs/py36/lib/python3.6/threading.py", line 864, in run + self._target(*self._args, **self._kwargs) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 505, in _thread_loop + batch = self._get_data() + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 599, in _get_data + "pids: {}".format(len(failed_workers), pids)) +RuntimeError: DataLoader 1 workers exit unexpectedly, pids: 23832 + + + +-------------------------------------- +C++ Traceback (most recent call last): +-------------------------------------- +No stack trace in paddle, may be caused by external reasons. + +---------------------- +Error Message Summary: +---------------------- +FatalError: `Termination signal` is detected by the operating system. 
+ [TimeInfo: *** Aborted at 1639904442 (unix time) try "date -d @1639904442" if you are using GNU date ***] + [SignalInfo: *** SIGTERM (@0x5be5) received by PID 23545 (TID 0x7f5dda7df700) from PID 23525 ***] + +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 20 leaked semaphores to clean up at shutdown + len(cache)) +Traceback (most recent call last): + File "main_multi_gpu_pretrain.py", line 416, in + main() + File "main_multi_gpu_pretrain.py", line 412, in main + dist.spawn(main_worker, args=(config, dataset_train, ), nprocs=config.NGPUS) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 502, in spawn + while not context.join(): + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 312, in join + self._throw_exception(error_index) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 330, in _throw_exception + raise Exception(msg) +Exception: + +---------------------------------------------- +Process 3 terminated with the following error: +---------------------------------------------- + +Traceback (most recent call last): + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 261, in _func_wrapper + result = func(*args) + File "/workspace/ppvit_github/PaddleViT_raw/PaddleViT/image_classification/MAE/main_multi_gpu_pretrain.py", line 368, in main_worker + master_logger=master_logger) + File "/workspace/ppvit_github/PaddleViT_raw/PaddleViT/image_classification/MAE/main_multi_gpu_pretrain.py", line 157, in train + reconstructed_patches = model(images, masks) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 914, in __call__ + outputs = self.forward(*inputs, **kwargs) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dygraph/parallel.py", line 695, in forward + outputs = self._layers(*inputs, **kwargs) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 914, in __call__ + outputs = self.forward(*inputs, **kwargs) + File "/workspace/ppvit_github/PaddleViT_raw/PaddleViT/image_classification/MAE/transformer.py", line 537, in forward + enc_out = self.encoder(no_mask_x) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 914, in __call__ + outputs = self.forward(*inputs, **kwargs) + File "/workspace/ppvit_github/PaddleViT_raw/PaddleViT/image_classification/MAE/transformer.py", line 364, in forward + x = layer(x) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 914, in __call__ + outputs = self.forward(*inputs, **kwargs) + File "/workspace/ppvit_github/PaddleViT_raw/PaddleViT/image_classification/MAE/transformer.py", line 310, in forward + x = self.mlp(x) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 914, in __call__ + outputs = self.forward(*inputs, **kwargs) + File "/workspace/ppvit_github/PaddleViT_raw/PaddleViT/image_classification/MAE/transformer.py", line 245, in forward + x = self.fc1(x) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 914, in __call__ + outputs = self.forward(*inputs, **kwargs) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/nn/layer/common.py", line 172, in forward + x=input, weight=self.weight, bias=self.bias, name=self.name) + File 
"/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/nn/functional/common.py", line 1474, in linear + False) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/multiprocess_utils.py", line 134, in __handler__ + core._throw_error_if_process_failed() +SystemError: (Fatal) DataLoader process (pid 1. If run DataLoader by DataLoader.from_generator(...), queue capacity is set by from_generator(..., capacity=xx, ...). + 2. If run DataLoader by DataLoader(dataset, ...), queue capacity is set as 2 times of the max value of num_workers and len(places). + 3. If run by DataLoader(dataset, ..., use_shared_memory=True), set use_shared_memory=False for not using shared memory.) exited is killed by signal: 23723. + It may be caused by insufficient shared storage space. This problem usually occurs when using docker as a development environment. + Please use command `df -h` to check the storage space of `/dev/shm`. Shared storage space needs to be greater than (DataLoader Num * DataLoader queue capacity * 1 batch data size). + You can solve this problem by increasing the shared storage space or reducing the queue capacity appropriately. +Bus error (at /paddle/paddle/fluid/imperative/data_loader.cc:177) + + +merging config from ./configs/vit_base_patch16_224_pretrain_dec1.yaml +----- Imagenet2012 image train list len = 1281167 +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:58819', '127.0.0.1:34756', '127.0.0.1:44071', '127.0.0.1:12661', '127.0.0.1:44311', '127.0.0.1:14139', '127.0.0.1:51679'] +I1219 17:02:09.309500 24382 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:58819 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:34756', '127.0.0.1:44071', '127.0.0.1:12661', '127.0.0.1:44311', '127.0.0.1:14139', '127.0.0.1:51679'] +I1219 17:02:11.901250 24397 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:34756 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:44071', '127.0.0.1:12661', '127.0.0.1:44311', '127.0.0.1:14139', '127.0.0.1:51679'] +I1219 17:02:14.341609 24414 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:44071 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:12661', '127.0.0.1:44311', '127.0.0.1:14139', '127.0.0.1:51679'] +I1219 17:02:17.001890 24429 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:12661 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:44311', '127.0.0.1:14139', '127.0.0.1:51679'] +I1219 17:02:19.379423 24447 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:44311 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:14139', '127.0.0.1:51679'] +I1219 17:02:22.029084 24463 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:14139 successful. +I1219 17:02:24.569348 24481 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:51679 successful. 
+I1219 17:02:24.931157 24382 nccl_context.cc:74] init nccl context nranks: 8 local rank: 1 gpu id: 1 ring id: 0 +I1219 17:02:24.931161 24397 nccl_context.cc:74] init nccl context nranks: 8 local rank: 2 gpu id: 2 ring id: 0 +I1219 17:02:24.931192 24414 nccl_context.cc:74] init nccl context nranks: 8 local rank: 3 gpu id: 3 ring id: 0 +I1219 17:02:24.931200 24429 nccl_context.cc:74] init nccl context nranks: 8 local rank: 4 gpu id: 4 ring id: 0 +I1219 17:02:24.931208 24447 nccl_context.cc:74] init nccl context nranks: 8 local rank: 5 gpu id: 5 ring id: 0 +I1219 17:02:24.931213 24463 nccl_context.cc:74] init nccl context nranks: 8 local rank: 6 gpu id: 6 ring id: 0 +I1219 17:02:24.931216 24481 nccl_context.cc:74] init nccl context nranks: 8 local rank: 7 gpu id: 7 ring id: 0 +I1219 17:02:24.931238 24365 nccl_context.cc:74] init nccl context nranks: 8 local rank: 0 gpu id: 0 ring id: 0 +W1219 17:02:28.374552 24365 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:02:28.374681 24397 device_context.cc:447] Please NOTE: device: 2, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:02:28.374711 24414 device_context.cc:447] Please NOTE: device: 3, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:02:28.374712 24429 device_context.cc:447] Please NOTE: device: 4, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:02:28.374729 24447 device_context.cc:447] Please NOTE: device: 5, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:02:28.374773 24382 device_context.cc:447] Please NOTE: device: 1, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:02:28.374810 24463 device_context.cc:447] Please NOTE: device: 6, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:02:28.376953 24481 device_context.cc:447] Please NOTE: device: 7, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:02:28.382552 24414 device_context.cc:465] device: 3, cuDNN Version: 7.6. +W1219 17:02:28.382556 24365 device_context.cc:465] device: 0, cuDNN Version: 7.6. +W1219 17:02:28.382561 24447 device_context.cc:465] device: 5, cuDNN Version: 7.6. +W1219 17:02:28.382565 24397 device_context.cc:465] device: 2, cuDNN Version: 7.6. +W1219 17:02:28.382582 24463 device_context.cc:465] device: 6, cuDNN Version: 7.6. +W1219 17:02:28.382568 24429 device_context.cc:465] device: 4, cuDNN Version: 7.6. +W1219 17:02:28.382580 24382 device_context.cc:465] device: 1, cuDNN Version: 7.6. +W1219 17:02:28.382681 24481 device_context.cc:465] device: 7, cuDNN Version: 7.6. 
+INFO:local_logger:----- world_size = 8, local_rank = 1 +INFO:local_logger:----- world_size = 8, local_rank = 5 +INFO:local_logger:----- world_size = 8, local_rank = 3 +INFO:local_logger:----- world_size = 8, local_rank = 2 +INFO:local_logger:----- world_size = 8, local_rank = 7 +INFO:local_logger:----- world_size = 8, local_rank = 6 +INFO:master_logger: +AMP: False +BASE: [''] +DATA: + BATCH_SIZE: 256 + BATCH_SIZE_EVAL: 8 + CROP_PCT: 0.875 + DATASET: imagenet2012 + DATA_PATH: /dataset/imagenet + IMAGE_SIZE: 224 + NUM_WORKERS: 4 +EVAL: False +LOCAL_RANK: 0 +MODEL: + ATTENTION_DROPOUT: 0.1 + DROPOUT: 0.1 + DROPPATH: 0.0 + MAE_PRETRAIN: True + NAME: vit_base_patch16_224_dec1 + NUM_CLASSES: 1000 + PRETRAINED: None + RESUME: None + TRANS: + DECODER: + DEPTH: 1 + EMBED_DIM: 512 + NUM_HEADS: 8 + ENCODER: + DEPTH: 12 + EMBED_DIM: 768 + NUM_HEADS: 12 + MASK_RATIO: 0.75 + MLP_RATIO: 4.0 + PATCH_SIZE: 16 + QKV_BIAS: True + TYPE: MAE +NGPUS: 8 +REPORT_FREQ: 100 +SAVE: ./output/train-20211219-17-02-00 +SAVE_FREQ: 1 +SEED: 0 +TAG: default +TRAIN: + ACCUM_ITER: 2 + BASE_LR: 0.00015 + CUTMIX_ALPHA: 1.0 + CUTMIX_MINMAX: None + END_LR: 0.0005 + GRAD_CLIP: 1 + LAST_EPOCH: 0 + LINEAR_SCALED_LR: None + LR_SCHEDULER: + DECAY_EPOCHS: 30 + DECAY_RATE: 0.1 + MILESTONES: 30, 60, 90 + NAME: warmupcosine + MIXUP_ALPHA: 0.8 + MIXUP_MODE: batch + MIXUP_PROB: 1.0 + MIXUP_SWITCH_PROB: 0.5 + NORMALIZE_TARGET: True + NUM_EPOCHS: 800 + OPTIMIZER: + BETAS: (0.9, 0.95) + EPS: 1e-08 + MOMENTUM: 0.9 + NAME: AdamW + RAND_AUGMENT: False + RAND_AUGMENT_LAYERS: 9 + RAND_AUGMENT_MAGNITUDE: 5 + SMOOTHING: 0.1 + WARMUP_EPOCHS: 40 + WARMUP_START_LR: 1e-06 + WEIGHT_DECAY: 0.05 +VALIDATE_FREQ: 100 +INFO:local_logger:----- world_size = 8, local_rank = 0 +INFO:master_logger:----- world_size = 8, local_rank = 0 +INFO:local_logger:----- world_size = 8, local_rank = 4 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:master_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:master_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:master_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. 
LR=0.000005 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1452 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1431 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1469 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1481 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1408 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1501 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1475 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1440 +INFO:master_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1457 + + +-------------------------------------- +C++ Traceback (most recent call last): +-------------------------------------- +No stack trace in paddle, may be caused by external reasons. + +---------------------- +Error Message Summary: +---------------------- +FatalError: `Termination signal` is detected by the operating system. + [TimeInfo: *** Aborted at 1639904603 (unix time) try "date -d @1639904603" if you are using GNU date ***] + [SignalInfo: *** SIGTERM (@0x5f17) received by PID 24365 (TID 0x7f5d5ca46700) from PID 24343 ***] + +Traceback (most recent call last): + File "main_multi_gpu_pretrain.py", line 416, in + main() + File "main_multi_gpu_pretrain.py", line 412, in main + dist.spawn(main_worker, args=(config, dataset_train, ), nprocs=config.NGPUS) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 502, in spawn + while not context.join(): + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 312, in join + self._throw_exception(error_index) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 330, in _throw_exception + raise Exception(msg) +Exception: + +---------------------------------------------- +Process 1 terminated with the following error: +---------------------------------------------- + +Traceback (most recent call last): + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 261, in _func_wrapper + result = func(*args) + File "/workspace/ppvit_github/PaddleViT_raw/PaddleViT/image_classification/MAE/main_multi_gpu_pretrain.py", line 368, in main_worker + master_logger=master_logger) + File "/workspace/ppvit_github/PaddleViT_raw/PaddleViT/image_classification/MAE/main_multi_gpu_pretrain.py", line 163, in train + loss.backward() + File "/opt/conda/envs/py36/lib/python3.6/site-packages/decorator.py", line 232, in fun + return caller(func, *(extras + args), **kw) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/wrapped_decorator.py", line 25, in __impl__ + return wrapped_func(*args, **kwargs) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/framework.py", line 229, in __impl__ + return func(*args, **kwargs) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/fluid/dygraph/varbase_patch_methods.py", line 239, in backward + framework._dygraph_tracer()) +OSError: (External) ResourceExhaustedError: + +Out of memory error on GPU 1. Cannot allocate 394.000244MB memory on GPU 1, 15.719788GB memory has been allocated and available memory is only 63.437500MB. + +Please check whether there is any other process using GPU 1. +1. If yes, please stop them, or start PaddlePaddle on another GPU. +2. If no, please decrease the batch size of your model. 
+ + (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:79) + (at /paddle/paddle/fluid/imperative/basic_engine.cc:568) + + +merging config from ./configs/vit_base_patch16_224_pretrain_dec1.yaml +----- Imagenet2012 image train list len = 1281167 +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:45480', '127.0.0.1:58605', '127.0.0.1:23406', '127.0.0.1:16014', '127.0.0.1:60086', '127.0.0.1:60603', '127.0.0.1:46782'] +I1219 17:07:49.286090 25456 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:45480 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:58605', '127.0.0.1:23406', '127.0.0.1:16014', '127.0.0.1:60086', '127.0.0.1:60603', '127.0.0.1:46782'] +I1219 17:07:51.690086 25473 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:58605 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:23406', '127.0.0.1:16014', '127.0.0.1:60086', '127.0.0.1:60603', '127.0.0.1:46782'] +I1219 17:07:54.058967 25488 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:23406 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:16014', '127.0.0.1:60086', '127.0.0.1:60603', '127.0.0.1:46782'] +I1219 17:07:57.064612 25503 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:16014 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:60086', '127.0.0.1:60603', '127.0.0.1:46782'] +I1219 17:07:59.496040 25520 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:60086 successful. +server not ready, wait 3 sec to retry... +not ready endpoints:['127.0.0.1:60603', '127.0.0.1:46782'] +I1219 17:08:02.203279 25537 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:60603 successful. +I1219 17:08:04.597697 25554 gen_comm_id_helper.cc:190] Server listening on: 127.0.0.1:46782 successful. 
+I1219 17:08:05.017540 25473 nccl_context.cc:74] init nccl context nranks: 8 local rank: 2 gpu id: 2 ring id: 0 +I1219 17:08:05.017537 25456 nccl_context.cc:74] init nccl context nranks: 8 local rank: 1 gpu id: 1 ring id: 0 +I1219 17:08:05.017560 25488 nccl_context.cc:74] init nccl context nranks: 8 local rank: 3 gpu id: 3 ring id: 0 +I1219 17:08:05.017565 25537 nccl_context.cc:74] init nccl context nranks: 8 local rank: 6 gpu id: 6 ring id: 0 +I1219 17:08:05.017578 25503 nccl_context.cc:74] init nccl context nranks: 8 local rank: 4 gpu id: 4 ring id: 0 +I1219 17:08:05.017585 25520 nccl_context.cc:74] init nccl context nranks: 8 local rank: 5 gpu id: 5 ring id: 0 +I1219 17:08:05.017601 25554 nccl_context.cc:74] init nccl context nranks: 8 local rank: 7 gpu id: 7 ring id: 0 +I1219 17:08:05.017613 25441 nccl_context.cc:74] init nccl context nranks: 8 local rank: 0 gpu id: 0 ring id: 0 +W1219 17:08:09.206136 25441 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:08:09.206564 25456 device_context.cc:447] Please NOTE: device: 1, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:08:09.206579 25554 device_context.cc:447] Please NOTE: device: 7, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:08:09.206670 25488 device_context.cc:447] Please NOTE: device: 3, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:08:09.206694 25520 device_context.cc:447] Please NOTE: device: 5, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:08:09.206728 25503 device_context.cc:447] Please NOTE: device: 4, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:08:09.209081 25537 device_context.cc:447] Please NOTE: device: 6, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:08:09.209785 25473 device_context.cc:447] Please NOTE: device: 2, GPU Compute Capability: 7.0, Driver API Version: 10.2, Runtime API Version: 10.2 +W1219 17:08:09.212059 25456 device_context.cc:465] device: 1, cuDNN Version: 7.6. +W1219 17:08:09.212066 25554 device_context.cc:465] device: 7, cuDNN Version: 7.6. +W1219 17:08:09.212080 25503 device_context.cc:465] device: 4, cuDNN Version: 7.6. +W1219 17:08:09.212086 25520 device_context.cc:465] device: 5, cuDNN Version: 7.6. +W1219 17:08:09.212086 25488 device_context.cc:465] device: 3, cuDNN Version: 7.6. +W1219 17:08:09.212239 25441 device_context.cc:465] device: 0, cuDNN Version: 7.6. +W1219 17:08:09.213409 25537 device_context.cc:465] device: 6, cuDNN Version: 7.6. +W1219 17:08:09.214195 25473 device_context.cc:465] device: 2, cuDNN Version: 7.6. 
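The "init nccl context nranks: 8" lines above come from one spawned process per GPU; the tracebacks earlier in the log show the entry point is dist.spawn(main_worker, args=(config, dataset_train, ), nprocs=config.NGPUS). The sketch below only mirrors that launch pattern: the dist.spawn and init_parallel_env calls match what the traceback shows, while the worker body is a placeholder, not the repository's main_worker.

# Minimal sketch of the multi-GPU launch pattern behind the nccl-init lines above.
# Only dist.spawn / init_parallel_env mirror the traceback; the worker body is a placeholder.
import paddle
import paddle.distributed as dist

def main_worker(*args):
    dist.init_parallel_env()              # sets up the NCCL ring for this rank
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f'----- world_size = {world_size}, local_rank = {rank}')

if __name__ == '__main__':
    dist.spawn(main_worker, args=(), nprocs=8)   # one process per GPU, as in the log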
+INFO:local_logger:----- world_size = 8, local_rank = 4 +INFO:local_logger:----- world_size = 8, local_rank = 1 +INFO:local_logger:----- world_size = 8, local_rank = 2 +INFO:master_logger: +AMP: True +BASE: [''] +DATA: + BATCH_SIZE: 256 + BATCH_SIZE_EVAL: 8 + CROP_PCT: 0.875 + DATASET: imagenet2012 + DATA_PATH: /dataset/imagenet + IMAGE_SIZE: 224 + NUM_WORKERS: 2 +EVAL: False +LOCAL_RANK: 0 +MODEL: + ATTENTION_DROPOUT: 0.0 + DROPOUT: 0.0 + DROPPATH: 0.0 + MAE_PRETRAIN: True + NAME: vit_base_patch16_224_dec1 + NUM_CLASSES: 1000 + PRETRAINED: None + RESUME: None + TRANS: + DECODER: + DEPTH: 1 + EMBED_DIM: 512 + NUM_HEADS: 8 + ENCODER: + DEPTH: 12 + EMBED_DIM: 768 + NUM_HEADS: 12 + MASK_RATIO: 0.75 + MLP_RATIO: 4.0 + PATCH_SIZE: 16 + QKV_BIAS: True + TYPE: MAE +NGPUS: 8 +REPORT_FREQ: 100 +SAVE: ./output/train-20211219-17-07-40 +SAVE_FREQ: 1 +SEED: 0 +TAG: default +TRAIN: + ACCUM_ITER: 2 + BASE_LR: 0.00015 + CUTMIX_ALPHA: 1.0 + CUTMIX_MINMAX: None + END_LR: 0.0005 + GRAD_CLIP: 1 + LAST_EPOCH: 0 + LINEAR_SCALED_LR: None + LR_SCHEDULER: + DECAY_EPOCHS: 30 + DECAY_RATE: 0.1 + MILESTONES: 30, 60, 90 + NAME: warmupcosine + MIXUP_ALPHA: 0.8 + MIXUP_MODE: batch + MIXUP_PROB: 1.0 + MIXUP_SWITCH_PROB: 0.5 + NORMALIZE_TARGET: True + NUM_EPOCHS: 800 + OPTIMIZER: + BETAS: (0.9, 0.95) + EPS: 1e-08 + MOMENTUM: 0.9 + NAME: AdamW + RAND_AUGMENT: False + RAND_AUGMENT_LAYERS: 9 + RAND_AUGMENT_MAGNITUDE: 5 + SMOOTHING: 0.1 + WARMUP_EPOCHS: 40 + WARMUP_START_LR: 1e-06 + WEIGHT_DECAY: 0.05 +VALIDATE_FREQ: 100 +INFO:local_logger:----- world_size = 8, local_rank = 0 +INFO:master_logger:----- world_size = 8, local_rank = 0 +INFO:local_logger:----- world_size = 8, local_rank = 6 +INFO:local_logger:----- world_size = 8, local_rank = 5 +INFO:local_logger:----- world_size = 8, local_rank = 7 +INFO:local_logger:----- world_size = 8, local_rank = 3 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:master_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:master_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:master_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. LR=0.000005 +INFO:local_logger:----- Total # of train batch (single gpu): 626 +INFO:local_logger:Start training from epoch 1. +INFO:local_logger:Now training epoch 1. 
LR=0.000005 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1468 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1446 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1495 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1428 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1450 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1461 +INFO:master_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1454 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1459 +INFO:local_logger:Epoch[001/800], Step[0000/0626], Avg Loss: 1.1427 +INFO:local_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1136 +INFO:local_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1140 +INFO:local_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1137 +INFO:local_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1132 +INFO:local_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1132 +INFO:master_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1136 +INFO:local_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1135 +INFO:local_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1138 +INFO:local_logger:Epoch[001/800], Step[0100/0626], Avg Loss: 1.1139 +INFO:local_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0903 +INFO:local_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0904 +INFO:local_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0904 +INFO:local_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0908 +INFO:local_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0903 +INFO:local_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0900 +INFO:local_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0904 +INFO:local_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0902 +INFO:master_logger:Epoch[001/800], Step[0200/0626], Avg Loss: 1.0904 +INFO:local_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0723 +INFO:local_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0717 +INFO:local_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0718 +INFO:local_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0716 +INFO:local_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0719 +INFO:master_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0719 +INFO:local_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0718 +INFO:local_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0720 +INFO:local_logger:Epoch[001/800], Step[0300/0626], Avg Loss: 1.0720 +INFO:local_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0576 +INFO:local_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0572 +INFO:local_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0572 +INFO:local_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0570 +INFO:local_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0573 +INFO:local_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0570 +INFO:local_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0573 +INFO:master_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0572 +INFO:local_logger:Epoch[001/800], Step[0400/0626], Avg Loss: 1.0574 +INFO:local_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0461 +INFO:local_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0459 +INFO:local_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0459 +INFO:local_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0461 +INFO:local_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0457 +INFO:local_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0461 
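The "Total # of train batch (single gpu): 626" lines follow directly from the numbers already printed in the log: 1,281,167 training images sharded across 8 GPUs at batch size 256. A quick check of that arithmetic, plus the effective batch size once ACCUM_ITER from the config dump is included (the ceiling behaviour of the distributed sampler is an assumption, but it is consistent with the logged 626):

# Quick check of the "626 train batches per GPU" figure from the log above.
import math

num_images = 1281167         # "Imagenet2012 image train list len" from the log
batch_size = 256             # DATA.BATCH_SIZE
ngpus = 8                    # NGPUS
accum_iter = 2               # TRAIN.ACCUM_ITER

per_gpu_samples = math.ceil(num_images / ngpus)            # 160146
per_gpu_batches = math.ceil(per_gpu_samples / batch_size)  # 626, matches the log
effective_batch = batch_size * ngpus * accum_iter          # 4096 samples per optimizer update
print(per_gpu_batches, effective_batch)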
+INFO:master_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0460 +INFO:local_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0463 +INFO:local_logger:Epoch[001/800], Step[0500/0626], Avg Loss: 1.0461 +INFO:local_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0374 +INFO:local_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0374 +INFO:local_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0375 +INFO:local_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0375 +INFO:master_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0375 +INFO:local_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0372 +INFO:local_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0377 +INFO:local_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0379 +INFO:local_logger:Epoch[001/800], Step[0600/0626], Avg Loss: 1.0374 +INFO:local_logger:----- Epoch[001/800], Train Loss: 1.0359, time: 934.80 +INFO:local_logger:Now training epoch 2. LR=0.000008 +INFO:local_logger:----- Epoch[001/800], Train Loss: 1.0356, time: 934.81 +INFO:local_logger:Now training epoch 2. LR=0.000008 +INFO:local_logger:----- Epoch[001/800], Train Loss: 1.0354, time: 934.86 +INFO:local_logger:Now training epoch 2. LR=0.000008 +INFO:local_logger:----- Epoch[001/800], Train Loss: 1.0361, time: 934.98 +INFO:local_logger:Now training epoch 2. LR=0.000008 +INFO:local_logger:----- Epoch[001/800], Train Loss: 1.0358, time: 935.03 +INFO:master_logger:----- Epoch[001/800], Train Loss: 1.0357, time: 935.03 +INFO:local_logger:----- Epoch[001/800], Train Loss: 1.0358, time: 935.07 +INFO:local_logger:Now training epoch 2. LR=0.000008 +INFO:local_logger:----- Epoch[001/800], Train Loss: 1.0356, time: 935.07 +INFO:local_logger:Now training epoch 2. LR=0.000008 +INFO:local_logger:----- Epoch[001/800], Train Loss: 1.0357, time: 935.09 +INFO:local_logger:Now training epoch 2. LR=0.000008 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-1-Loss-1.0357822933105671.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-1-Loss-1.0357822933105671.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-1-Loss-1.0357822933105671.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-1-Loss-1.0357822933105671.pdopt +INFO:local_logger:Now training epoch 2. LR=0.000008 +INFO:master_logger:Now training epoch 2. 
LR=0.000008 +INFO:local_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9953 +INFO:master_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9905 +INFO:local_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9836 +INFO:local_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9941 +INFO:local_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9887 +INFO:local_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9872 +INFO:local_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9919 +INFO:local_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9949 +INFO:local_logger:Epoch[002/800], Step[0000/0626], Avg Loss: 0.9885 +INFO:local_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9896 +INFO:local_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9894 +INFO:local_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9900 +INFO:local_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9895 +INFO:local_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9901 +INFO:local_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9887 +INFO:local_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9897 +INFO:master_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9896 +INFO:local_logger:Epoch[002/800], Step[0100/0626], Avg Loss: 0.9900 +INFO:local_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9880 +INFO:local_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9889 +INFO:local_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9887 +INFO:local_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9883 +INFO:local_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9887 +INFO:local_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9887 +INFO:local_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9883 +INFO:master_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9885 +INFO:local_logger:Epoch[002/800], Step[0200/0626], Avg Loss: 0.9883 +INFO:local_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9878 +INFO:local_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9874 +INFO:local_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9873 +INFO:local_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9875 +INFO:master_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9876 +INFO:local_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9877 +INFO:local_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9880 +INFO:local_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9878 +INFO:local_logger:Epoch[002/800], Step[0300/0626], Avg Loss: 0.9872 +INFO:local_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9872 +INFO:local_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9870 +INFO:local_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9867 +INFO:local_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9867 +INFO:local_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9870 +INFO:local_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9871 +INFO:local_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9870 +INFO:local_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9868 +INFO:master_logger:Epoch[002/800], Step[0400/0626], Avg Loss: 0.9869 +INFO:local_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9862 +INFO:local_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9865 +INFO:local_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9861 +INFO:local_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9864 +INFO:local_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9863 +INFO:local_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9861 
+INFO:local_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9862 +INFO:local_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9863 +INFO:master_logger:Epoch[002/800], Step[0500/0626], Avg Loss: 0.9863 +INFO:local_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9856 +INFO:local_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9858 +INFO:local_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9858 +INFO:local_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9855 +INFO:local_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9855 +INFO:local_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9856 +INFO:master_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9856 +INFO:local_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9856 +INFO:local_logger:Epoch[002/800], Step[0600/0626], Avg Loss: 0.9856 +INFO:local_logger:----- Epoch[002/800], Train Loss: 0.9857, time: 891.36 +INFO:local_logger:Now training epoch 3. LR=0.000012 +INFO:local_logger:----- Epoch[002/800], Train Loss: 0.9855, time: 891.28 +INFO:local_logger:Now training epoch 3. LR=0.000012 +INFO:local_logger:----- Epoch[002/800], Train Loss: 0.9853, time: 891.70 +INFO:local_logger:Now training epoch 3. LR=0.000012 +INFO:local_logger:----- Epoch[002/800], Train Loss: 0.9855, time: 891.46 +INFO:local_logger:Now training epoch 3. LR=0.000012 +INFO:local_logger:----- Epoch[002/800], Train Loss: 0.9853, time: 891.66 +INFO:local_logger:Now training epoch 3. LR=0.000012 +INFO:local_logger:----- Epoch[002/800], Train Loss: 0.9855, time: 891.47 +INFO:local_logger:Now training epoch 3. LR=0.000012 +INFO:local_logger:----- Epoch[002/800], Train Loss: 0.9857, time: 891.56 +INFO:local_logger:Now training epoch 3. LR=0.000012 +INFO:local_logger:----- Epoch[002/800], Train Loss: 0.9854, time: 887.62 +INFO:master_logger:----- Epoch[002/800], Train Loss: 0.9855, time: 887.62 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-2-Loss-0.9854484576284688.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-2-Loss-0.9854484576284688.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-2-Loss-0.9854484576284688.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-2-Loss-0.9854484576284688.pdopt +INFO:local_logger:Now training epoch 3. LR=0.000012 +INFO:master_logger:Now training epoch 3. 
LR=0.000012
+[MAE pre-training nohup.out log, epochs 003-013 of 800, 626 steps/epoch across 8 ranks; per-rank local_logger and per-100-step progress lines condensed to the master_logger epoch summaries below]
+INFO:master_logger:----- Epoch[003/800], Train Loss: 0.9773, time: 889.63
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-3-Loss-0.9772286424963117.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-3-Loss-0.9772286424963117.pdopt
+INFO:master_logger:Now training epoch 4. LR=0.000016
+INFO:master_logger:----- Epoch[004/800], Train Loss: 0.9702, time: 851.06
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-4-Loss-0.97028241060033.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-4-Loss-0.97028241060033.pdopt
+INFO:master_logger:Now training epoch 5. LR=0.000020
+INFO:master_logger:----- Epoch[005/800], Train Loss: 0.9651, time: 885.85
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-5-Loss-0.9652042168475674.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-5-Loss-0.9652042168475674.pdopt
+INFO:master_logger:Now training epoch 6. LR=0.000023
+INFO:master_logger:----- Epoch[006/800], Train Loss: 0.9605, time: 857.72
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-6-Loss-0.9604088297024008.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-6-Loss-0.9604088297024008.pdopt
+INFO:master_logger:Now training epoch 7. LR=0.000027
+INFO:master_logger:----- Epoch[007/800], Train Loss: 0.9559, time: 885.04
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-7-Loss-0.9557424400537671.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-7-Loss-0.9557424400537671.pdopt
+INFO:master_logger:Now training epoch 8. LR=0.000031
+INFO:master_logger:----- Epoch[008/800], Train Loss: 0.9505, time: 852.20
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-8-Loss-0.950418085337367.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-8-Loss-0.950418085337367.pdopt
+INFO:master_logger:Now training epoch 9. LR=0.000035
+INFO:master_logger:----- Epoch[009/800], Train Loss: 0.9424, time: 886.67
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-9-Loss-0.9425096387053156.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-9-Loss-0.9425096387053156.pdopt
+INFO:master_logger:Now training epoch 10. LR=0.000038
+INFO:master_logger:----- Epoch[010/800], Train Loss: 0.9318, time: 854.67
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-10-Loss-0.9318290638491608.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-10-Loss-0.9318290638491608.pdopt
+INFO:master_logger:Now training epoch 11. LR=0.000042
+INFO:master_logger:----- Epoch[011/800], Train Loss: 0.9209, time: 884.67
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-11-Loss-0.9209032249693648.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-11-Loss-0.9209032249693648.pdopt
+INFO:master_logger:Now training epoch 12. LR=0.000046
+INFO:master_logger:----- Epoch[012/800], Train Loss: 0.9101, time: 847.34
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-12-Loss-0.9097320030754859.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-12-Loss-0.9097320030754859.pdopt
+INFO:master_logger:Now training epoch 13. LR=0.000049
+INFO:master_logger:----- Epoch[013/800], Train Loss: 0.9000, time: 879.83
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-13-Loss-0.9000374903566999.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-13-Loss-0.9000374903566999.pdopt
+INFO:master_logger:Now training epoch 14.
LR=0.000053 +INFO:local_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8904 +INFO:master_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8921 +INFO:local_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8961 +INFO:local_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8964 +INFO:local_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8893 +INFO:local_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8854 +INFO:local_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8944 +INFO:local_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8877 +INFO:local_logger:Epoch[014/800], Step[0000/0626], Avg Loss: 0.8971 +INFO:local_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8953 +INFO:local_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8955 +INFO:local_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8938 +INFO:local_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8954 +INFO:master_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8951 +INFO:local_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8947 +INFO:local_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8947 +INFO:local_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8950 +INFO:local_logger:Epoch[014/800], Step[0100/0626], Avg Loss: 0.8962 +INFO:local_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8930 +INFO:local_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8934 +INFO:local_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8928 +INFO:local_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8927 +INFO:local_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8928 +INFO:local_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8934 +INFO:master_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8931 +INFO:local_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8934 +INFO:local_logger:Epoch[014/800], Step[0200/0626], Avg Loss: 0.8931 +INFO:local_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8927 +INFO:local_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8919 +INFO:local_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8921 +INFO:local_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8920 +INFO:local_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8926 +INFO:local_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8924 +INFO:local_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8919 +INFO:master_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8922 +INFO:local_logger:Epoch[014/800], Step[0300/0626], Avg Loss: 0.8920 +INFO:local_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8909 +INFO:local_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8914 +INFO:local_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8908 +INFO:local_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8909 +INFO:local_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8912 +INFO:local_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8910 +INFO:master_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8910 +INFO:local_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8906 +INFO:local_logger:Epoch[014/800], Step[0400/0626], Avg Loss: 0.8914 +INFO:local_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8903 +INFO:local_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8904 +INFO:local_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8906 +INFO:local_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8900 +INFO:local_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8906 +INFO:local_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8902 
+INFO:local_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8901 +INFO:local_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8902 +INFO:master_logger:Epoch[014/800], Step[0500/0626], Avg Loss: 0.8903 +INFO:local_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8896 +INFO:local_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8898 +INFO:local_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8896 +INFO:master_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8896 +INFO:local_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8897 +INFO:local_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8893 +INFO:local_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8897 +INFO:local_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8893 +INFO:local_logger:Epoch[014/800], Step[0600/0626], Avg Loss: 0.8894 +INFO:local_logger:----- Epoch[014/800], Train Loss: 0.8895, time: 845.39 +INFO:local_logger:Now training epoch 15. LR=0.000057 +INFO:local_logger:----- Epoch[014/800], Train Loss: 0.8895, time: 845.75 +INFO:local_logger:Now training epoch 15. LR=0.000057 +INFO:local_logger:----- Epoch[014/800], Train Loss: 0.8892, time: 846.69 +INFO:local_logger:----- Epoch[014/800], Train Loss: 0.8894, time: 846.08 +INFO:local_logger:Now training epoch 15. LR=0.000057 +INFO:local_logger:----- Epoch[014/800], Train Loss: 0.8891, time: 847.03 +INFO:local_logger:Now training epoch 15. LR=0.000057 +INFO:local_logger:Now training epoch 15. LR=0.000057 +INFO:local_logger:----- Epoch[014/800], Train Loss: 0.8890, time: 846.07 +INFO:local_logger:Now training epoch 15. LR=0.000057 +INFO:local_logger:----- Epoch[014/800], Train Loss: 0.8893, time: 842.90 +INFO:master_logger:----- Epoch[014/800], Train Loss: 0.8893, time: 842.90 +INFO:local_logger:----- Epoch[014/800], Train Loss: 0.8894, time: 846.07 +INFO:local_logger:Now training epoch 15. LR=0.000057 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-14-Loss-0.8892871914493445.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-14-Loss-0.8892871914493445.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-14-Loss-0.8892871914493445.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-14-Loss-0.8892871914493445.pdopt +INFO:local_logger:Now training epoch 15. LR=0.000057 +INFO:master_logger:Now training epoch 15. 
LR=0.000057 +INFO:local_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8662 +INFO:local_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8831 +INFO:local_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8842 +INFO:local_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8880 +INFO:local_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8864 +INFO:local_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8846 +INFO:master_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8812 +INFO:local_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8773 +INFO:local_logger:Epoch[015/800], Step[0000/0626], Avg Loss: 0.8795 +INFO:local_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8853 +INFO:local_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8863 +INFO:local_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8867 +INFO:local_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8861 +INFO:local_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8861 +INFO:local_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8862 +INFO:local_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8860 +INFO:master_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8860 +INFO:local_logger:Epoch[015/800], Step[0100/0626], Avg Loss: 0.8851 +INFO:local_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8851 +INFO:local_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8849 +INFO:local_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8845 +INFO:local_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8847 +INFO:master_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8846 +INFO:local_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8841 +INFO:local_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8841 +INFO:local_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8847 +INFO:local_logger:Epoch[015/800], Step[0200/0626], Avg Loss: 0.8845 +INFO:local_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8832 +INFO:local_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8831 +INFO:local_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8833 +INFO:local_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8827 +INFO:local_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8833 +INFO:local_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8833 +INFO:local_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8830 +INFO:master_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8831 +INFO:local_logger:Epoch[015/800], Step[0300/0626], Avg Loss: 0.8826 +INFO:local_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8827 +INFO:local_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8824 +INFO:local_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8825 +INFO:local_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8824 +INFO:local_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8827 +INFO:local_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8820 +INFO:local_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8828 +INFO:local_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8819 +INFO:master_logger:Epoch[015/800], Step[0400/0626], Avg Loss: 0.8824 +INFO:local_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8819 +INFO:local_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8813 +INFO:local_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8812 +INFO:local_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8818 +INFO:local_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8816 +INFO:local_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8820 
+INFO:local_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8815 +INFO:master_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8816 +INFO:local_logger:Epoch[015/800], Step[0500/0626], Avg Loss: 0.8818 +INFO:local_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8808 +INFO:local_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8805 +INFO:local_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8807 +INFO:local_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8807 +INFO:local_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8804 +INFO:master_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8806 +INFO:local_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8808 +INFO:local_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8804 +INFO:local_logger:Epoch[015/800], Step[0600/0626], Avg Loss: 0.8809 +INFO:local_logger:----- Epoch[015/800], Train Loss: 0.8805, time: 897.37 +INFO:local_logger:Now training epoch 16. LR=0.000061 +INFO:local_logger:----- Epoch[015/800], Train Loss: 0.8805, time: 893.90 +INFO:master_logger:----- Epoch[015/800], Train Loss: 0.8804, time: 893.90 +INFO:local_logger:----- Epoch[015/800], Train Loss: 0.8805, time: 898.09 +INFO:local_logger:----- Epoch[015/800], Train Loss: 0.8803, time: 898.09 +INFO:local_logger:Now training epoch 16. LR=0.000061 +INFO:local_logger:Now training epoch 16. LR=0.000061 +INFO:local_logger:----- Epoch[015/800], Train Loss: 0.8802, time: 898.08 +INFO:local_logger:Now training epoch 16. LR=0.000061 +INFO:local_logger:----- Epoch[015/800], Train Loss: 0.8802, time: 898.09 +INFO:local_logger:Now training epoch 16. LR=0.000061 +INFO:local_logger:----- Epoch[015/800], Train Loss: 0.8807, time: 898.77 +INFO:local_logger:Now training epoch 16. LR=0.000061 +INFO:local_logger:----- Epoch[015/800], Train Loss: 0.8806, time: 898.79 +INFO:local_logger:Now training epoch 16. LR=0.000061 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-15-Loss-0.8804958925234925.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-15-Loss-0.8804958925234925.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-15-Loss-0.8804958925234925.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-15-Loss-0.8804958925234925.pdopt +INFO:local_logger:Now training epoch 16. LR=0.000061 +INFO:master_logger:Now training epoch 16. 
LR=0.000061 +INFO:local_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8772 +INFO:local_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8776 +INFO:local_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8818 +INFO:local_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8756 +INFO:master_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8774 +INFO:local_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8834 +INFO:local_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8802 +INFO:local_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8772 +INFO:local_logger:Epoch[016/800], Step[0000/0626], Avg Loss: 0.8659 +INFO:local_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8729 +INFO:local_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8735 +INFO:local_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8742 +INFO:local_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8743 +INFO:local_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8738 +INFO:master_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8738 +INFO:local_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8741 +INFO:local_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8747 +INFO:local_logger:Epoch[016/800], Step[0100/0626], Avg Loss: 0.8731 +INFO:local_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8724 +INFO:local_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8723 +INFO:local_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8727 +INFO:local_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8726 +INFO:local_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8731 +INFO:local_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8732 +INFO:local_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8728 +INFO:master_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8728 +INFO:local_logger:Epoch[016/800], Step[0200/0626], Avg Loss: 0.8732 +INFO:local_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8718 +INFO:local_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8723 +INFO:local_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8717 +INFO:local_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8718 +INFO:local_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8718 +INFO:local_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8720 +INFO:master_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8719 +INFO:local_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8722 +INFO:local_logger:Epoch[016/800], Step[0300/0626], Avg Loss: 0.8717 +INFO:local_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8710 +INFO:local_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8710 +INFO:local_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8710 +INFO:local_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8713 +INFO:local_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8712 +INFO:local_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8717 +INFO:master_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8712 +INFO:local_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8713 +INFO:local_logger:Epoch[016/800], Step[0400/0626], Avg Loss: 0.8713 +INFO:local_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8707 +INFO:local_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8706 +INFO:local_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8710 +INFO:local_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8707 +INFO:local_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8709 +INFO:local_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8706 
+INFO:master_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8707 +INFO:local_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8705 +INFO:local_logger:Epoch[016/800], Step[0500/0626], Avg Loss: 0.8709 +INFO:local_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8696 +INFO:local_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8697 +INFO:local_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8697 +INFO:local_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8697 +INFO:local_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8696 +INFO:local_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8700 +INFO:local_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8701 +INFO:master_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8698 +INFO:local_logger:Epoch[016/800], Step[0600/0626], Avg Loss: 0.8700 +INFO:local_logger:----- Epoch[016/800], Train Loss: 0.8695, time: 861.71 +INFO:local_logger:Now training epoch 17. LR=0.000064 +INFO:local_logger:----- Epoch[016/800], Train Loss: 0.8693, time: 862.54 +INFO:local_logger:Now training epoch 17. LR=0.000064 +INFO:local_logger:----- Epoch[016/800], Train Loss: 0.8699, time: 862.86 +INFO:local_logger:Now training epoch 17. LR=0.000064 +INFO:local_logger:----- Epoch[016/800], Train Loss: 0.8695, time: 863.58 +INFO:local_logger:Now training epoch 17. LR=0.000064 +INFO:local_logger:----- Epoch[016/800], Train Loss: 0.8695, time: 862.86 +INFO:local_logger:----- Epoch[016/800], Train Loss: 0.8698, time: 862.85 +INFO:local_logger:Now training epoch 17. LR=0.000064 +INFO:local_logger:Now training epoch 17. LR=0.000064 +INFO:local_logger:----- Epoch[016/800], Train Loss: 0.8698, time: 862.87 +INFO:local_logger:Now training epoch 17. LR=0.000064 +INFO:local_logger:----- Epoch[016/800], Train Loss: 0.8694, time: 859.59 +INFO:master_logger:----- Epoch[016/800], Train Loss: 0.8696, time: 859.59 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-16-Loss-0.8694310493630203.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-16-Loss-0.8694310493630203.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-16-Loss-0.8694310493630203.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-16-Loss-0.8694310493630203.pdopt +INFO:local_logger:Now training epoch 17. LR=0.000064 +INFO:master_logger:Now training epoch 17. 
LR=0.000064 +INFO:local_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8663 +INFO:local_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8709 +INFO:local_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8677 +INFO:local_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8526 +INFO:local_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8679 +INFO:local_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8632 +INFO:master_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8658 +INFO:local_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8670 +INFO:local_logger:Epoch[017/800], Step[0000/0626], Avg Loss: 0.8705 +INFO:local_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8666 +INFO:local_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8669 +INFO:local_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8674 +INFO:local_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8667 +INFO:master_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8668 +INFO:local_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8662 +INFO:local_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8672 +INFO:local_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8665 +INFO:local_logger:Epoch[017/800], Step[0100/0626], Avg Loss: 0.8673 +INFO:local_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8654 +INFO:local_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8660 +INFO:local_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8658 +INFO:local_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8659 +INFO:local_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8654 +INFO:local_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8658 +INFO:master_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8657 +INFO:local_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8656 +INFO:local_logger:Epoch[017/800], Step[0200/0626], Avg Loss: 0.8654 +INFO:local_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8647 +INFO:local_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8648 +INFO:local_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8646 +INFO:local_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8649 +INFO:master_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8647 +INFO:local_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8652 +INFO:local_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8646 +INFO:local_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8642 +INFO:local_logger:Epoch[017/800], Step[0300/0626], Avg Loss: 0.8648 +INFO:local_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8629 +INFO:local_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8639 +INFO:local_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8636 +INFO:local_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8635 +INFO:local_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8634 +INFO:local_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8634 +INFO:local_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8635 +INFO:local_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8633 +INFO:master_logger:Epoch[017/800], Step[0400/0626], Avg Loss: 0.8634 +INFO:local_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8619 +INFO:local_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8628 +INFO:local_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8626 +INFO:local_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8624 +INFO:local_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8622 +INFO:local_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8624 
+INFO:master_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8624 +INFO:local_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8625 +INFO:local_logger:Epoch[017/800], Step[0500/0626], Avg Loss: 0.8625 +INFO:local_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8615 +INFO:local_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8619 +INFO:local_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8620 +INFO:local_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8613 +INFO:local_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8618 +INFO:local_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8618 +INFO:local_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8616 +INFO:master_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8617 +INFO:local_logger:Epoch[017/800], Step[0600/0626], Avg Loss: 0.8619 +INFO:local_logger:----- Epoch[017/800], Train Loss: 0.8617, time: 890.30 +INFO:local_logger:Now training epoch 18. LR=0.000068 +INFO:local_logger:----- Epoch[017/800], Train Loss: 0.8616, time: 890.30 +INFO:local_logger:Now training epoch 18. LR=0.000068 +INFO:local_logger:----- Epoch[017/800], Train Loss: 0.8617, time: 890.31 +INFO:local_logger:Now training epoch 18. LR=0.000068 +INFO:local_logger:----- Epoch[017/800], Train Loss: 0.8610, time: 890.96 +INFO:local_logger:Now training epoch 18. LR=0.000068 +INFO:local_logger:----- Epoch[017/800], Train Loss: 0.8614, time: 887.14 +INFO:master_logger:----- Epoch[017/800], Train Loss: 0.8615, time: 887.14 +INFO:local_logger:----- Epoch[017/800], Train Loss: 0.8616, time: 891.29 +INFO:local_logger:Now training epoch 18. LR=0.000068 +INFO:local_logger:----- Epoch[017/800], Train Loss: 0.8617, time: 890.99 +INFO:local_logger:Now training epoch 18. LR=0.000068 +INFO:local_logger:----- Epoch[017/800], Train Loss: 0.8614, time: 892.15 +INFO:local_logger:Now training epoch 18. LR=0.000068 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-17-Loss-0.8613511298173326.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-17-Loss-0.8613511298173326.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-17-Loss-0.8613511298173326.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-17-Loss-0.8613511298173326.pdopt +INFO:local_logger:Now training epoch 18. LR=0.000068 +INFO:master_logger:Now training epoch 18. 
LR=0.000068 +INFO:local_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8610 +INFO:master_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8573 +INFO:local_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8499 +INFO:local_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8529 +INFO:local_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8567 +INFO:local_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8551 +INFO:local_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8589 +INFO:local_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8601 +INFO:local_logger:Epoch[018/800], Step[0000/0626], Avg Loss: 0.8641 +INFO:local_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8555 +INFO:local_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8553 +INFO:local_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8547 +INFO:local_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8552 +INFO:local_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8543 +INFO:local_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8543 +INFO:master_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8547 +INFO:local_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8542 +INFO:local_logger:Epoch[018/800], Step[0100/0626], Avg Loss: 0.8543 +INFO:local_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8544 +INFO:local_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8543 +INFO:local_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8543 +INFO:local_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8541 +INFO:local_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8537 +INFO:master_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8541 +INFO:local_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8544 +INFO:local_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8539 +INFO:local_logger:Epoch[018/800], Step[0200/0626], Avg Loss: 0.8541 +INFO:local_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8534 +INFO:local_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8534 +INFO:local_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8536 +INFO:local_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8535 +INFO:local_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8540 +INFO:local_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8535 +INFO:master_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8537 +INFO:local_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8544 +INFO:local_logger:Epoch[018/800], Step[0300/0626], Avg Loss: 0.8538 +INFO:local_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8536 +INFO:local_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8534 +INFO:local_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8534 +INFO:local_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8532 +INFO:local_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8542 +INFO:local_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8532 +INFO:local_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8537 +INFO:master_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8535 +INFO:local_logger:Epoch[018/800], Step[0400/0626], Avg Loss: 0.8530 +INFO:local_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8533 +INFO:local_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8536 +INFO:local_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8529 +INFO:local_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8531 +INFO:local_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8534 +INFO:local_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8529 
+INFO:local_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8534 +INFO:master_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8533 +INFO:local_logger:Epoch[018/800], Step[0500/0626], Avg Loss: 0.8541 +INFO:local_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8533 +INFO:local_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8525 +INFO:local_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8531 +INFO:local_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8529 +INFO:local_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8528 +INFO:local_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8525 +INFO:local_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8536 +INFO:local_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8528 +INFO:master_logger:Epoch[018/800], Step[0600/0626], Avg Loss: 0.8529 +INFO:local_logger:----- Epoch[018/800], Train Loss: 0.8532, time: 859.28 +INFO:local_logger:Now training epoch 19. LR=0.000072 +INFO:local_logger:----- Epoch[018/800], Train Loss: 0.8524, time: 859.95 +INFO:local_logger:Now training epoch 19. LR=0.000072 +INFO:local_logger:----- Epoch[018/800], Train Loss: 0.8527, time: 855.56 +INFO:local_logger:----- Epoch[018/800], Train Loss: 0.8523, time: 859.27 +INFO:local_logger:----- Epoch[018/800], Train Loss: 0.8527, time: 859.94 +INFO:master_logger:----- Epoch[018/800], Train Loss: 0.8528, time: 855.56 +INFO:local_logger:Now training epoch 19. LR=0.000072 +INFO:local_logger:Now training epoch 19. LR=0.000072 +INFO:local_logger:----- Epoch[018/800], Train Loss: 0.8530, time: 859.29 +INFO:local_logger:Now training epoch 19. LR=0.000072 +INFO:local_logger:----- Epoch[018/800], Train Loss: 0.8528, time: 859.96 +INFO:local_logger:Now training epoch 19. LR=0.000072 +INFO:local_logger:----- Epoch[018/800], Train Loss: 0.8534, time: 859.27 +INFO:local_logger:Now training epoch 19. LR=0.000072 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-18-Loss-0.8526818839083388.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-18-Loss-0.8526818839083388.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-18-Loss-0.8526818839083388.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-18-Loss-0.8526818839083388.pdopt +INFO:local_logger:Now training epoch 19. LR=0.000072 +INFO:master_logger:Now training epoch 19. 
LR=0.000072 +INFO:local_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8466 +INFO:local_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8442 +INFO:local_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8470 +INFO:local_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8424 +INFO:local_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8531 +INFO:local_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8522 +INFO:master_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8474 +INFO:local_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8452 +INFO:local_logger:Epoch[019/800], Step[0000/0626], Avg Loss: 0.8487 +INFO:local_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8481 +INFO:local_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8485 +INFO:local_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8477 +INFO:local_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8475 +INFO:local_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8477 +INFO:local_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8473 +INFO:local_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8491 +INFO:master_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8480 +INFO:local_logger:Epoch[019/800], Step[0100/0626], Avg Loss: 0.8481 +INFO:local_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8476 +INFO:local_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8477 +INFO:local_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8476 +INFO:local_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8481 +INFO:local_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8480 +INFO:master_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8478 +INFO:local_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8483 +INFO:local_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8476 +INFO:local_logger:Epoch[019/800], Step[0200/0626], Avg Loss: 0.8478 +INFO:local_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8470 +INFO:local_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8465 +INFO:master_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8468 +INFO:local_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8469 +INFO:local_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8470 +INFO:local_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8470 +INFO:local_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8465 +INFO:local_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8468 +INFO:local_logger:Epoch[019/800], Step[0300/0626], Avg Loss: 0.8467 +INFO:local_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8458 +INFO:local_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8460 +INFO:local_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8460 +INFO:local_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8464 +INFO:local_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8461 +INFO:local_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8460 +INFO:local_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8465 +INFO:local_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8463 +INFO:master_logger:Epoch[019/800], Step[0400/0626], Avg Loss: 0.8461 +INFO:local_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8452 +INFO:local_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8454 +INFO:local_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8450 +INFO:local_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8455 +INFO:local_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8451 +INFO:local_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8452 
+INFO:local_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8452 +INFO:local_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8455 +INFO:master_logger:Epoch[019/800], Step[0500/0626], Avg Loss: 0.8452 +INFO:local_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8445 +INFO:local_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8442 +INFO:local_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8446 +INFO:local_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8445 +INFO:local_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8447 +INFO:local_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8445 +INFO:master_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8445 +INFO:local_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8443 +INFO:local_logger:Epoch[019/800], Step[0600/0626], Avg Loss: 0.8445 +INFO:local_logger:----- Epoch[019/800], Train Loss: 0.8446, time: 880.98 +INFO:master_logger:----- Epoch[019/800], Train Loss: 0.8443, time: 880.98 +INFO:local_logger:----- Epoch[019/800], Train Loss: 0.8443, time: 885.40 +INFO:local_logger:Now training epoch 20. LR=0.000075 +INFO:local_logger:----- Epoch[019/800], Train Loss: 0.8443, time: 885.43 +INFO:local_logger:Now training epoch 20. LR=0.000075 +INFO:local_logger:----- Epoch[019/800], Train Loss: 0.8444, time: 885.46 +INFO:local_logger:Now training epoch 20. LR=0.000075 +INFO:local_logger:----- Epoch[019/800], Train Loss: 0.8441, time: 885.49 +INFO:local_logger:Now training epoch 20. LR=0.000075 +INFO:local_logger:----- Epoch[019/800], Train Loss: 0.8441, time: 885.53 +INFO:local_logger:Now training epoch 20. LR=0.000075 +INFO:local_logger:----- Epoch[019/800], Train Loss: 0.8443, time: 885.54 +INFO:local_logger:Now training epoch 20. LR=0.000075 +INFO:local_logger:----- Epoch[019/800], Train Loss: 0.8446, time: 885.53 +INFO:local_logger:Now training epoch 20. LR=0.000075 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-19-Loss-0.8445631699389794.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-19-Loss-0.8445631699389794.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-19-Loss-0.8445631699389794.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-19-Loss-0.8445631699389794.pdopt +INFO:local_logger:Now training epoch 20. LR=0.000075 +INFO:master_logger:Now training epoch 20. 
LR=0.000075 +INFO:local_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8395 +INFO:local_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8579 +INFO:local_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8337 +INFO:master_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8394 +INFO:local_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8377 +INFO:local_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8425 +INFO:local_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8297 +INFO:local_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8371 +INFO:local_logger:Epoch[020/800], Step[0000/0626], Avg Loss: 0.8374 +INFO:local_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8389 +INFO:local_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8399 +INFO:local_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8385 +INFO:local_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8402 +INFO:local_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8398 +INFO:local_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8399 +INFO:master_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8396 +INFO:local_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8398 +INFO:local_logger:Epoch[020/800], Step[0100/0626], Avg Loss: 0.8400 +INFO:local_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8402 +INFO:local_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8410 +INFO:local_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8400 +INFO:local_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8406 +INFO:local_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8409 +INFO:master_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8408 +INFO:local_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8403 +INFO:local_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8416 +INFO:local_logger:Epoch[020/800], Step[0200/0626], Avg Loss: 0.8415 +INFO:local_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8399 +INFO:local_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8411 +INFO:local_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8404 +INFO:local_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8406 +INFO:local_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8406 +INFO:master_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8406 +INFO:local_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8403 +INFO:local_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8415 +INFO:local_logger:Epoch[020/800], Step[0300/0626], Avg Loss: 0.8403 +INFO:local_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8397 +INFO:local_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8397 +INFO:local_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8393 +INFO:local_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8400 +INFO:local_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8395 +INFO:local_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8400 +INFO:local_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8397 +INFO:master_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8398 +INFO:local_logger:Epoch[020/800], Step[0400/0626], Avg Loss: 0.8404 +INFO:local_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8384 +INFO:local_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8386 +INFO:local_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8384 +INFO:local_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8388 +INFO:local_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8383 +INFO:local_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8387 
+INFO:local_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8384 +INFO:master_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8386 +INFO:local_logger:Epoch[020/800], Step[0500/0626], Avg Loss: 0.8391 +INFO:local_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8387 +INFO:local_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8378 +INFO:local_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8380 +INFO:local_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8382 +INFO:local_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8383 +INFO:local_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8381 +INFO:master_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8382 +INFO:local_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8380 +INFO:local_logger:Epoch[020/800], Step[0600/0626], Avg Loss: 0.8383 +INFO:local_logger:----- Epoch[020/800], Train Loss: 0.8379, time: 856.21 +INFO:local_logger:Now training epoch 21. LR=0.000079 +INFO:local_logger:----- Epoch[020/800], Train Loss: 0.8378, time: 856.64 +INFO:local_logger:----- Epoch[020/800], Train Loss: 0.8381, time: 856.54 +INFO:local_logger:Now training epoch 21. LR=0.000079 +INFO:local_logger:Now training epoch 21. LR=0.000079 +INFO:local_logger:----- Epoch[020/800], Train Loss: 0.8378, time: 856.55 +INFO:local_logger:Now training epoch 21. LR=0.000079 +INFO:local_logger:----- Epoch[020/800], Train Loss: 0.8378, time: 856.54 +INFO:local_logger:Now training epoch 21. LR=0.000079 +INFO:local_logger:----- Epoch[020/800], Train Loss: 0.8385, time: 856.62 +INFO:local_logger:----- Epoch[020/800], Train Loss: 0.8379, time: 856.58 +INFO:local_logger:Now training epoch 21. LR=0.000079 +INFO:local_logger:Now training epoch 21. LR=0.000079 +INFO:local_logger:----- Epoch[020/800], Train Loss: 0.8377, time: 853.74 +INFO:master_logger:----- Epoch[020/800], Train Loss: 0.8379, time: 853.74 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-20-Loss-0.837697342612629.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-20-Loss-0.837697342612629.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-20-Loss-0.837697342612629.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-20-Loss-0.837697342612629.pdopt +INFO:local_logger:Now training epoch 21. LR=0.000079 +INFO:master_logger:Now training epoch 21. 
LR=0.000079 +INFO:local_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8247 +INFO:local_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8468 +INFO:master_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8311 +INFO:local_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8307 +INFO:local_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8301 +INFO:local_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8220 +INFO:local_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8352 +INFO:local_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8284 +INFO:local_logger:Epoch[021/800], Step[0000/0626], Avg Loss: 0.8313 +INFO:local_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8345 +INFO:local_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8327 +INFO:local_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8336 +INFO:local_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8344 +INFO:local_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8331 +INFO:local_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8343 +INFO:master_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8338 +INFO:local_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8339 +INFO:local_logger:Epoch[021/800], Step[0100/0626], Avg Loss: 0.8339 +INFO:local_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8343 +INFO:local_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8340 +INFO:local_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8344 +INFO:local_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8339 +INFO:master_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8338 +INFO:local_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8333 +INFO:local_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8332 +INFO:local_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8337 +INFO:local_logger:Epoch[021/800], Step[0200/0626], Avg Loss: 0.8335 +INFO:local_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8328 +INFO:local_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8336 +INFO:local_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8330 +INFO:local_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8331 +INFO:local_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8331 +INFO:local_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8337 +INFO:local_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8328 +INFO:local_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8336 +INFO:master_logger:Epoch[021/800], Step[0300/0626], Avg Loss: 0.8332 +INFO:local_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8324 +INFO:local_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8322 +INFO:local_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8329 +INFO:local_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8326 +INFO:local_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8333 +INFO:local_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8324 +INFO:local_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8332 +INFO:master_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8327 +INFO:local_logger:Epoch[021/800], Step[0400/0626], Avg Loss: 0.8328 +INFO:local_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8323 +INFO:local_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8322 +INFO:master_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8321 +INFO:local_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8325 +INFO:local_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8319 +INFO:local_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8317 
+INFO:local_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8323 +INFO:local_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8319 +INFO:local_logger:Epoch[021/800], Step[0500/0626], Avg Loss: 0.8320 +INFO:local_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8319 +INFO:local_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8316 +INFO:local_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8318 +INFO:local_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8314 +INFO:local_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8317 +INFO:local_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8314 +INFO:local_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8315 +INFO:master_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8316 +INFO:local_logger:Epoch[021/800], Step[0600/0626], Avg Loss: 0.8317 +INFO:local_logger:----- Epoch[021/800], Train Loss: 0.8314, time: 903.73 +INFO:master_logger:----- Epoch[021/800], Train Loss: 0.8313, time: 903.73 +INFO:local_logger:----- Epoch[021/800], Train Loss: 0.8311, time: 908.09 +INFO:local_logger:Now training epoch 22. LR=0.000083 +INFO:local_logger:----- Epoch[021/800], Train Loss: 0.8312, time: 908.50 +INFO:local_logger:Now training epoch 22. LR=0.000083 +INFO:local_logger:----- Epoch[021/800], Train Loss: 0.8312, time: 908.98 +INFO:local_logger:Now training epoch 22. LR=0.000083 +INFO:local_logger:----- Epoch[021/800], Train Loss: 0.8314, time: 908.52 +INFO:local_logger:Now training epoch 22. LR=0.000083 +INFO:local_logger:----- Epoch[021/800], Train Loss: 0.8314, time: 908.52 +INFO:local_logger:Now training epoch 22. LR=0.000083 +INFO:local_logger:----- Epoch[021/800], Train Loss: 0.8311, time: 908.52 +INFO:local_logger:Now training epoch 22. LR=0.000083 +INFO:local_logger:----- Epoch[021/800], Train Loss: 0.8317, time: 908.52 +INFO:local_logger:Now training epoch 22. LR=0.000083 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-21-Loss-0.8314437446567381.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-21-Loss-0.8314437446567381.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-21-Loss-0.8314437446567381.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-21-Loss-0.8314437446567381.pdopt +INFO:local_logger:Now training epoch 22. LR=0.000083 +INFO:master_logger:Now training epoch 22. 
LR=0.000083 +INFO:local_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8238 +INFO:local_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8287 +INFO:master_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8236 +INFO:local_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8314 +INFO:local_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8120 +INFO:local_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8206 +INFO:local_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8245 +INFO:local_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8214 +INFO:local_logger:Epoch[022/800], Step[0000/0626], Avg Loss: 0.8266 +INFO:local_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8256 +INFO:local_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8255 +INFO:local_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8256 +INFO:local_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8256 +INFO:local_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8274 +INFO:local_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8262 +INFO:local_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8270 +INFO:master_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8262 +INFO:local_logger:Epoch[022/800], Step[0100/0626], Avg Loss: 0.8269 +INFO:local_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8246 +INFO:local_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8258 +INFO:local_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8262 +INFO:local_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8250 +INFO:master_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8252 +INFO:local_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8250 +INFO:local_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8256 +INFO:local_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8245 +INFO:local_logger:Epoch[022/800], Step[0200/0626], Avg Loss: 0.8253 +INFO:local_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8236 +INFO:local_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8237 +INFO:local_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8235 +INFO:local_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8239 +INFO:local_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8245 +INFO:local_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8242 +INFO:local_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8232 +INFO:local_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8245 +INFO:master_logger:Epoch[022/800], Step[0300/0626], Avg Loss: 0.8239 +INFO:local_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8234 +INFO:local_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8226 +INFO:local_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8230 +INFO:local_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8239 +INFO:local_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8240 +INFO:local_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8231 +INFO:local_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8231 +INFO:local_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8235 +INFO:master_logger:Epoch[022/800], Step[0400/0626], Avg Loss: 0.8233 +INFO:local_logger:Epoch[022/800], Step[0500/0626], Avg Loss: 0.8231 +INFO:local_logger:Epoch[022/800], Step[0500/0626], Avg Loss: 0.8231 +INFO:local_logger:Epoch[022/800], Step[0500/0626], Avg Loss: 0.8222 +INFO:local_logger:Epoch[022/800], Step[0500/0626], Avg Loss: 0.8226 +INFO:master_logger:Epoch[022/800], Step[0500/0626], Avg Loss: 0.8225 +INFO:local_logger:Epoch[022/800], Step[0500/0626], Avg Loss: 0.8222 
+[... MAE pre-training log (image_classification/MAE/nohup.out), epochs 022-034 of 800: 8 workers, 626 steps per epoch; every 100 steps each local_logger prints its running "Avg Loss" and master_logger prints the average across workers ...]
+[... master_logger Train Loss per epoch -- 022: 0.8221, 023: 0.8150, 024: 0.8088, 025: 0.8032, 026: 0.7962, 027: 0.7893, 028: 0.7835, 029: 0.7776, 030: 0.7721, 031: 0.7665, 032: 0.7612, 033: 0.7564; epoch 034 still in progress, Avg Loss ~0.752 at Step[0500/0626] ...]
+[... the LR printed at each epoch boundary rises from 0.000087 (epoch 023) to 0.000128 (epoch 034); each epoch takes roughly 850-890 s and ends with model/optimizer checkpoints saved to ./output/train-20211219-17-07-40/MAE-Epoch-<N>-Loss-<train loss>.pdparams / .pdopt ...]
+INFO:local_logger:Epoch[034/800], Step[0500/0626], Avg Loss: 0.7522 +INFO:local_logger:Epoch[034/800], Step[0500/0626], Avg Loss: 0.7532 +INFO:master_logger:Epoch[034/800], Step[0500/0626], Avg Loss: 0.7524 +INFO:local_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7520 +INFO:local_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7521 +INFO:local_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7516 +INFO:local_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7522 +INFO:local_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7516 +INFO:master_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7520 +INFO:local_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7521 +INFO:local_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7526 +INFO:local_logger:Epoch[034/800], Step[0600/0626], Avg Loss: 0.7516 +INFO:local_logger:----- Epoch[034/800], Train Loss: 0.7519, time: 885.18 +INFO:local_logger:Now training epoch 35. LR=0.000131 +INFO:local_logger:----- Epoch[034/800], Train Loss: 0.7520, time: 885.17 +INFO:local_logger:Now training epoch 35. LR=0.000131 +INFO:local_logger:----- Epoch[034/800], Train Loss: 0.7519, time: 885.32 +INFO:local_logger:Now training epoch 35. LR=0.000131 +INFO:local_logger:----- Epoch[034/800], Train Loss: 0.7516, time: 882.34 +INFO:master_logger:----- Epoch[034/800], Train Loss: 0.7519, time: 882.34 +INFO:local_logger:----- Epoch[034/800], Train Loss: 0.7522, time: 885.50 +INFO:local_logger:Now training epoch 35. LR=0.000131 +INFO:local_logger:----- Epoch[034/800], Train Loss: 0.7516, time: 885.48 +INFO:local_logger:Now training epoch 35. LR=0.000131 +INFO:local_logger:----- Epoch[034/800], Train Loss: 0.7516, time: 885.48 +INFO:local_logger:Now training epoch 35. LR=0.000131 +INFO:local_logger:----- Epoch[034/800], Train Loss: 0.7524, time: 885.48 +INFO:local_logger:Now training epoch 35. LR=0.000131 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-34-Loss-0.7516406552539668.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-34-Loss-0.7516406552539668.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-34-Loss-0.7516406552539668.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-34-Loss-0.7516406552539668.pdopt +INFO:local_logger:Now training epoch 35. LR=0.000131 +INFO:master_logger:Now training epoch 35. 
LR=0.000131 +INFO:local_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7487 +INFO:local_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7414 +INFO:local_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7520 +INFO:master_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7511 +INFO:local_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7453 +INFO:local_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7505 +INFO:local_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7633 +INFO:local_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7544 +INFO:local_logger:Epoch[035/800], Step[0000/0626], Avg Loss: 0.7535 +INFO:local_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7500 +INFO:local_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7504 +INFO:local_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7496 +INFO:local_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7499 +INFO:local_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7493 +INFO:local_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7492 +INFO:local_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7490 +INFO:master_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7494 +INFO:local_logger:Epoch[035/800], Step[0100/0626], Avg Loss: 0.7481 +INFO:local_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7496 +INFO:local_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7496 +INFO:local_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7495 +INFO:local_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7485 +INFO:local_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7487 +INFO:master_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7493 +INFO:local_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7490 +INFO:local_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7498 +INFO:local_logger:Epoch[035/800], Step[0200/0626], Avg Loss: 0.7493 +INFO:local_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7484 +INFO:local_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7486 +INFO:local_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7491 +INFO:local_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7494 +INFO:local_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7494 +INFO:master_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7490 +INFO:local_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7491 +INFO:local_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7488 +INFO:local_logger:Epoch[035/800], Step[0300/0626], Avg Loss: 0.7495 +INFO:local_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7488 +INFO:local_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7490 +INFO:local_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7483 +INFO:local_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7490 +INFO:local_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7484 +INFO:local_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7489 +INFO:local_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7480 +INFO:master_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7486 +INFO:local_logger:Epoch[035/800], Step[0400/0626], Avg Loss: 0.7484 +INFO:local_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7485 +INFO:local_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7488 +INFO:local_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7483 +INFO:local_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7482 +INFO:local_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7479 +INFO:local_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7482 
+INFO:local_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7491 +INFO:master_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7484 +INFO:local_logger:Epoch[035/800], Step[0500/0626], Avg Loss: 0.7484 +INFO:local_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7482 +INFO:local_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7485 +INFO:local_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7478 +INFO:local_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7480 +INFO:local_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7480 +INFO:local_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7481 +INFO:local_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7478 +INFO:local_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7485 +INFO:master_logger:Epoch[035/800], Step[0600/0626], Avg Loss: 0.7481 +INFO:local_logger:----- Epoch[035/800], Train Loss: 0.7484, time: 858.24 +INFO:local_logger:Now training epoch 36. LR=0.000135 +INFO:local_logger:----- Epoch[035/800], Train Loss: 0.7479, time: 859.58 +INFO:local_logger:Now training epoch 36. LR=0.000135 +INFO:local_logger:----- Epoch[035/800], Train Loss: 0.7481, time: 859.55 +INFO:local_logger:Now training epoch 36. LR=0.000135 +INFO:local_logger:----- Epoch[035/800], Train Loss: 0.7480, time: 859.83 +INFO:local_logger:Now training epoch 36. LR=0.000135 +INFO:local_logger:----- Epoch[035/800], Train Loss: 0.7480, time: 859.45 +INFO:local_logger:Now training epoch 36. LR=0.000135 +INFO:local_logger:----- Epoch[035/800], Train Loss: 0.7484, time: 859.45 +INFO:local_logger:Now training epoch 36. LR=0.000135 +INFO:local_logger:----- Epoch[035/800], Train Loss: 0.7477, time: 855.75 +INFO:master_logger:----- Epoch[035/800], Train Loss: 0.7480, time: 855.75 +INFO:local_logger:----- Epoch[035/800], Train Loss: 0.7478, time: 859.47 +INFO:local_logger:Now training epoch 36. LR=0.000135 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-35-Loss-0.7477136553201034.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-35-Loss-0.7477136553201034.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-35-Loss-0.7477136553201034.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-35-Loss-0.7477136553201034.pdopt +INFO:local_logger:Now training epoch 36. LR=0.000135 +INFO:master_logger:Now training epoch 36. 
LR=0.000135 +INFO:local_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7546 +INFO:local_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7470 +INFO:local_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7469 +INFO:master_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7497 +INFO:local_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7536 +INFO:local_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7544 +INFO:local_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7537 +INFO:local_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7427 +INFO:local_logger:Epoch[036/800], Step[0000/0626], Avg Loss: 0.7446 +INFO:local_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7474 +INFO:local_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7461 +INFO:local_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7458 +INFO:local_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7467 +INFO:local_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7456 +INFO:local_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7469 +INFO:local_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7457 +INFO:master_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7463 +INFO:local_logger:Epoch[036/800], Step[0100/0626], Avg Loss: 0.7461 +INFO:local_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7459 +INFO:local_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7473 +INFO:local_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7459 +INFO:local_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7461 +INFO:local_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7460 +INFO:local_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7450 +INFO:master_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7460 +INFO:local_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7464 +INFO:local_logger:Epoch[036/800], Step[0200/0626], Avg Loss: 0.7453 +INFO:local_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7464 +INFO:local_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7457 +INFO:local_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7456 +INFO:local_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7461 +INFO:local_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7456 +INFO:master_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7456 +INFO:local_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7451 +INFO:local_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7456 +INFO:local_logger:Epoch[036/800], Step[0300/0626], Avg Loss: 0.7450 +INFO:local_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7453 +INFO:local_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7449 +INFO:local_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7457 +INFO:local_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7456 +INFO:local_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7456 +INFO:local_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7450 +INFO:master_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7453 +INFO:local_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7452 +INFO:local_logger:Epoch[036/800], Step[0400/0626], Avg Loss: 0.7449 +INFO:local_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7452 +INFO:local_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7450 +INFO:local_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7453 +INFO:local_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7449 +INFO:local_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7449 +INFO:local_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7452 
+INFO:local_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7448 +INFO:local_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7451 +INFO:master_logger:Epoch[036/800], Step[0500/0626], Avg Loss: 0.7451 +INFO:local_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7446 +INFO:local_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7446 +INFO:local_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7446 +INFO:local_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7449 +INFO:local_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7446 +INFO:local_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7444 +INFO:local_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7450 +INFO:local_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7450 +INFO:master_logger:Epoch[036/800], Step[0600/0626], Avg Loss: 0.7447 +INFO:local_logger:----- Epoch[036/800], Train Loss: 0.7449, time: 890.54 +INFO:local_logger:Now training epoch 37. LR=0.000139 +INFO:local_logger:----- Epoch[036/800], Train Loss: 0.7445, time: 891.48 +INFO:local_logger:----- Epoch[036/800], Train Loss: 0.7445, time: 891.09 +INFO:local_logger:Now training epoch 37. LR=0.000139 +INFO:local_logger:Now training epoch 37. LR=0.000139 +INFO:local_logger:----- Epoch[036/800], Train Loss: 0.7446, time: 887.41 +INFO:master_logger:----- Epoch[036/800], Train Loss: 0.7447, time: 887.41 +INFO:local_logger:----- Epoch[036/800], Train Loss: 0.7448, time: 891.48 +INFO:local_logger:Now training epoch 37. LR=0.000139 +INFO:local_logger:----- Epoch[036/800], Train Loss: 0.7449, time: 891.12 +INFO:local_logger:Now training epoch 37. LR=0.000139 +INFO:local_logger:----- Epoch[036/800], Train Loss: 0.7446, time: 892.33 +INFO:local_logger:Now training epoch 37. LR=0.000139 +INFO:local_logger:----- Epoch[036/800], Train Loss: 0.7444, time: 891.12 +INFO:local_logger:Now training epoch 37. LR=0.000139 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-36-Loss-0.7446498155174043.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-36-Loss-0.7446498155174043.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-36-Loss-0.7446498155174043.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-36-Loss-0.7446498155174043.pdopt +INFO:local_logger:Now training epoch 37. LR=0.000139 +INFO:master_logger:Now training epoch 37. 
LR=0.000139 +INFO:local_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7435 +INFO:local_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7441 +INFO:local_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7454 +INFO:local_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7397 +INFO:master_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7456 +INFO:local_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7399 +INFO:local_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7476 +INFO:local_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7504 +INFO:local_logger:Epoch[037/800], Step[0000/0626], Avg Loss: 0.7542 +INFO:local_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7433 +INFO:local_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7414 +INFO:local_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7429 +INFO:local_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7416 +INFO:local_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7421 +INFO:master_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7426 +INFO:local_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7423 +INFO:local_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7431 +INFO:local_logger:Epoch[037/800], Step[0100/0626], Avg Loss: 0.7438 +INFO:local_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7429 +INFO:local_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7434 +INFO:local_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7425 +INFO:local_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7427 +INFO:local_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7427 +INFO:local_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7428 +INFO:local_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7429 +INFO:local_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7424 +INFO:master_logger:Epoch[037/800], Step[0200/0626], Avg Loss: 0.7428 +INFO:local_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7422 +INFO:local_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7425 +INFO:local_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7417 +INFO:local_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7424 +INFO:local_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7423 +INFO:local_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7424 +INFO:local_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7427 +INFO:local_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7425 +INFO:master_logger:Epoch[037/800], Step[0300/0626], Avg Loss: 0.7423 +INFO:local_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7423 +INFO:local_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7419 +INFO:local_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7416 +INFO:local_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7420 +INFO:local_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7422 +INFO:local_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7414 +INFO:local_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7420 +INFO:local_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7419 +INFO:master_logger:Epoch[037/800], Step[0400/0626], Avg Loss: 0.7419 +INFO:local_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7414 +INFO:local_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7414 +INFO:local_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7416 +INFO:local_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7419 +INFO:local_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7417 +INFO:local_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7419 
+INFO:local_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7415 +INFO:local_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7418 +INFO:master_logger:Epoch[037/800], Step[0500/0626], Avg Loss: 0.7417 +INFO:local_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7410 +INFO:local_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7416 +INFO:local_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7416 +INFO:local_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7417 +INFO:local_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7413 +INFO:local_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7413 +INFO:local_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7412 +INFO:master_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7414 +INFO:local_logger:Epoch[037/800], Step[0600/0626], Avg Loss: 0.7414 +INFO:local_logger:----- Epoch[037/800], Train Loss: 0.7415, time: 861.30 +INFO:local_logger:Now training epoch 38. LR=0.000143 +INFO:local_logger:----- Epoch[037/800], Train Loss: 0.7415, time: 861.30 +INFO:local_logger:Now training epoch 38. LR=0.000143 +INFO:local_logger:----- Epoch[037/800], Train Loss: 0.7414, time: 861.29 +INFO:local_logger:Now training epoch 38. LR=0.000143 +INFO:local_logger:----- Epoch[037/800], Train Loss: 0.7412, time: 861.53 +INFO:local_logger:Now training epoch 38. LR=0.000143 +INFO:local_logger:----- Epoch[037/800], Train Loss: 0.7412, time: 862.22 +INFO:local_logger:Now training epoch 38. LR=0.000143 +INFO:local_logger:----- Epoch[037/800], Train Loss: 0.7411, time: 861.67 +INFO:local_logger:Now training epoch 38. LR=0.000143 +INFO:local_logger:----- Epoch[037/800], Train Loss: 0.7410, time: 861.66 +INFO:local_logger:Now training epoch 38. LR=0.000143 +INFO:local_logger:----- Epoch[037/800], Train Loss: 0.7415, time: 858.01 +INFO:master_logger:----- Epoch[037/800], Train Loss: 0.7413, time: 858.01 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-37-Loss-0.7415279359559235.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-37-Loss-0.7415279359559235.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-37-Loss-0.7415279359559235.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-37-Loss-0.7415279359559235.pdopt +INFO:local_logger:Now training epoch 38. LR=0.000143 +INFO:master_logger:Now training epoch 38. 
LR=0.000143 +INFO:local_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7326 +INFO:local_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7470 +INFO:master_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7394 +INFO:local_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7295 +INFO:local_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7452 +INFO:local_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7429 +INFO:local_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7354 +INFO:local_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7382 +INFO:local_logger:Epoch[038/800], Step[0000/0626], Avg Loss: 0.7443 +INFO:local_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7383 +INFO:local_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7398 +INFO:local_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7393 +INFO:local_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7389 +INFO:local_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7386 +INFO:local_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7387 +INFO:master_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7392 +INFO:local_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7411 +INFO:local_logger:Epoch[038/800], Step[0100/0626], Avg Loss: 0.7392 +INFO:local_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7388 +INFO:local_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7388 +INFO:local_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7386 +INFO:local_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7386 +INFO:local_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7399 +INFO:local_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7386 +INFO:local_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7391 +INFO:master_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7389 +INFO:local_logger:Epoch[038/800], Step[0200/0626], Avg Loss: 0.7389 +INFO:local_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7388 +INFO:local_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7390 +INFO:local_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7388 +INFO:local_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7386 +INFO:local_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7388 +INFO:local_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7384 +INFO:local_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7385 +INFO:master_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7387 +INFO:local_logger:Epoch[038/800], Step[0300/0626], Avg Loss: 0.7389 +INFO:local_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7382 +INFO:local_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7388 +INFO:local_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7390 +INFO:local_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7387 +INFO:local_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7388 +INFO:master_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7387 +INFO:local_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7384 +INFO:local_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7386 +INFO:local_logger:Epoch[038/800], Step[0400/0626], Avg Loss: 0.7388 +INFO:local_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7383 +INFO:local_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7385 +INFO:local_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7385 +INFO:local_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7385 +INFO:local_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7390 +INFO:local_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7377 
+INFO:local_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7385 +INFO:master_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7385 +INFO:local_logger:Epoch[038/800], Step[0500/0626], Avg Loss: 0.7387 +INFO:local_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7378 +INFO:local_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7382 +INFO:local_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7389 +INFO:local_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7381 +INFO:local_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7381 +INFO:local_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7381 +INFO:master_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7383 +INFO:local_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7381 +INFO:local_logger:Epoch[038/800], Step[0600/0626], Avg Loss: 0.7386 +INFO:local_logger:----- Epoch[038/800], Train Loss: 0.7381, time: 895.61 +INFO:local_logger:Now training epoch 39. LR=0.000146 +INFO:local_logger:----- Epoch[038/800], Train Loss: 0.7378, time: 895.58 +INFO:local_logger:Now training epoch 39. LR=0.000146 +INFO:local_logger:----- Epoch[038/800], Train Loss: 0.7380, time: 895.99 +INFO:local_logger:Now training epoch 39. LR=0.000146 +INFO:local_logger:----- Epoch[038/800], Train Loss: 0.7381, time: 896.21 +INFO:local_logger:Now training epoch 39. LR=0.000146 +INFO:local_logger:----- Epoch[038/800], Train Loss: 0.7381, time: 892.19 +INFO:master_logger:----- Epoch[038/800], Train Loss: 0.7382, time: 892.19 +INFO:local_logger:----- Epoch[038/800], Train Loss: 0.7385, time: 895.95 +INFO:local_logger:Now training epoch 39. LR=0.000146 +INFO:local_logger:----- Epoch[038/800], Train Loss: 0.7388, time: 896.06 +INFO:local_logger:Now training epoch 39. LR=0.000146 +INFO:local_logger:----- Epoch[038/800], Train Loss: 0.7381, time: 895.94 +INFO:local_logger:Now training epoch 39. LR=0.000146 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-38-Loss-0.7380608445157859.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-38-Loss-0.7380608445157859.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-38-Loss-0.7380608445157859.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-38-Loss-0.7380608445157859.pdopt +INFO:local_logger:Now training epoch 39. LR=0.000146 +INFO:master_logger:Now training epoch 39. 
LR=0.000146 +INFO:local_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7392 +INFO:master_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7382 +INFO:local_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7373 +INFO:local_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7385 +INFO:local_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7375 +INFO:local_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7414 +INFO:local_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7378 +INFO:local_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7312 +INFO:local_logger:Epoch[039/800], Step[0000/0626], Avg Loss: 0.7426 +INFO:local_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7358 +INFO:local_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7363 +INFO:local_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7362 +INFO:local_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7361 +INFO:local_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7359 +INFO:local_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7369 +INFO:local_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7365 +INFO:master_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7363 +INFO:local_logger:Epoch[039/800], Step[0100/0626], Avg Loss: 0.7366 +INFO:local_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7366 +INFO:local_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7359 +INFO:local_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7371 +INFO:local_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7356 +INFO:local_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7361 +INFO:master_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7362 +INFO:local_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7363 +INFO:local_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7364 +INFO:local_logger:Epoch[039/800], Step[0200/0626], Avg Loss: 0.7360 +INFO:local_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7354 +INFO:local_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7356 +INFO:local_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7368 +INFO:local_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7364 +INFO:local_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7363 +INFO:master_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7360 +INFO:local_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7357 +INFO:local_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7359 +INFO:local_logger:Epoch[039/800], Step[0300/0626], Avg Loss: 0.7360 +INFO:local_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7359 +INFO:local_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7357 +INFO:local_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7355 +INFO:local_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7359 +INFO:local_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7359 +INFO:master_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7358 +INFO:local_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7365 +INFO:local_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7355 +INFO:local_logger:Epoch[039/800], Step[0400/0626], Avg Loss: 0.7356 +INFO:local_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7352 +INFO:local_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7352 +INFO:local_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7356 +INFO:local_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7355 +INFO:local_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7358 +INFO:local_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7352 
+INFO:local_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7356 +INFO:master_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7354 +INFO:local_logger:Epoch[039/800], Step[0500/0626], Avg Loss: 0.7353 +INFO:local_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7351 +INFO:local_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7351 +INFO:local_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7354 +INFO:local_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7353 +INFO:local_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7351 +INFO:local_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7356 +INFO:local_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7356 +INFO:master_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7354 +INFO:local_logger:Epoch[039/800], Step[0600/0626], Avg Loss: 0.7355 +INFO:local_logger:----- Epoch[039/800], Train Loss: 0.7351, time: 865.25 +INFO:local_logger:----- Epoch[039/800], Train Loss: 0.7350, time: 864.90 +INFO:local_logger:Now training epoch 40. LR=0.000150 +INFO:local_logger:Now training epoch 40. LR=0.000150 +INFO:local_logger:----- Epoch[039/800], Train Loss: 0.7353, time: 860.87 +INFO:master_logger:----- Epoch[039/800], Train Loss: 0.7353, time: 860.87 +INFO:local_logger:----- Epoch[039/800], Train Loss: 0.7352, time: 864.70 +INFO:local_logger:Now training epoch 40. LR=0.000150 +INFO:local_logger:----- Epoch[039/800], Train Loss: 0.7355, time: 864.66 +INFO:local_logger:Now training epoch 40. LR=0.000150 +INFO:local_logger:----- Epoch[039/800], Train Loss: 0.7355, time: 864.70 +INFO:local_logger:Now training epoch 40. LR=0.000150 +INFO:local_logger:----- Epoch[039/800], Train Loss: 0.7356, time: 864.70 +INFO:local_logger:Now training epoch 40. LR=0.000150 +INFO:local_logger:----- Epoch[039/800], Train Loss: 0.7351, time: 865.02 +INFO:local_logger:Now training epoch 40. LR=0.000150 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-39-Loss-0.7353187804344304.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-39-Loss-0.7353187804344304.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-39-Loss-0.7353187804344304.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-39-Loss-0.7353187804344304.pdopt +INFO:local_logger:Now training epoch 40. LR=0.000150 +INFO:master_logger:Now training epoch 40. 
LR=0.000150 +INFO:local_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7343 +INFO:local_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7310 +INFO:local_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7444 +INFO:local_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7305 +INFO:master_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7355 +INFO:local_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7357 +INFO:local_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7310 +INFO:local_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7439 +INFO:local_logger:Epoch[040/800], Step[0000/0626], Avg Loss: 0.7330 +INFO:local_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7348 +INFO:local_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7344 +INFO:local_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7346 +INFO:local_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7348 +INFO:master_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7341 +INFO:local_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7345 +INFO:local_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7336 +INFO:local_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7331 +INFO:local_logger:Epoch[040/800], Step[0100/0626], Avg Loss: 0.7333 +INFO:local_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7340 +INFO:local_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7343 +INFO:local_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7335 +INFO:local_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7340 +INFO:local_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7335 +INFO:local_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7334 +INFO:local_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7326 +INFO:master_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7336 +INFO:local_logger:Epoch[040/800], Step[0200/0626], Avg Loss: 0.7339 +INFO:local_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7338 +INFO:local_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7336 +INFO:local_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7331 +INFO:local_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7338 +INFO:local_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7332 +INFO:master_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7334 +INFO:local_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7329 +INFO:local_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7334 +INFO:local_logger:Epoch[040/800], Step[0300/0626], Avg Loss: 0.7338 +INFO:local_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7335 +INFO:local_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7335 +INFO:local_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7336 +INFO:local_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7334 +INFO:local_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7328 +INFO:local_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7332 +INFO:master_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7333 +INFO:local_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7332 +INFO:local_logger:Epoch[040/800], Step[0400/0626], Avg Loss: 0.7330 +INFO:local_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7330 +INFO:local_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7323 +INFO:local_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7333 +INFO:local_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7333 +INFO:local_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7329 +INFO:local_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7334 
+INFO:local_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7330 +INFO:local_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7329 +INFO:master_logger:Epoch[040/800], Step[0500/0626], Avg Loss: 0.7330 +INFO:local_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7329 +INFO:local_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7329 +INFO:local_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7328 +INFO:local_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7329 +INFO:local_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7328 +INFO:local_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7329 +INFO:local_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7322 +INFO:master_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7328 +INFO:local_logger:Epoch[040/800], Step[0600/0626], Avg Loss: 0.7328 +INFO:local_logger:----- Epoch[040/800], Train Loss: 0.7328, time: 895.01 +INFO:local_logger:Now training epoch 41. LR=0.000150 +INFO:local_logger:----- Epoch[040/800], Train Loss: 0.7328, time: 890.99 +INFO:local_logger:----- Epoch[040/800], Train Loss: 0.7328, time: 895.12 +INFO:master_logger:----- Epoch[040/800], Train Loss: 0.7327, time: 890.99 +INFO:local_logger:Now training epoch 41. LR=0.000150 +INFO:local_logger:----- Epoch[040/800], Train Loss: 0.7328, time: 895.13 +INFO:local_logger:Now training epoch 41. LR=0.000150 +INFO:local_logger:----- Epoch[040/800], Train Loss: 0.7328, time: 894.99 +INFO:local_logger:Now training epoch 41. LR=0.000150 +INFO:local_logger:----- Epoch[040/800], Train Loss: 0.7328, time: 894.98 +INFO:local_logger:Now training epoch 41. LR=0.000150 +INFO:local_logger:----- Epoch[040/800], Train Loss: 0.7321, time: 894.99 +INFO:local_logger:Now training epoch 41. LR=0.000150 +INFO:local_logger:----- Epoch[040/800], Train Loss: 0.7327, time: 895.07 +INFO:local_logger:Now training epoch 41. LR=0.000150 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-40-Loss-0.7327702230552797.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-40-Loss-0.7327702230552797.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-40-Loss-0.7327702230552797.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-40-Loss-0.7327702230552797.pdopt +INFO:local_logger:Now training epoch 41. LR=0.000150 +INFO:master_logger:Now training epoch 41. 
LR=0.000150 +INFO:local_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7292 +INFO:master_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7311 +INFO:local_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7233 +INFO:local_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7219 +INFO:local_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7395 +INFO:local_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7295 +INFO:local_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7279 +INFO:local_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7431 +INFO:local_logger:Epoch[041/800], Step[0000/0626], Avg Loss: 0.7341 +INFO:local_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7312 +INFO:local_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7315 +INFO:local_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7304 +INFO:local_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7317 +INFO:local_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7307 +INFO:master_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7311 +INFO:local_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7308 +INFO:local_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7302 +INFO:local_logger:Epoch[041/800], Step[0100/0626], Avg Loss: 0.7321 +INFO:local_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7308 +INFO:local_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7315 +INFO:local_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7308 +INFO:local_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7317 +INFO:local_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7303 +INFO:local_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7313 +INFO:master_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7310 +INFO:local_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7309 +INFO:local_logger:Epoch[041/800], Step[0200/0626], Avg Loss: 0.7308 +INFO:local_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7310 +INFO:local_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7307 +INFO:local_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7304 +INFO:local_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7307 +INFO:master_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7308 +INFO:local_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7310 +INFO:local_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7308 +INFO:local_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7312 +INFO:local_logger:Epoch[041/800], Step[0300/0626], Avg Loss: 0.7304 +INFO:local_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7312 +INFO:local_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7309 +INFO:local_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7301 +INFO:local_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7305 +INFO:local_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7305 +INFO:master_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7306 +INFO:local_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7307 +INFO:local_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7303 +INFO:local_logger:Epoch[041/800], Step[0400/0626], Avg Loss: 0.7307 +INFO:local_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7303 +INFO:local_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7302 +INFO:local_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7306 +INFO:local_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7305 +INFO:local_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7300 +INFO:local_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7306 
+INFO:local_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7304 +INFO:local_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7304 +INFO:master_logger:Epoch[041/800], Step[0500/0626], Avg Loss: 0.7304 +INFO:local_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7303 +INFO:local_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7304 +INFO:local_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7301 +INFO:local_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7299 +INFO:local_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7302 +INFO:master_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7301 +INFO:local_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7299 +INFO:local_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7302 +INFO:local_logger:Epoch[041/800], Step[0600/0626], Avg Loss: 0.7301 +INFO:local_logger:----- Epoch[041/800], Train Loss: 0.7305, time: 866.86 +INFO:local_logger:Now training epoch 42. LR=0.000150 +INFO:local_logger:----- Epoch[041/800], Train Loss: 0.7300, time: 866.90 +INFO:local_logger:Now training epoch 42. LR=0.000150 +INFO:local_logger:----- Epoch[041/800], Train Loss: 0.7304, time: 867.40 +INFO:local_logger:Now training epoch 42. LR=0.000150 +INFO:local_logger:----- Epoch[041/800], Train Loss: 0.7303, time: 863.76 +INFO:master_logger:----- Epoch[041/800], Train Loss: 0.7302, time: 863.76 +INFO:local_logger:----- Epoch[041/800], Train Loss: 0.7301, time: 867.52 +INFO:local_logger:Now training epoch 42. LR=0.000150 +INFO:local_logger:----- Epoch[041/800], Train Loss: 0.7303, time: 867.54 +INFO:local_logger:Now training epoch 42. LR=0.000150 +INFO:local_logger:----- Epoch[041/800], Train Loss: 0.7302, time: 867.65 +INFO:local_logger:Now training epoch 42. LR=0.000150 +INFO:local_logger:----- Epoch[041/800], Train Loss: 0.7300, time: 867.66 +INFO:local_logger:Now training epoch 42. LR=0.000150 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-41-Loss-0.7302703064055491.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-41-Loss-0.7302703064055491.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-41-Loss-0.7302703064055491.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-41-Loss-0.7302703064055491.pdopt +INFO:local_logger:Now training epoch 42. LR=0.000150 +INFO:master_logger:Now training epoch 42. 
LR=0.000150 +INFO:local_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7300 +INFO:local_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7398 +INFO:master_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7281 +INFO:local_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7322 +INFO:local_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7185 +INFO:local_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7206 +INFO:local_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7231 +INFO:local_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7259 +INFO:local_logger:Epoch[042/800], Step[0000/0626], Avg Loss: 0.7347 +INFO:local_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7299 +INFO:local_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7279 +INFO:local_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7292 +INFO:local_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7300 +INFO:local_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7287 +INFO:local_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7292 +INFO:master_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7291 +INFO:local_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7283 +INFO:local_logger:Epoch[042/800], Step[0100/0626], Avg Loss: 0.7297 +INFO:local_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7285 +INFO:local_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7286 +INFO:local_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7290 +INFO:local_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7289 +INFO:local_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7292 +INFO:local_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7282 +INFO:master_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7286 +INFO:local_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7284 +INFO:local_logger:Epoch[042/800], Step[0200/0626], Avg Loss: 0.7283 +INFO:local_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7283 +INFO:local_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7288 +INFO:local_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7283 +INFO:local_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7277 +INFO:local_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7284 +INFO:local_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7286 +INFO:master_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7285 +INFO:local_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7287 +INFO:local_logger:Epoch[042/800], Step[0300/0626], Avg Loss: 0.7288 +INFO:local_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7279 +INFO:local_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7285 +INFO:local_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7287 +INFO:local_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7288 +INFO:local_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7283 +INFO:local_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7275 +INFO:local_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7284 +INFO:master_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7283 +INFO:local_logger:Epoch[042/800], Step[0400/0626], Avg Loss: 0.7279 +INFO:local_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7278 +INFO:local_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7284 +INFO:local_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7285 +INFO:local_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7284 +INFO:local_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7277 +INFO:local_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7279 
+INFO:local_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7278 +INFO:local_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7285 +INFO:master_logger:Epoch[042/800], Step[0500/0626], Avg Loss: 0.7281 +INFO:local_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7278 +INFO:local_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7276 +INFO:local_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7285 +INFO:local_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7278 +INFO:local_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7277 +INFO:local_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7283 +INFO:local_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7282 +INFO:master_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7280 +INFO:local_logger:Epoch[042/800], Step[0600/0626], Avg Loss: 0.7280 +INFO:local_logger:----- Epoch[042/800], Train Loss: 0.7281, time: 892.19 +INFO:local_logger:Now training epoch 43. LR=0.000150 +INFO:local_logger:----- Epoch[042/800], Train Loss: 0.7280, time: 892.20 +INFO:local_logger:Now training epoch 43. LR=0.000150 +INFO:local_logger:----- Epoch[042/800], Train Loss: 0.7277, time: 892.57 +INFO:local_logger:Now training epoch 43. LR=0.000150 +INFO:local_logger:----- Epoch[042/800], Train Loss: 0.7276, time: 893.34 +INFO:local_logger:Now training epoch 43. LR=0.000150 +INFO:local_logger:----- Epoch[042/800], Train Loss: 0.7275, time: 892.63 +INFO:local_logger:Now training epoch 43. LR=0.000150 +INFO:local_logger:----- Epoch[042/800], Train Loss: 0.7283, time: 889.09 +INFO:master_logger:----- Epoch[042/800], Train Loss: 0.7279, time: 889.09 +INFO:local_logger:----- Epoch[042/800], Train Loss: 0.7282, time: 893.43 +INFO:local_logger:Now training epoch 43. LR=0.000150 +INFO:local_logger:----- Epoch[042/800], Train Loss: 0.7279, time: 892.89 +INFO:local_logger:Now training epoch 43. LR=0.000150 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-42-Loss-0.728333370828915.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-42-Loss-0.728333370828915.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-42-Loss-0.728333370828915.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-42-Loss-0.728333370828915.pdopt +INFO:local_logger:Now training epoch 43. LR=0.000150 +INFO:master_logger:Now training epoch 43. 
+INFO:master_logger:Epoch[043/800], Step[0000/0626], Avg Loss: 0.7299
+INFO:master_logger:Epoch[043/800], Step[0100/0626], Avg Loss: 0.7262
+INFO:master_logger:Epoch[043/800], Step[0200/0626], Avg Loss: 0.7260
+INFO:master_logger:Epoch[043/800], Step[0300/0626], Avg Loss: 0.7259
+INFO:master_logger:Epoch[043/800], Step[0400/0626], Avg Loss: 0.7258
+INFO:master_logger:Epoch[043/800], Step[0500/0626], Avg Loss: 0.7256
+INFO:master_logger:Epoch[043/800], Step[0600/0626], Avg Loss: 0.7256
+INFO:master_logger:----- Epoch[043/800], Train Loss: 0.7255, time: 852.55
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-43-Loss-0.7255999422779928.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-43-Loss-0.7255999422779928.pdopt
+INFO:master_logger:Now training epoch 44. LR=0.000150
+INFO:master_logger:Epoch[044/800], Step[0000/0626], Avg Loss: 0.7233
+INFO:master_logger:Epoch[044/800], Step[0100/0626], Avg Loss: 0.7245
+INFO:master_logger:Epoch[044/800], Step[0200/0626], Avg Loss: 0.7243
+INFO:master_logger:Epoch[044/800], Step[0300/0626], Avg Loss: 0.7240
+INFO:master_logger:Epoch[044/800], Step[0400/0626], Avg Loss: 0.7238
+INFO:master_logger:Epoch[044/800], Step[0500/0626], Avg Loss: 0.7236
+INFO:master_logger:Epoch[044/800], Step[0600/0626], Avg Loss: 0.7235
+INFO:master_logger:----- Epoch[044/800], Train Loss: 0.7235, time: 889.85
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-44-Loss-0.723522978396613.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-44-Loss-0.723522978396613.pdopt
+INFO:master_logger:Now training epoch 45. LR=0.000150
+INFO:master_logger:Epoch[045/800], Step[0000/0626], Avg Loss: 0.7205
+INFO:master_logger:Epoch[045/800], Step[0100/0626], Avg Loss: 0.7223
+INFO:master_logger:Epoch[045/800], Step[0200/0626], Avg Loss: 0.7223
+INFO:master_logger:Epoch[045/800], Step[0300/0626], Avg Loss: 0.7219
+INFO:master_logger:Epoch[045/800], Step[0400/0626], Avg Loss: 0.7218
+INFO:master_logger:Epoch[045/800], Step[0500/0626], Avg Loss: 0.7218
+INFO:master_logger:Epoch[045/800], Step[0600/0626], Avg Loss: 0.7216
+INFO:master_logger:----- Epoch[045/800], Train Loss: 0.7216, time: 855.42
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-45-Loss-0.7212178304741391.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-45-Loss-0.7212178304741391.pdopt
+INFO:master_logger:Now training epoch 46. LR=0.000150
+INFO:master_logger:Epoch[046/800], Step[0000/0626], Avg Loss: 0.7218
+INFO:master_logger:Epoch[046/800], Step[0100/0626], Avg Loss: 0.7206
+INFO:master_logger:Epoch[046/800], Step[0200/0626], Avg Loss: 0.7204
+INFO:master_logger:Epoch[046/800], Step[0300/0626], Avg Loss: 0.7202
+INFO:master_logger:Epoch[046/800], Step[0400/0626], Avg Loss: 0.7200
+INFO:master_logger:Epoch[046/800], Step[0500/0626], Avg Loss: 0.7199
+INFO:master_logger:Epoch[046/800], Step[0600/0626], Avg Loss: 0.7198
+INFO:master_logger:----- Epoch[046/800], Train Loss: 0.7198, time: 892.44
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-46-Loss-0.7197591380592546.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-46-Loss-0.7197591380592546.pdopt
+INFO:master_logger:Now training epoch 47. LR=0.000150
+INFO:master_logger:Epoch[047/800], Step[0000/0626], Avg Loss: 0.7199
+INFO:master_logger:Epoch[047/800], Step[0100/0626], Avg Loss: 0.7189
+INFO:master_logger:Epoch[047/800], Step[0200/0626], Avg Loss: 0.7188
+INFO:master_logger:Epoch[047/800], Step[0300/0626], Avg Loss: 0.7184
+INFO:master_logger:Epoch[047/800], Step[0400/0626], Avg Loss: 0.7184
+INFO:master_logger:Epoch[047/800], Step[0500/0626], Avg Loss: 0.7183
+INFO:master_logger:Epoch[047/800], Step[0600/0626], Avg Loss: 0.7182
+INFO:master_logger:----- Epoch[047/800], Train Loss: 0.7182, time: 860.27
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-47-Loss-0.7181137765215982.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-47-Loss-0.7181137765215982.pdopt
+INFO:master_logger:Now training epoch 48. LR=0.000150
+INFO:master_logger:Epoch[048/800], Step[0000/0626], Avg Loss: 0.7205
+INFO:master_logger:Epoch[048/800], Step[0100/0626], Avg Loss: 0.7177
+INFO:master_logger:Epoch[048/800], Step[0200/0626], Avg Loss: 0.7174
+INFO:master_logger:Epoch[048/800], Step[0300/0626], Avg Loss: 0.7170
+INFO:master_logger:Epoch[048/800], Step[0400/0626], Avg Loss: 0.7170
+INFO:master_logger:Epoch[048/800], Step[0500/0626], Avg Loss: 0.7168
+INFO:master_logger:Epoch[048/800], Step[0600/0626], Avg Loss: 0.7166
+INFO:master_logger:----- Epoch[048/800], Train Loss: 0.7166, time: 891.19
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-48-Loss-0.7164831838117034.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-48-Loss-0.7164831838117034.pdopt
+INFO:master_logger:Now training epoch 49. LR=0.000150
+INFO:master_logger:Epoch[049/800], Step[0000/0626], Avg Loss: 0.7139
+INFO:master_logger:Epoch[049/800], Step[0100/0626], Avg Loss: 0.7153
+INFO:master_logger:Epoch[049/800], Step[0200/0626], Avg Loss: 0.7155
+INFO:master_logger:Epoch[049/800], Step[0300/0626], Avg Loss: 0.7156
+INFO:master_logger:Epoch[049/800], Step[0400/0626], Avg Loss: 0.7155
+INFO:master_logger:Epoch[049/800], Step[0500/0626], Avg Loss: 0.7154
+INFO:master_logger:Epoch[049/800], Step[0600/0626], Avg Loss: 0.7152
+INFO:master_logger:----- Epoch[049/800], Train Loss: 0.7152, time: 854.33
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-49-Loss-0.715259619077928.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-49-Loss-0.715259619077928.pdopt
+INFO:master_logger:Now training epoch 50. LR=0.000150
+INFO:master_logger:Epoch[050/800], Step[0000/0626], Avg Loss: 0.7153
+INFO:master_logger:Epoch[050/800], Step[0100/0626], Avg Loss: 0.7145
+INFO:master_logger:Epoch[050/800], Step[0200/0626], Avg Loss: 0.7142
+INFO:master_logger:Epoch[050/800], Step[0300/0626], Avg Loss: 0.7141
+INFO:master_logger:Epoch[050/800], Step[0400/0626], Avg Loss: 0.7139
+INFO:master_logger:Epoch[050/800], Step[0500/0626], Avg Loss: 0.7139
+INFO:master_logger:Epoch[050/800], Step[0600/0626], Avg Loss: 0.7140
+INFO:master_logger:----- Epoch[050/800], Train Loss: 0.7140, time: 878.40
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-50-Loss-0.7138677369669435.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-50-Loss-0.7138677369669435.pdopt
+INFO:master_logger:Now training epoch 51. LR=0.000150
+INFO:master_logger:Epoch[051/800], Step[0000/0626], Avg Loss: 0.7142
+INFO:master_logger:Epoch[051/800], Step[0100/0626], Avg Loss: 0.7129
+INFO:master_logger:Epoch[051/800], Step[0200/0626], Avg Loss: 0.7130
+INFO:master_logger:Epoch[051/800], Step[0300/0626], Avg Loss: 0.7129
+INFO:master_logger:Epoch[051/800], Step[0400/0626], Avg Loss: 0.7128
+INFO:master_logger:Epoch[051/800], Step[0500/0626], Avg Loss: 0.7126
+INFO:master_logger:Epoch[051/800], Step[0600/0626], Avg Loss: 0.7125
+INFO:master_logger:----- Epoch[051/800], Train Loss: 0.7125, time: 845.42
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-51-Loss-0.7120018708518765.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-51-Loss-0.7120018708518765.pdopt
+INFO:master_logger:Now training epoch 52. LR=0.000150
+INFO:master_logger:Epoch[052/800], Step[0000/0626], Avg Loss: 0.7076
+INFO:master_logger:Epoch[052/800], Step[0100/0626], Avg Loss: 0.7117
+INFO:master_logger:Epoch[052/800], Step[0200/0626], Avg Loss: 0.7115
+INFO:master_logger:Epoch[052/800], Step[0300/0626], Avg Loss: 0.7113
+INFO:master_logger:Epoch[052/800], Step[0400/0626], Avg Loss: 0.7112
+INFO:master_logger:Epoch[052/800], Step[0500/0626], Avg Loss: 0.7111
+INFO:master_logger:Epoch[052/800], Step[0600/0626], Avg Loss: 0.7111
+INFO:master_logger:----- Epoch[052/800], Train Loss: 0.7110, time: 896.39
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-52-Loss-0.7113885321068728.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-52-Loss-0.7113885321068728.pdopt
+INFO:master_logger:Now training epoch 53. LR=0.000150
+INFO:master_logger:Epoch[053/800], Step[0000/0626], Avg Loss: 0.7112
+INFO:master_logger:Epoch[053/800], Step[0100/0626], Avg Loss: 0.7105
+INFO:master_logger:Epoch[053/800], Step[0200/0626], Avg Loss: 0.7103
+INFO:master_logger:Epoch[053/800], Step[0300/0626], Avg Loss: 0.7101
+INFO:master_logger:Epoch[053/800], Step[0400/0626], Avg Loss: 0.7102
+INFO:master_logger:Epoch[053/800], Step[0500/0626], Avg Loss: 0.7102
+INFO:master_logger:Epoch[053/800], Step[0600/0626], Avg Loss: 0.7100
+INFO:master_logger:----- Epoch[053/800], Train Loss: 0.7100, time: 889.34
+INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-53-Loss-0.7102464560284915.pdparams
+INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-53-Loss-0.7102464560284915.pdopt
+INFO:master_logger:Now training epoch 54. LR=0.000150
LR=0.000150 +INFO:local_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7001 +INFO:local_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7124 +INFO:local_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7103 +INFO:local_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7021 +INFO:master_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7094 +INFO:local_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7254 +INFO:local_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7077 +INFO:local_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7109 +INFO:local_logger:Epoch[054/800], Step[0000/0626], Avg Loss: 0.7063 +INFO:local_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7094 +INFO:local_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7105 +INFO:local_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7093 +INFO:master_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7094 +INFO:local_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7095 +INFO:local_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7089 +INFO:local_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7095 +INFO:local_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7097 +INFO:local_logger:Epoch[054/800], Step[0100/0626], Avg Loss: 0.7087 +INFO:local_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7082 +INFO:local_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7086 +INFO:local_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7092 +INFO:local_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7092 +INFO:local_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7087 +INFO:local_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7085 +INFO:master_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0200/0626], Avg Loss: 0.7093 +INFO:local_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7083 +INFO:local_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7089 +INFO:local_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7089 +INFO:local_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7089 +INFO:local_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7085 +INFO:master_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0300/0626], Avg Loss: 0.7092 +INFO:local_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7089 +INFO:local_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7090 +INFO:local_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7083 +INFO:master_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7090 +INFO:local_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7091 +INFO:local_logger:Epoch[054/800], Step[0400/0626], Avg Loss: 0.7083 +INFO:local_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7087 +INFO:local_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7083 +INFO:local_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7084 +INFO:local_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7085 +INFO:local_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7087 +INFO:local_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7090 
+INFO:local_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7090 +INFO:master_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7086 +INFO:local_logger:Epoch[054/800], Step[0500/0626], Avg Loss: 0.7083 +INFO:local_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7083 +INFO:local_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7085 +INFO:local_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7087 +INFO:master_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7085 +INFO:local_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7081 +INFO:local_logger:Epoch[054/800], Step[0600/0626], Avg Loss: 0.7081 +INFO:local_logger:----- Epoch[054/800], Train Loss: 0.7087, time: 903.80 +INFO:local_logger:Now training epoch 55. LR=0.000150 +INFO:local_logger:----- Epoch[054/800], Train Loss: 0.7082, time: 903.85 +INFO:local_logger:Now training epoch 55. LR=0.000150 +INFO:local_logger:----- Epoch[054/800], Train Loss: 0.7083, time: 904.37 +INFO:local_logger:Now training epoch 55. LR=0.000150 +INFO:local_logger:----- Epoch[054/800], Train Loss: 0.7081, time: 904.37 +INFO:local_logger:Now training epoch 55. LR=0.000150 +INFO:local_logger:----- Epoch[054/800], Train Loss: 0.7085, time: 904.35 +INFO:local_logger:Now training epoch 55. LR=0.000150 +INFO:local_logger:----- Epoch[054/800], Train Loss: 0.7087, time: 904.43 +INFO:local_logger:Now training epoch 55. LR=0.000150 +INFO:local_logger:----- Epoch[054/800], Train Loss: 0.7088, time: 900.86 +INFO:master_logger:----- Epoch[054/800], Train Loss: 0.7085, time: 900.86 +INFO:local_logger:----- Epoch[054/800], Train Loss: 0.7087, time: 904.47 +INFO:local_logger:Now training epoch 55. LR=0.000150 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-54-Loss-0.7087632936406654.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-54-Loss-0.7087632936406654.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-54-Loss-0.7087632936406654.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-54-Loss-0.7087632936406654.pdopt +INFO:local_logger:Now training epoch 55. LR=0.000150 +INFO:master_logger:Now training epoch 55. 
LR=0.000150 +INFO:local_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.6897 +INFO:master_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.7057 +INFO:local_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.7023 +INFO:local_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.6937 +INFO:local_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.7040 +INFO:local_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.7203 +INFO:local_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.7165 +INFO:local_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.7079 +INFO:local_logger:Epoch[055/800], Step[0000/0626], Avg Loss: 0.7108 +INFO:local_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7093 +INFO:local_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7084 +INFO:local_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7081 +INFO:local_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7084 +INFO:master_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7083 +INFO:local_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7082 +INFO:local_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7082 +INFO:local_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7078 +INFO:local_logger:Epoch[055/800], Step[0100/0626], Avg Loss: 0.7078 +INFO:local_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7086 +INFO:local_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7080 +INFO:local_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7086 +INFO:master_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7081 +INFO:local_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7088 +INFO:local_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7078 +INFO:local_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7078 +INFO:local_logger:Epoch[055/800], Step[0200/0626], Avg Loss: 0.7077 +INFO:local_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7071 +INFO:local_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7086 +INFO:local_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7076 +INFO:master_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7080 +INFO:local_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7086 +INFO:local_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7078 +INFO:local_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7079 +INFO:local_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7079 +INFO:local_logger:Epoch[055/800], Step[0300/0626], Avg Loss: 0.7083 +INFO:local_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7073 +INFO:local_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7081 +INFO:local_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7079 +INFO:local_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7076 +INFO:local_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7078 +INFO:local_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7087 +INFO:master_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7079 +INFO:local_logger:Epoch[055/800], Step[0400/0626], Avg Loss: 0.7081 +INFO:local_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7087 +INFO:local_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7077 +INFO:local_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7082 +INFO:local_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7076 +INFO:local_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7073 
+INFO:master_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7078 +INFO:local_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7079 +INFO:local_logger:Epoch[055/800], Step[0500/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7073 +INFO:local_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7081 +INFO:local_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7072 +INFO:local_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7084 +INFO:master_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7076 +INFO:local_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7077 +INFO:local_logger:Epoch[055/800], Step[0600/0626], Avg Loss: 0.7074 +INFO:local_logger:----- Epoch[055/800], Train Loss: 0.7081, time: 859.40 +INFO:local_logger:Now training epoch 56. LR=0.000150 +INFO:local_logger:----- Epoch[055/800], Train Loss: 0.7074, time: 855.79 +INFO:master_logger:----- Epoch[055/800], Train Loss: 0.7076, time: 855.79 +INFO:local_logger:----- Epoch[055/800], Train Loss: 0.7073, time: 859.94 +INFO:local_logger:Now training epoch 56. LR=0.000150 +INFO:local_logger:----- Epoch[055/800], Train Loss: 0.7073, time: 859.85 +INFO:local_logger:Now training epoch 56. LR=0.000150 +INFO:local_logger:----- Epoch[055/800], Train Loss: 0.7072, time: 860.45 +INFO:local_logger:Now training epoch 56. LR=0.000150 +INFO:local_logger:----- Epoch[055/800], Train Loss: 0.7074, time: 859.97 +INFO:local_logger:Now training epoch 56. LR=0.000150 +INFO:local_logger:----- Epoch[055/800], Train Loss: 0.7077, time: 859.89 +INFO:local_logger:Now training epoch 56. LR=0.000150 +INFO:local_logger:----- Epoch[055/800], Train Loss: 0.7083, time: 860.52 +INFO:local_logger:Now training epoch 56. LR=0.000150 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-55-Loss-0.7074442110429905.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-55-Loss-0.7074442110429905.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-55-Loss-0.7074442110429905.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-55-Loss-0.7074442110429905.pdopt +INFO:local_logger:Now training epoch 56. LR=0.000150 +INFO:master_logger:Now training epoch 56. 
LR=0.000150 +INFO:local_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.6969 +INFO:local_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.7058 +INFO:local_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.7068 +INFO:master_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.7070 +INFO:local_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.7114 +INFO:local_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.7124 +INFO:local_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.6985 +INFO:local_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.7150 +INFO:local_logger:Epoch[056/800], Step[0000/0626], Avg Loss: 0.7089 +INFO:local_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7066 +INFO:local_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7079 +INFO:local_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7056 +INFO:master_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7069 +INFO:local_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7066 +INFO:local_logger:Epoch[056/800], Step[0100/0626], Avg Loss: 0.7070 +INFO:local_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7071 +INFO:local_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7066 +INFO:local_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7069 +INFO:local_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7075 +INFO:local_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7069 +INFO:local_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7069 +INFO:local_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7069 +INFO:local_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7059 +INFO:master_logger:Epoch[056/800], Step[0200/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7077 +INFO:local_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7064 +INFO:local_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7067 +INFO:local_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7067 +INFO:local_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7059 +INFO:local_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7065 +INFO:master_logger:Epoch[056/800], Step[0300/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7060 +INFO:local_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7065 +INFO:local_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7066 +INFO:local_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7069 +INFO:local_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7073 +INFO:local_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7072 +INFO:master_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7067 +INFO:local_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7063 +INFO:local_logger:Epoch[056/800], Step[0400/0626], Avg Loss: 0.7067 +INFO:local_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7063 +INFO:local_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7069 +INFO:local_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7069 +INFO:master_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7066 +INFO:local_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7067 
+INFO:local_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[056/800], Step[0500/0626], Avg Loss: 0.7066 +INFO:local_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7066 +INFO:local_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7069 +INFO:local_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7063 +INFO:local_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7061 +INFO:master_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7065 +INFO:local_logger:Epoch[056/800], Step[0600/0626], Avg Loss: 0.7065 +INFO:local_logger:----- Epoch[056/800], Train Loss: 0.7067, time: 890.40 +INFO:local_logger:Now training epoch 57. LR=0.000150 +INFO:local_logger:----- Epoch[056/800], Train Loss: 0.7065, time: 890.61 +INFO:local_logger:Now training epoch 57. LR=0.000150 +INFO:local_logger:----- Epoch[056/800], Train Loss: 0.7066, time: 890.94 +INFO:local_logger:Now training epoch 57. LR=0.000150 +INFO:local_logger:----- Epoch[056/800], Train Loss: 0.7068, time: 891.55 +INFO:local_logger:----- Epoch[056/800], Train Loss: 0.7063, time: 890.99 +INFO:local_logger:Now training epoch 57. LR=0.000150 +INFO:local_logger:----- Epoch[056/800], Train Loss: 0.7060, time: 891.01 +INFO:local_logger:Now training epoch 57. LR=0.000150 +INFO:local_logger:Now training epoch 57. LR=0.000150 +INFO:local_logger:----- Epoch[056/800], Train Loss: 0.7063, time: 887.67 +INFO:master_logger:----- Epoch[056/800], Train Loss: 0.7065, time: 887.67 +INFO:local_logger:----- Epoch[056/800], Train Loss: 0.7069, time: 890.98 +INFO:local_logger:Now training epoch 57. LR=0.000150 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-56-Loss-0.7062517588045653.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-56-Loss-0.7062517588045653.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-56-Loss-0.7062517588045653.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-56-Loss-0.7062517588045653.pdopt +INFO:local_logger:Now training epoch 57. LR=0.000150 +INFO:master_logger:Now training epoch 57. 
LR=0.000150 +INFO:local_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.7140 +INFO:local_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.6994 +INFO:local_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.7131 +INFO:master_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.7096 +INFO:local_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.7112 +INFO:local_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.7165 +INFO:local_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.7022 +INFO:local_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.7034 +INFO:local_logger:Epoch[057/800], Step[0000/0626], Avg Loss: 0.7171 +INFO:local_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7063 +INFO:local_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7057 +INFO:local_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7054 +INFO:local_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7067 +INFO:local_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7053 +INFO:local_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7065 +INFO:master_logger:Epoch[057/800], Step[0100/0626], Avg Loss: 0.7061 +INFO:local_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7056 +INFO:local_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7067 +INFO:local_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7058 +INFO:local_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7057 +INFO:local_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7051 +INFO:master_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7059 +INFO:local_logger:Epoch[057/800], Step[0200/0626], Avg Loss: 0.7061 +INFO:local_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7056 +INFO:local_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7059 +INFO:local_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7052 +INFO:local_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7058 +INFO:local_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7054 +INFO:master_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7057 +INFO:local_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7054 +INFO:local_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7063 +INFO:local_logger:Epoch[057/800], Step[0300/0626], Avg Loss: 0.7061 +INFO:local_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7058 +INFO:local_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7060 +INFO:local_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7052 +INFO:local_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7059 +INFO:local_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7056 +INFO:local_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7063 +INFO:local_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7055 +INFO:master_logger:Epoch[057/800], Step[0400/0626], Avg Loss: 0.7058 +INFO:local_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7054 +INFO:local_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7050 +INFO:local_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7059 +INFO:local_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7055 +INFO:local_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7061 
+INFO:local_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7057 +INFO:local_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7059 +INFO:master_logger:Epoch[057/800], Step[0500/0626], Avg Loss: 0.7057 +INFO:local_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7051 +INFO:local_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7058 +INFO:local_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7054 +INFO:local_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7054 +INFO:local_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7057 +INFO:local_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7062 +INFO:local_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7056 +INFO:master_logger:Epoch[057/800], Step[0600/0626], Avg Loss: 0.7057 +INFO:local_logger:----- Epoch[057/800], Train Loss: 0.7060, time: 853.51 +INFO:local_logger:Now training epoch 58. LR=0.000150 +INFO:local_logger:----- Epoch[057/800], Train Loss: 0.7053, time: 853.54 +INFO:local_logger:Now training epoch 58. LR=0.000150 +INFO:local_logger:----- Epoch[057/800], Train Loss: 0.7057, time: 854.16 +INFO:local_logger:Now training epoch 58. LR=0.000150 +INFO:local_logger:----- Epoch[057/800], Train Loss: 0.7050, time: 854.00 +INFO:local_logger:----- Epoch[057/800], Train Loss: 0.7061, time: 854.36 +INFO:local_logger:Now training epoch 58. LR=0.000150 +INFO:local_logger:Now training epoch 58. LR=0.000150 +INFO:local_logger:----- Epoch[057/800], Train Loss: 0.7056, time: 849.91 +INFO:local_logger:----- Epoch[057/800], Train Loss: 0.7054, time: 853.96 +INFO:master_logger:----- Epoch[057/800], Train Loss: 0.7056, time: 849.91 +INFO:local_logger:Now training epoch 58. LR=0.000150 +INFO:local_logger:----- Epoch[057/800], Train Loss: 0.7056, time: 853.96 +INFO:local_logger:Now training epoch 58. LR=0.000150 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-57-Loss-0.70561545900947.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-57-Loss-0.70561545900947.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-57-Loss-0.70561545900947.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-57-Loss-0.70561545900947.pdopt +INFO:local_logger:Now training epoch 58. LR=0.000150 +INFO:master_logger:Now training epoch 58. 
LR=0.000150 +INFO:local_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.7087 +INFO:local_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.6950 +INFO:master_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.7032 +INFO:local_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.7035 +INFO:local_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.7008 +INFO:local_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.6990 +INFO:local_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.7091 +INFO:local_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.7010 +INFO:local_logger:Epoch[058/800], Step[0000/0626], Avg Loss: 0.7083 +INFO:local_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7057 +INFO:local_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7047 +INFO:local_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7033 +INFO:local_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7035 +INFO:local_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7044 +INFO:local_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7042 +INFO:local_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7048 +INFO:local_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7051 +INFO:master_logger:Epoch[058/800], Step[0100/0626], Avg Loss: 0.7045 +INFO:local_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7056 +INFO:local_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7048 +INFO:local_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7050 +INFO:local_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7052 +INFO:local_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7048 +INFO:master_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7048 +INFO:local_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7051 +INFO:local_logger:Epoch[058/800], Step[0200/0626], Avg Loss: 0.7032 +INFO:local_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7048 +INFO:local_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7052 +INFO:local_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7051 +INFO:local_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7046 +INFO:master_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7047 +INFO:local_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7049 +INFO:local_logger:Epoch[058/800], Step[0300/0626], Avg Loss: 0.7035 +INFO:local_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7047 +INFO:local_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7047 +INFO:local_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7048 +INFO:local_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7035 +INFO:local_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7048 +INFO:master_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7047 +INFO:local_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0400/0626], Avg Loss: 0.7047 +INFO:local_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7048 +INFO:local_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7043 
+INFO:local_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7034 +INFO:master_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7044 +INFO:local_logger:Epoch[058/800], Step[0500/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7045 +INFO:local_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7045 +INFO:local_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7046 +INFO:master_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7044 +INFO:local_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7045 +INFO:local_logger:Epoch[058/800], Step[0600/0626], Avg Loss: 0.7042 +INFO:local_logger:----- Epoch[058/800], Train Loss: 0.7045, time: 883.86 +INFO:master_logger:----- Epoch[058/800], Train Loss: 0.7044, time: 883.86 +INFO:local_logger:----- Epoch[058/800], Train Loss: 0.7044, time: 888.09 +INFO:local_logger:Now training epoch 59. LR=0.000151 +INFO:local_logger:----- Epoch[058/800], Train Loss: 0.7043, time: 888.09 +INFO:local_logger:Now training epoch 59. LR=0.000151 +INFO:local_logger:----- Epoch[058/800], Train Loss: 0.7045, time: 887.67 +INFO:local_logger:Now training epoch 59. LR=0.000151 +INFO:local_logger:----- Epoch[058/800], Train Loss: 0.7036, time: 887.78 +INFO:local_logger:Now training epoch 59. LR=0.000151 +INFO:local_logger:----- Epoch[058/800], Train Loss: 0.7046, time: 888.37 +INFO:local_logger:Now training epoch 59. LR=0.000151 +INFO:local_logger:----- Epoch[058/800], Train Loss: 0.7045, time: 887.96 +INFO:local_logger:Now training epoch 59. LR=0.000151 +INFO:local_logger:----- Epoch[058/800], Train Loss: 0.7046, time: 887.96 +INFO:local_logger:Now training epoch 59. LR=0.000151 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-58-Loss-0.704508643255555.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-58-Loss-0.704508643255555.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-58-Loss-0.704508643255555.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-58-Loss-0.704508643255555.pdopt +INFO:local_logger:Now training epoch 59. LR=0.000151 +INFO:master_logger:Now training epoch 59. 
LR=0.000151 +INFO:local_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.6945 +INFO:master_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.7041 +INFO:local_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.7028 +INFO:local_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.7075 +INFO:local_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.7132 +INFO:local_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.7134 +INFO:local_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.7138 +INFO:local_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.6925 +INFO:local_logger:Epoch[059/800], Step[0000/0626], Avg Loss: 0.6949 +INFO:local_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7022 +INFO:local_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7047 +INFO:local_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7039 +INFO:local_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7043 +INFO:master_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7041 +INFO:local_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7048 +INFO:local_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7046 +INFO:local_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7044 +INFO:local_logger:Epoch[059/800], Step[0100/0626], Avg Loss: 0.7034 +INFO:local_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7040 +INFO:local_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7040 +INFO:local_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7041 +INFO:local_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7032 +INFO:local_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7039 +INFO:master_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7037 +INFO:local_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7021 +INFO:local_logger:Epoch[059/800], Step[0200/0626], Avg Loss: 0.7043 +INFO:local_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7041 +INFO:local_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7040 +INFO:master_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7033 +INFO:local_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7027 +INFO:local_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7040 +INFO:local_logger:Epoch[059/800], Step[0300/0626], Avg Loss: 0.7034 +INFO:local_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7037 +INFO:local_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7040 +INFO:local_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7029 +INFO:local_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7037 +INFO:local_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7035 +INFO:local_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7033 +INFO:local_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7039 +INFO:master_logger:Epoch[059/800], Step[0400/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7037 +INFO:local_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7034 +INFO:local_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7039 +INFO:local_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7031 +INFO:local_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7037 
+INFO:master_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7035 +INFO:local_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7031 +INFO:local_logger:Epoch[059/800], Step[0500/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7035 +INFO:local_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7037 +INFO:local_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7035 +INFO:local_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7034 +INFO:master_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7034 +INFO:local_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7037 +INFO:local_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7034 +INFO:local_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7032 +INFO:local_logger:Epoch[059/800], Step[0600/0626], Avg Loss: 0.7030 +INFO:local_logger:----- Epoch[059/800], Train Loss: 0.7034, time: 855.63 +INFO:master_logger:----- Epoch[059/800], Train Loss: 0.7034, time: 855.63 +INFO:local_logger:----- Epoch[059/800], Train Loss: 0.7034, time: 859.41 +INFO:local_logger:Now training epoch 60. LR=0.000151 +INFO:local_logger:----- Epoch[059/800], Train Loss: 0.7035, time: 859.16 +INFO:local_logger:Now training epoch 60. LR=0.000151 +INFO:local_logger:----- Epoch[059/800], Train Loss: 0.7030, time: 859.83 +INFO:local_logger:Now training epoch 60. LR=0.000151 +INFO:local_logger:----- Epoch[059/800], Train Loss: 0.7033, time: 859.66 +INFO:local_logger:Now training epoch 60. LR=0.000151 +INFO:local_logger:----- Epoch[059/800], Train Loss: 0.7036, time: 859.94 +INFO:local_logger:Now training epoch 60. LR=0.000151 +INFO:local_logger:----- Epoch[059/800], Train Loss: 0.7037, time: 859.65 +INFO:local_logger:Now training epoch 60. LR=0.000151 +INFO:local_logger:----- Epoch[059/800], Train Loss: 0.7032, time: 859.97 +INFO:local_logger:Now training epoch 60. LR=0.000151 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-59-Loss-0.7033895635093321.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-59-Loss-0.7033895635093321.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-59-Loss-0.7033895635093321.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-59-Loss-0.7033895635093321.pdopt +INFO:local_logger:Now training epoch 60. LR=0.000151 +INFO:master_logger:Now training epoch 60. 
LR=0.000151 +INFO:local_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.7205 +INFO:local_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.7122 +INFO:local_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.7150 +INFO:master_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.7093 +INFO:local_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.7005 +INFO:local_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.6991 +INFO:local_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.7068 +INFO:local_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.7212 +INFO:local_logger:Epoch[060/800], Step[0000/0626], Avg Loss: 0.6992 +INFO:local_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7019 +INFO:local_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7031 +INFO:local_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7024 +INFO:local_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7027 +INFO:local_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7024 +INFO:local_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7041 +INFO:master_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7029 +INFO:local_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7026 +INFO:local_logger:Epoch[060/800], Step[0100/0626], Avg Loss: 0.7037 +INFO:local_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7033 +INFO:local_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7022 +INFO:local_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7043 +INFO:local_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7025 +INFO:local_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7025 +INFO:local_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7030 +INFO:master_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7029 +INFO:local_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7025 +INFO:local_logger:Epoch[060/800], Step[0200/0626], Avg Loss: 0.7033 +INFO:local_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7030 +INFO:local_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7022 +INFO:local_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7026 +INFO:local_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7029 +INFO:local_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7034 +INFO:master_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7028 +INFO:local_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7027 +INFO:local_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7025 +INFO:local_logger:Epoch[060/800], Step[0300/0626], Avg Loss: 0.7030 +INFO:local_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7024 +INFO:local_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7024 +INFO:local_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7025 +INFO:local_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7026 +INFO:local_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7031 +INFO:local_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7029 +INFO:master_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7026 +INFO:local_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7028 +INFO:local_logger:Epoch[060/800], Step[0400/0626], Avg Loss: 0.7023 +INFO:local_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7026 +INFO:local_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7023 +INFO:local_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7029 +INFO:local_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7024 +INFO:local_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7029 +INFO:local_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7025 
+INFO:local_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7028 +INFO:local_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7022 +INFO:master_logger:Epoch[060/800], Step[0500/0626], Avg Loss: 0.7026 +INFO:local_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7026 +INFO:local_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7029 +INFO:local_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7025 +INFO:master_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7025 +INFO:local_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7023 +INFO:local_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7021 +INFO:local_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7023 +INFO:local_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7024 +INFO:local_logger:Epoch[060/800], Step[0600/0626], Avg Loss: 0.7028 +INFO:local_logger:----- Epoch[060/800], Train Loss: 0.7020, time: 883.22 +INFO:local_logger:Now training epoch 61. LR=0.000151 +INFO:local_logger:----- Epoch[060/800], Train Loss: 0.7027, time: 884.34 +INFO:local_logger:Now training epoch 61. LR=0.000151 +INFO:local_logger:----- Epoch[060/800], Train Loss: 0.7025, time: 883.83 +INFO:local_logger:Now training epoch 61. LR=0.000151 +INFO:local_logger:----- Epoch[060/800], Train Loss: 0.7022, time: 883.86 +INFO:local_logger:Now training epoch 61. LR=0.000151 +INFO:local_logger:----- Epoch[060/800], Train Loss: 0.7030, time: 883.87 +INFO:local_logger:Now training epoch 61. LR=0.000151 +INFO:local_logger:----- Epoch[060/800], Train Loss: 0.7023, time: 883.92 +INFO:local_logger:Now training epoch 61. LR=0.000151 +INFO:local_logger:----- Epoch[060/800], Train Loss: 0.7024, time: 883.95 +INFO:local_logger:Now training epoch 61. LR=0.000151 +INFO:local_logger:----- Epoch[060/800], Train Loss: 0.7025, time: 880.74 +INFO:master_logger:----- Epoch[060/800], Train Loss: 0.7024, time: 880.74 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-60-Loss-0.7024735177213701.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-60-Loss-0.7024735177213701.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-60-Loss-0.7024735177213701.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-60-Loss-0.7024735177213701.pdopt +INFO:local_logger:Now training epoch 61. LR=0.000151 +INFO:master_logger:Now training epoch 61. 
LR=0.000151 +INFO:local_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.6994 +INFO:local_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.6940 +INFO:master_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.6945 +INFO:local_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.6854 +INFO:local_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.6741 +INFO:local_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.7025 +INFO:local_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.7017 +INFO:local_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.6991 +INFO:local_logger:Epoch[061/800], Step[0000/0626], Avg Loss: 0.7000 +INFO:local_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7019 +INFO:local_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7020 +INFO:local_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7019 +INFO:local_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7011 +INFO:local_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7002 +INFO:local_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7016 +INFO:master_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7018 +INFO:local_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7024 +INFO:local_logger:Epoch[061/800], Step[0100/0626], Avg Loss: 0.7036 +INFO:local_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7020 +INFO:local_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7028 +INFO:local_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7020 +INFO:local_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7015 +INFO:local_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7027 +INFO:local_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7022 +INFO:local_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7017 +INFO:local_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7011 +INFO:master_logger:Epoch[061/800], Step[0200/0626], Avg Loss: 0.7020 +INFO:local_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7016 +INFO:local_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7010 +INFO:local_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7024 +INFO:local_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7012 +INFO:local_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7020 +INFO:local_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7020 +INFO:master_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7018 +INFO:local_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7017 +INFO:local_logger:Epoch[061/800], Step[0300/0626], Avg Loss: 0.7026 +INFO:local_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7012 +INFO:local_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7012 +INFO:master_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7018 +INFO:local_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7019 +INFO:local_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7021 +INFO:local_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7020 +INFO:local_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7022 +INFO:local_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7016 +INFO:local_logger:Epoch[061/800], Step[0400/0626], Avg Loss: 0.7019 +INFO:local_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7012 +INFO:local_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7013 +INFO:local_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7017 +INFO:local_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7019 +INFO:local_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7022 +INFO:local_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7022 
+INFO:master_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7017 +INFO:local_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7018 +INFO:local_logger:Epoch[061/800], Step[0500/0626], Avg Loss: 0.7015 +INFO:local_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7021 +INFO:local_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7013 +INFO:local_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7018 +INFO:local_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7013 +INFO:local_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7012 +INFO:local_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7015 +INFO:local_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7014 +INFO:master_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7016 +INFO:local_logger:Epoch[061/800], Step[0600/0626], Avg Loss: 0.7019 +INFO:local_logger:----- Epoch[061/800], Train Loss: 0.7015, time: 862.93 +INFO:local_logger:Now training epoch 62. LR=0.000151 +INFO:local_logger:----- Epoch[061/800], Train Loss: 0.7014, time: 863.80 +INFO:local_logger:Now training epoch 62. LR=0.000151 +INFO:local_logger:----- Epoch[061/800], Train Loss: 0.7013, time: 863.80 +INFO:local_logger:Now training epoch 62. LR=0.000151 +INFO:local_logger:----- Epoch[061/800], Train Loss: 0.7012, time: 860.38 +INFO:master_logger:----- Epoch[061/800], Train Loss: 0.7015, time: 860.38 +INFO:local_logger:----- Epoch[061/800], Train Loss: 0.7017, time: 864.16 +INFO:local_logger:Now training epoch 62. LR=0.000151 +INFO:local_logger:----- Epoch[061/800], Train Loss: 0.7012, time: 864.25 +INFO:local_logger:Now training epoch 62. LR=0.000151 +INFO:local_logger:----- Epoch[061/800], Train Loss: 0.7019, time: 865.44 +INFO:local_logger:Now training epoch 62. LR=0.000151 +INFO:local_logger:----- Epoch[061/800], Train Loss: 0.7021, time: 864.25 +INFO:local_logger:Now training epoch 62. LR=0.000151 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-61-Loss-0.7012385332086987.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-61-Loss-0.7012385332086987.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-61-Loss-0.7012385332086987.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-61-Loss-0.7012385332086987.pdopt +INFO:local_logger:Now training epoch 62. LR=0.000151 +INFO:master_logger:Now training epoch 62. 
LR=0.000151 +INFO:local_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.6976 +INFO:local_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.7062 +INFO:master_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.7003 +INFO:local_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.6922 +INFO:local_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.7018 +INFO:local_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.7010 +INFO:local_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.7098 +INFO:local_logger:Epoch[062/800], Step[0000/0626], Avg Loss: 0.7053 +INFO:local_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.7008 +INFO:local_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.7009 +INFO:local_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.7013 +INFO:local_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.7006 +INFO:local_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.7017 +INFO:local_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.6986 +INFO:master_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.7006 +INFO:local_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.7006 +INFO:local_logger:Epoch[062/800], Step[0100/0626], Avg Loss: 0.7000 +INFO:local_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.7008 +INFO:local_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.7005 +INFO:local_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.7011 +INFO:local_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.6999 +INFO:local_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.6996 +INFO:local_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.7010 +INFO:local_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.7009 +INFO:master_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.7006 +INFO:local_logger:Epoch[062/800], Step[0200/0626], Avg Loss: 0.7012 +INFO:local_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.6999 +INFO:local_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.7012 +INFO:local_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.7001 +INFO:local_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.7006 +INFO:local_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.7012 +INFO:local_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.7011 +INFO:master_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.7007 +INFO:local_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.7008 +INFO:local_logger:Epoch[062/800], Step[0300/0626], Avg Loss: 0.7007 +INFO:local_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7004 +INFO:local_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7012 +INFO:local_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7005 +INFO:local_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7000 +INFO:master_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7006 +INFO:local_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7005 +INFO:local_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7008 +INFO:local_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7005 +INFO:local_logger:Epoch[062/800], Step[0400/0626], Avg Loss: 0.7011 +INFO:local_logger:Epoch[062/800], Step[0500/0626], Avg Loss: 0.7002 +INFO:local_logger:Epoch[062/800], Step[0500/0626], Avg Loss: 0.7014 +INFO:local_logger:Epoch[062/800], Step[0500/0626], Avg Loss: 0.7009 +INFO:local_logger:Epoch[062/800], Step[0500/0626], Avg Loss: 0.7009 +INFO:local_logger:Epoch[062/800], Step[0500/0626], Avg Loss: 0.7005 +INFO:local_logger:Epoch[062/800], Step[0500/0626], Avg Loss: 0.7006 
[... MAE pre-training log (nohup.out), epochs 062-074 of 800, 626 steps per epoch: eight local_logger workers and a master_logger report Avg Loss every 100 steps; the master train loss falls from 0.7005 (epoch 062) to 0.6925 (epoch 073) at LR 0.000151-0.000152, each epoch taking roughly 860-890 s, and after every epoch the model and optimizer state are saved under ./output/train-20211219-17-07-40/ as MAE-Epoch-<epoch>-Loss-<loss>.pdparams and .pdopt ...]
+INFO:local_logger:Epoch[074/800], Step[0500/0626], Avg Loss: 0.6920 +INFO:master_logger:Epoch[074/800], Step[0500/0626], Avg Loss: 0.6920 +INFO:local_logger:Epoch[074/800], Step[0500/0626], Avg Loss: 0.6915 +INFO:local_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6923 +INFO:local_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6919 +INFO:local_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6918 +INFO:local_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6917 +INFO:local_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6917 +INFO:local_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6921 +INFO:local_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6922 +INFO:master_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6920 +INFO:local_logger:Epoch[074/800], Step[0600/0626], Avg Loss: 0.6922 +INFO:local_logger:----- Epoch[074/800], Train Loss: 0.6922, time: 868.83 +INFO:local_logger:Now training epoch 75. LR=0.000152 +INFO:local_logger:----- Epoch[074/800], Train Loss: 0.6922, time: 869.13 +INFO:local_logger:Now training epoch 75. LR=0.000152 +INFO:local_logger:----- Epoch[074/800], Train Loss: 0.6916, time: 868.88 +INFO:local_logger:Now training epoch 75. LR=0.000152 +INFO:local_logger:----- Epoch[074/800], Train Loss: 0.6920, time: 868.87 +INFO:local_logger:Now training epoch 75. LR=0.000152 +INFO:local_logger:----- Epoch[074/800], Train Loss: 0.6917, time: 869.16 +INFO:local_logger:Now training epoch 75. LR=0.000152 +INFO:local_logger:----- Epoch[074/800], Train Loss: 0.6917, time: 868.97 +INFO:local_logger:Now training epoch 75. LR=0.000152 +INFO:local_logger:----- Epoch[074/800], Train Loss: 0.6918, time: 865.36 +INFO:master_logger:----- Epoch[074/800], Train Loss: 0.6919, time: 865.36 +INFO:local_logger:----- Epoch[074/800], Train Loss: 0.6922, time: 869.28 +INFO:local_logger:Now training epoch 75. LR=0.000152 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-74-Loss-0.6918195025700205.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-74-Loss-0.6918195025700205.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-74-Loss-0.6918195025700205.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-74-Loss-0.6918195025700205.pdopt +INFO:local_logger:Now training epoch 75. LR=0.000152 +INFO:master_logger:Now training epoch 75. 
LR=0.000152 +INFO:local_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6855 +INFO:master_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6909 +INFO:local_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6841 +INFO:local_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6874 +INFO:local_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6973 +INFO:local_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6943 +INFO:local_logger:Epoch[075/800], Step[0000/0626], Avg Loss: 0.6994 +INFO:local_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6913 +INFO:local_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6910 +INFO:local_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6919 +INFO:local_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6928 +INFO:master_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6914 +INFO:local_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[075/800], Step[0100/0626], Avg Loss: 0.6915 +INFO:local_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6921 +INFO:local_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6912 +INFO:local_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6916 +INFO:local_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6914 +INFO:local_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6921 +INFO:local_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6911 +INFO:master_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6913 +INFO:local_logger:Epoch[075/800], Step[0200/0626], Avg Loss: 0.6904 +INFO:local_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6915 +INFO:local_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6911 +INFO:local_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6916 +INFO:local_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6913 +INFO:master_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6912 +INFO:local_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6916 +INFO:local_logger:Epoch[075/800], Step[0300/0626], Avg Loss: 0.6915 +INFO:local_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6910 +INFO:local_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6917 +INFO:local_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6910 +INFO:local_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6919 +INFO:master_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6913 +INFO:local_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6915 +INFO:local_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6914 +INFO:local_logger:Epoch[075/800], Step[0400/0626], Avg Loss: 0.6913 +INFO:local_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6918 +INFO:local_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6911 +INFO:local_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6917 +INFO:local_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6911 +INFO:local_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6911 +INFO:local_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6918 
+INFO:local_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6913 +INFO:master_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6913 +INFO:local_logger:Epoch[075/800], Step[0500/0626], Avg Loss: 0.6904 +INFO:local_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6916 +INFO:local_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6910 +INFO:master_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6912 +INFO:local_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6915 +INFO:local_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6918 +INFO:local_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6912 +INFO:local_logger:Epoch[075/800], Step[0600/0626], Avg Loss: 0.6913 +INFO:local_logger:----- Epoch[075/800], Train Loss: 0.6918, time: 889.31 +INFO:local_logger:Now training epoch 76. LR=0.000152 +INFO:local_logger:----- Epoch[075/800], Train Loss: 0.6917, time: 889.53 +INFO:local_logger:Now training epoch 76. LR=0.000152 +INFO:local_logger:----- Epoch[075/800], Train Loss: 0.6907, time: 889.63 +INFO:local_logger:Now training epoch 76. LR=0.000152 +INFO:local_logger:----- Epoch[075/800], Train Loss: 0.6910, time: 885.80 +INFO:master_logger:----- Epoch[075/800], Train Loss: 0.6913, time: 885.80 +INFO:local_logger:----- Epoch[075/800], Train Loss: 0.6913, time: 889.63 +INFO:local_logger:Now training epoch 76. LR=0.000152 +INFO:local_logger:----- Epoch[075/800], Train Loss: 0.6912, time: 890.10 +INFO:local_logger:Now training epoch 76. LR=0.000152 +INFO:local_logger:----- Epoch[075/800], Train Loss: 0.6916, time: 890.11 +INFO:local_logger:Now training epoch 76. LR=0.000152 +INFO:local_logger:----- Epoch[075/800], Train Loss: 0.6911, time: 890.09 +INFO:local_logger:Now training epoch 76. LR=0.000152 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-75-Loss-0.6910110246189355.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-75-Loss-0.6910110246189355.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-75-Loss-0.6910110246189355.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-75-Loss-0.6910110246189355.pdopt +INFO:local_logger:Now training epoch 76. LR=0.000152 +INFO:master_logger:Now training epoch 76. 
LR=0.000152 +INFO:local_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6949 +INFO:local_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6849 +INFO:local_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6967 +INFO:master_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6895 +INFO:local_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6921 +INFO:local_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6918 +INFO:local_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6833 +INFO:local_logger:Epoch[076/800], Step[0000/0626], Avg Loss: 0.6889 +INFO:local_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6912 +INFO:local_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6912 +INFO:master_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0100/0626], Avg Loss: 0.6922 +INFO:local_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6914 +INFO:local_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6904 +INFO:local_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6911 +INFO:local_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6909 +INFO:local_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6909 +INFO:local_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6905 +INFO:master_logger:Epoch[076/800], Step[0200/0626], Avg Loss: 0.6909 +INFO:local_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6911 +INFO:local_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6903 +INFO:local_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6903 +INFO:local_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6906 +INFO:master_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[076/800], Step[0300/0626], Avg Loss: 0.6910 +INFO:local_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6901 +INFO:local_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6908 +INFO:master_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[076/800], Step[0400/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6911 
+INFO:local_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6903 +INFO:master_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0500/0626], Avg Loss: 0.6910 +INFO:local_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6904 +INFO:local_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6902 +INFO:master_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6912 +INFO:local_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6910 +INFO:local_logger:Epoch[076/800], Step[0600/0626], Avg Loss: 0.6903 +INFO:local_logger:----- Epoch[076/800], Train Loss: 0.6911, time: 859.45 +INFO:local_logger:Now training epoch 77. LR=0.000152 +INFO:local_logger:----- Epoch[076/800], Train Loss: 0.6906, time: 855.76 +INFO:master_logger:----- Epoch[076/800], Train Loss: 0.6906, time: 855.76 +INFO:local_logger:----- Epoch[076/800], Train Loss: 0.6910, time: 859.92 +INFO:local_logger:Now training epoch 77. LR=0.000152 +INFO:local_logger:----- Epoch[076/800], Train Loss: 0.6907, time: 859.58 +INFO:local_logger:Now training epoch 77. LR=0.000152 +INFO:local_logger:----- Epoch[076/800], Train Loss: 0.6906, time: 859.59 +INFO:local_logger:Now training epoch 77. LR=0.000152 +INFO:local_logger:----- Epoch[076/800], Train Loss: 0.6903, time: 860.18 +INFO:local_logger:Now training epoch 77. LR=0.000152 +INFO:local_logger:----- Epoch[076/800], Train Loss: 0.6903, time: 860.24 +INFO:local_logger:Now training epoch 77. LR=0.000152 +INFO:local_logger:----- Epoch[076/800], Train Loss: 0.6904, time: 859.60 +INFO:local_logger:Now training epoch 77. LR=0.000152 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-76-Loss-0.6905609986769955.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-76-Loss-0.6905609986769955.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-76-Loss-0.6905609986769955.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-76-Loss-0.6905609986769955.pdopt +INFO:local_logger:Now training epoch 77. LR=0.000152 +INFO:master_logger:Now training epoch 77. 
LR=0.000152 +INFO:local_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.6953 +INFO:local_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.6740 +INFO:local_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.6827 +INFO:master_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.7074 +INFO:local_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.7042 +INFO:local_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.6947 +INFO:local_logger:Epoch[077/800], Step[0000/0626], Avg Loss: 0.6790 +INFO:local_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6899 +INFO:local_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6911 +INFO:local_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6909 +INFO:local_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6900 +INFO:master_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[077/800], Step[0100/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6901 +INFO:local_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6903 +INFO:local_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6907 +INFO:local_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6892 +INFO:master_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[077/800], Step[0200/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6904 +INFO:local_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6908 +INFO:master_logger:Epoch[077/800], Step[0300/0626], Avg Loss: 0.6901 +INFO:local_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6903 +INFO:local_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6902 +INFO:master_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[077/800], Step[0400/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6903 +INFO:local_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6899 
+INFO:local_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6896 +INFO:master_logger:Epoch[077/800], Step[0500/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6906 +INFO:local_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6899 +INFO:local_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6898 +INFO:master_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[077/800], Step[0600/0626], Avg Loss: 0.6904 +INFO:local_logger:----- Epoch[077/800], Train Loss: 0.6899, time: 882.70 +INFO:local_logger:Now training epoch 78. LR=0.000152 +INFO:local_logger:----- Epoch[077/800], Train Loss: 0.6898, time: 884.37 +INFO:local_logger:Now training epoch 78. LR=0.000152 +INFO:local_logger:----- Epoch[077/800], Train Loss: 0.6901, time: 883.82 +INFO:local_logger:Now training epoch 78. LR=0.000152 +INFO:local_logger:----- Epoch[077/800], Train Loss: 0.6906, time: 883.80 +INFO:local_logger:----- Epoch[077/800], Train Loss: 0.6902, time: 883.80 +INFO:local_logger:Now training epoch 78. LR=0.000152 +INFO:local_logger:Now training epoch 78. LR=0.000152 +INFO:local_logger:----- Epoch[077/800], Train Loss: 0.6905, time: 883.93 +INFO:local_logger:Now training epoch 78. LR=0.000152 +INFO:local_logger:----- Epoch[077/800], Train Loss: 0.6898, time: 880.45 +INFO:master_logger:----- Epoch[077/800], Train Loss: 0.6902, time: 880.45 +INFO:local_logger:----- Epoch[077/800], Train Loss: 0.6905, time: 883.84 +INFO:local_logger:Now training epoch 78. LR=0.000152 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-77-Loss-0.6897522572932259.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-77-Loss-0.6897522572932259.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-77-Loss-0.6897522572932259.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-77-Loss-0.6897522572932259.pdopt +INFO:local_logger:Now training epoch 78. LR=0.000152 +INFO:master_logger:Now training epoch 78. 
LR=0.000152 +INFO:local_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6849 +INFO:local_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6877 +INFO:master_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6933 +INFO:local_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6979 +INFO:local_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6939 +INFO:local_logger:Epoch[078/800], Step[0000/0626], Avg Loss: 0.6743 +INFO:local_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6903 +INFO:local_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6884 +INFO:local_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6884 +INFO:local_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6902 +INFO:master_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0100/0626], Avg Loss: 0.6903 +INFO:local_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6892 +INFO:local_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6895 +INFO:master_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6899 +INFO:local_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[078/800], Step[0200/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6896 +INFO:master_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6895 +INFO:local_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6895 +INFO:local_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6903 +INFO:local_logger:Epoch[078/800], Step[0300/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6901 +INFO:master_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6894 +INFO:local_logger:Epoch[078/800], Step[0400/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6894 +INFO:local_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6898 
+INFO:master_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0500/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6895 +INFO:local_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6895 +INFO:local_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6895 +INFO:local_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6897 +INFO:master_logger:Epoch[078/800], Step[0600/0626], Avg Loss: 0.6896 +INFO:local_logger:----- Epoch[078/800], Train Loss: 0.6895, time: 850.97 +INFO:local_logger:Now training epoch 79. LR=0.000152 +INFO:local_logger:----- Epoch[078/800], Train Loss: 0.6897, time: 849.85 +INFO:local_logger:Now training epoch 79. LR=0.000152 +INFO:local_logger:----- Epoch[078/800], Train Loss: 0.6897, time: 850.45 +INFO:local_logger:Now training epoch 79. LR=0.000152 +INFO:local_logger:----- Epoch[078/800], Train Loss: 0.6894, time: 850.45 +INFO:local_logger:Now training epoch 79. LR=0.000152 +INFO:local_logger:----- Epoch[078/800], Train Loss: 0.6895, time: 846.72 +INFO:master_logger:----- Epoch[078/800], Train Loss: 0.6896, time: 846.72 +INFO:local_logger:----- Epoch[078/800], Train Loss: 0.6897, time: 850.44 +INFO:local_logger:Now training epoch 79. LR=0.000152 +INFO:local_logger:----- Epoch[078/800], Train Loss: 0.6894, time: 850.46 +INFO:local_logger:Now training epoch 79. LR=0.000152 +INFO:local_logger:----- Epoch[078/800], Train Loss: 0.6898, time: 850.47 +INFO:local_logger:Now training epoch 79. LR=0.000152 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-78-Loss-0.6895287958086865.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-78-Loss-0.6895287958086865.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-78-Loss-0.6895287958086865.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-78-Loss-0.6895287958086865.pdopt +INFO:local_logger:Now training epoch 79. LR=0.000152 +INFO:master_logger:Now training epoch 79. 
LR=0.000152 +INFO:local_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.6853 +INFO:local_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.6870 +INFO:master_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.7025 +INFO:local_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.6862 +INFO:local_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.6839 +INFO:local_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.6843 +INFO:local_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.6939 +INFO:local_logger:Epoch[079/800], Step[0000/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6901 +INFO:local_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6892 +INFO:master_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6902 +INFO:local_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[079/800], Step[0100/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6892 +INFO:local_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6900 +INFO:master_logger:Epoch[079/800], Step[0200/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6892 +INFO:local_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6894 +INFO:master_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6894 +INFO:local_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6889 +INFO:local_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[079/800], Step[0300/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6894 +INFO:local_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6891 +INFO:master_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6892 +INFO:local_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[079/800], Step[0400/0626], Avg Loss: 0.6894 +INFO:local_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6892 +INFO:local_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6895 +INFO:local_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6892 
+INFO:local_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6895 +INFO:local_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6888 +INFO:master_logger:Epoch[079/800], Step[0500/0626], Avg Loss: 0.6892 +INFO:local_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6894 +INFO:local_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6892 +INFO:master_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6892 +INFO:local_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6897 +INFO:local_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6889 +INFO:local_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6894 +INFO:local_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6896 +INFO:local_logger:Epoch[079/800], Step[0600/0626], Avg Loss: 0.6891 +INFO:local_logger:----- Epoch[079/800], Train Loss: 0.6892, time: 888.30 +INFO:local_logger:Now training epoch 80. LR=0.000152 +INFO:local_logger:----- Epoch[079/800], Train Loss: 0.6889, time: 888.13 +INFO:local_logger:----- Epoch[079/800], Train Loss: 0.6891, time: 888.74 +INFO:local_logger:Now training epoch 80. LR=0.000152 +INFO:local_logger:Now training epoch 80. LR=0.000152 +INFO:local_logger:----- Epoch[079/800], Train Loss: 0.6896, time: 888.14 +INFO:local_logger:Now training epoch 80. LR=0.000152 +INFO:local_logger:----- Epoch[079/800], Train Loss: 0.6886, time: 884.16 +INFO:master_logger:----- Epoch[079/800], Train Loss: 0.6892, time: 884.16 +INFO:local_logger:----- Epoch[079/800], Train Loss: 0.6893, time: 888.24 +INFO:local_logger:Now training epoch 80. LR=0.000152 +INFO:local_logger:----- Epoch[079/800], Train Loss: 0.6896, time: 888.24 +INFO:local_logger:Now training epoch 80. LR=0.000152 +INFO:local_logger:----- Epoch[079/800], Train Loss: 0.6893, time: 888.26 +INFO:local_logger:Now training epoch 80. LR=0.000152 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-79-Loss-0.6886302635034396.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-79-Loss-0.6886302635034396.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-79-Loss-0.6886302635034396.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-79-Loss-0.6886302635034396.pdopt +INFO:local_logger:Now training epoch 80. LR=0.000152 +INFO:master_logger:Now training epoch 80. 
LR=0.000152 +INFO:local_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.6859 +INFO:master_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.6838 +INFO:local_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.6866 +INFO:local_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.7038 +INFO:local_logger:Epoch[080/800], Step[0000/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6889 +INFO:local_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6877 +INFO:master_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6869 +INFO:local_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[080/800], Step[0100/0626], Avg Loss: 0.6898 +INFO:local_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6886 +INFO:master_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6893 +INFO:local_logger:Epoch[080/800], Step[0200/0626], Avg Loss: 0.6880 +INFO:local_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6889 +INFO:local_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6887 +INFO:master_logger:Epoch[080/800], Step[0300/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6886 +INFO:master_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[080/800], Step[0400/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6891 +INFO:local_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6891 
+INFO:local_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6888 +INFO:master_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6889 +INFO:local_logger:Epoch[080/800], Step[0500/0626], Avg Loss: 0.6889 +INFO:local_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6888 +INFO:master_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6889 +INFO:local_logger:Epoch[080/800], Step[0600/0626], Avg Loss: 0.6890 +INFO:local_logger:----- Epoch[080/800], Train Loss: 0.6890, time: 849.18 +INFO:local_logger:Now training epoch 81. LR=0.000153 +INFO:local_logger:----- Epoch[080/800], Train Loss: 0.6888, time: 849.16 +INFO:local_logger:Now training epoch 81. LR=0.000153 +INFO:local_logger:----- Epoch[080/800], Train Loss: 0.6890, time: 849.36 +INFO:local_logger:Now training epoch 81. LR=0.000153 +INFO:local_logger:----- Epoch[080/800], Train Loss: 0.6887, time: 849.41 +INFO:local_logger:Now training epoch 81. LR=0.000153 +INFO:local_logger:----- Epoch[080/800], Train Loss: 0.6888, time: 849.87 +INFO:local_logger:Now training epoch 81. LR=0.000153 +INFO:local_logger:----- Epoch[080/800], Train Loss: 0.6889, time: 849.31 +INFO:local_logger:Now training epoch 81. LR=0.000153 +INFO:local_logger:----- Epoch[080/800], Train Loss: 0.6888, time: 845.47 +INFO:master_logger:----- Epoch[080/800], Train Loss: 0.6889, time: 845.47 +INFO:local_logger:----- Epoch[080/800], Train Loss: 0.6891, time: 849.50 +INFO:local_logger:Now training epoch 81. LR=0.000153 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-80-Loss-0.6887507163269282.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-80-Loss-0.6887507163269282.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-80-Loss-0.6887507163269282.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-80-Loss-0.6887507163269282.pdopt +INFO:local_logger:Now training epoch 81. LR=0.000153 +INFO:master_logger:Now training epoch 81. 
LR=0.000153 +INFO:local_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.6842 +INFO:local_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.6873 +INFO:local_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.6999 +INFO:master_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.6908 +INFO:local_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.6984 +INFO:local_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.7003 +INFO:local_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[081/800], Step[0000/0626], Avg Loss: 0.6847 +INFO:local_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6876 +INFO:local_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6890 +INFO:local_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6882 +INFO:master_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[081/800], Step[0100/0626], Avg Loss: 0.6874 +INFO:local_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6876 +INFO:local_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6892 +INFO:local_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6886 +INFO:master_logger:Epoch[081/800], Step[0200/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6881 +INFO:master_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6887 +INFO:local_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6876 +INFO:local_logger:Epoch[081/800], Step[0300/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6879 +INFO:local_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6879 +INFO:local_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6880 +INFO:local_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6886 +INFO:master_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[081/800], Step[0400/0626], Avg Loss: 0.6888 +INFO:local_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6884 +INFO:local_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6879 +INFO:local_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6878 
+INFO:master_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[081/800], Step[0500/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6880 +INFO:local_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6880 +INFO:local_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6886 +INFO:master_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[081/800], Step[0600/0626], Avg Loss: 0.6885 +INFO:local_logger:----- Epoch[081/800], Train Loss: 0.6878, time: 886.96 +INFO:local_logger:Now training epoch 82. LR=0.000153 +INFO:local_logger:----- Epoch[081/800], Train Loss: 0.6882, time: 887.25 +INFO:local_logger:Now training epoch 82. LR=0.000153 +INFO:local_logger:----- Epoch[081/800], Train Loss: 0.6882, time: 887.32 +INFO:local_logger:Now training epoch 82. LR=0.000153 +INFO:local_logger:----- Epoch[081/800], Train Loss: 0.6883, time: 887.55 +INFO:local_logger:Now training epoch 82. LR=0.000153 +INFO:local_logger:----- Epoch[081/800], Train Loss: 0.6878, time: 887.45 +INFO:local_logger:Now training epoch 82. LR=0.000153 +INFO:local_logger:----- Epoch[081/800], Train Loss: 0.6883, time: 887.60 +INFO:local_logger:Now training epoch 82. LR=0.000153 +INFO:local_logger:----- Epoch[081/800], Train Loss: 0.6880, time: 887.62 +INFO:local_logger:Now training epoch 82. LR=0.000153 +INFO:local_logger:----- Epoch[081/800], Train Loss: 0.6885, time: 883.70 +INFO:master_logger:----- Epoch[081/800], Train Loss: 0.6882, time: 883.70 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-81-Loss-0.6885438528329545.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-81-Loss-0.6885438528329545.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-81-Loss-0.6885438528329545.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-81-Loss-0.6885438528329545.pdopt +INFO:local_logger:Now training epoch 82. LR=0.000153 +INFO:master_logger:Now training epoch 82. 
LR=0.000153 +INFO:local_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.6723 +INFO:local_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.6813 +INFO:master_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.6842 +INFO:local_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.6948 +INFO:local_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.6790 +INFO:local_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.6783 +INFO:local_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.7052 +INFO:local_logger:Epoch[082/800], Step[0000/0626], Avg Loss: 0.6743 +INFO:local_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6875 +INFO:local_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6874 +INFO:master_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6864 +INFO:local_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6886 +INFO:local_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6871 +INFO:local_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[082/800], Step[0100/0626], Avg Loss: 0.6882 +INFO:local_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6880 +INFO:local_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6875 +INFO:master_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6879 +INFO:local_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[082/800], Step[0200/0626], Avg Loss: 0.6884 +INFO:local_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6874 +INFO:local_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6873 +INFO:local_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6882 +INFO:master_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6876 +INFO:local_logger:Epoch[082/800], Step[0300/0626], Avg Loss: 0.6883 +INFO:local_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6880 +INFO:local_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6879 +INFO:local_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6871 +INFO:local_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6876 +INFO:local_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6879 +INFO:master_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6877 +INFO:local_logger:Epoch[082/800], Step[0400/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[082/800], Step[0500/0626], Avg Loss: 0.6879 +INFO:local_logger:Epoch[082/800], Step[0500/0626], Avg Loss: 0.6872 +INFO:local_logger:Epoch[082/800], Step[0500/0626], Avg Loss: 0.6878 +INFO:local_logger:Epoch[082/800], Step[0500/0626], Avg Loss: 0.6881 +INFO:local_logger:Epoch[082/800], Step[0500/0626], Avg Loss: 0.6874 +INFO:master_logger:Epoch[082/800], Step[0500/0626], Avg Loss: 0.6878 
+[nohup.out excerpt, epochs 082-094 of 800: eight local_logger workers and one master_logger report the running Avg Loss every 100 steps (626 steps per epoch); the averaged train loss declines slowly from 0.6876 at epoch 082 to roughly 0.683 by epoch 094, with LR at 0.000153-0.000154 and about 847-896 s per epoch; after every epoch the weights and optimizer state are saved to ./output/train-20211219-17-07-40/MAE-Epoch-<epoch>-Loss-<loss>.pdparams and .pdopt.]
+INFO:local_logger:Epoch[094/800], Step[0500/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[094/800], Step[0500/0626], Avg Loss: 0.6832 +INFO:master_logger:Epoch[094/800], Step[0500/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6835 +INFO:local_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6834 +INFO:local_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6840 +INFO:local_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6828 +INFO:master_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[094/800], Step[0600/0626], Avg Loss: 0.6825 +INFO:local_logger:----- Epoch[094/800], Train Loss: 0.6835, time: 860.64 +INFO:local_logger:Now training epoch 95. LR=0.000155 +INFO:local_logger:----- Epoch[094/800], Train Loss: 0.6829, time: 861.10 +INFO:local_logger:Now training epoch 95. LR=0.000155 +INFO:local_logger:----- Epoch[094/800], Train Loss: 0.6834, time: 861.04 +INFO:local_logger:Now training epoch 95. LR=0.000155 +INFO:local_logger:----- Epoch[094/800], Train Loss: 0.6827, time: 861.88 +INFO:local_logger:Now training epoch 95. LR=0.000155 +INFO:local_logger:----- Epoch[094/800], Train Loss: 0.6838, time: 861.14 +INFO:local_logger:Now training epoch 95. LR=0.000155 +INFO:local_logger:----- Epoch[094/800], Train Loss: 0.6830, time: 857.69 +INFO:master_logger:----- Epoch[094/800], Train Loss: 0.6831, time: 857.69 +INFO:local_logger:----- Epoch[094/800], Train Loss: 0.6825, time: 861.06 +INFO:local_logger:Now training epoch 95. LR=0.000155 +INFO:local_logger:----- Epoch[094/800], Train Loss: 0.6830, time: 861.17 +INFO:local_logger:Now training epoch 95. LR=0.000155 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-94-Loss-0.6830039001247509.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-94-Loss-0.6830039001247509.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-94-Loss-0.6830039001247509.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-94-Loss-0.6830039001247509.pdopt +INFO:local_logger:Now training epoch 95. LR=0.000155 +INFO:master_logger:Now training epoch 95. 
LR=0.000155 +INFO:local_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6832 +INFO:master_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6771 +INFO:local_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6843 +INFO:local_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6798 +INFO:local_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6706 +INFO:local_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6905 +INFO:local_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6843 +INFO:local_logger:Epoch[095/800], Step[0000/0626], Avg Loss: 0.6855 +INFO:local_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6835 +INFO:local_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6826 +INFO:master_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[095/800], Step[0100/0626], Avg Loss: 0.6835 +INFO:local_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6833 +INFO:local_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6836 +INFO:local_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6829 +INFO:master_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0200/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6833 +INFO:local_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6820 +INFO:master_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0300/0626], Avg Loss: 0.6835 +INFO:local_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6832 +INFO:local_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6832 +INFO:local_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6826 +INFO:master_logger:Epoch[095/800], Step[0400/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6832 +INFO:local_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6828 +INFO:master_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6827 
+INFO:local_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[095/800], Step[0500/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6825 +INFO:master_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[095/800], Step[0600/0626], Avg Loss: 0.6826 +INFO:local_logger:----- Epoch[095/800], Train Loss: 0.6831, time: 887.46 +INFO:local_logger:Now training epoch 96. LR=0.000155 +INFO:local_logger:----- Epoch[095/800], Train Loss: 0.6830, time: 886.49 +INFO:local_logger:Now training epoch 96. LR=0.000155 +INFO:local_logger:----- Epoch[095/800], Train Loss: 0.6825, time: 886.66 +INFO:local_logger:Now training epoch 96. LR=0.000155 +INFO:local_logger:----- Epoch[095/800], Train Loss: 0.6825, time: 886.64 +INFO:local_logger:Now training epoch 96. LR=0.000155 +INFO:local_logger:----- Epoch[095/800], Train Loss: 0.6829, time: 886.77 +INFO:local_logger:Now training epoch 96. LR=0.000155 +INFO:local_logger:----- Epoch[095/800], Train Loss: 0.6826, time: 886.77 +INFO:local_logger:Now training epoch 96. LR=0.000155 +INFO:local_logger:----- Epoch[095/800], Train Loss: 0.6826, time: 883.07 +INFO:master_logger:----- Epoch[095/800], Train Loss: 0.6827, time: 883.07 +INFO:local_logger:----- Epoch[095/800], Train Loss: 0.6827, time: 886.80 +INFO:local_logger:Now training epoch 96. LR=0.000155 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-95-Loss-0.6825694100624208.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-95-Loss-0.6825694100624208.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-95-Loss-0.6825694100624208.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-95-Loss-0.6825694100624208.pdopt +INFO:local_logger:Now training epoch 96. LR=0.000155 +INFO:master_logger:Now training epoch 96. 
LR=0.000155 +INFO:local_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6925 +INFO:local_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6757 +INFO:local_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6684 +INFO:master_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6799 +INFO:local_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6804 +INFO:local_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6683 +INFO:local_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6801 +INFO:local_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6900 +INFO:local_logger:Epoch[096/800], Step[0000/0626], Avg Loss: 0.6837 +INFO:local_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6836 +INFO:master_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6841 +INFO:local_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[096/800], Step[0100/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6825 +INFO:master_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[096/800], Step[0200/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6819 +INFO:master_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[096/800], Step[0300/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6822 +INFO:master_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[096/800], Step[0400/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6821 
+INFO:master_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[096/800], Step[0500/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6827 +INFO:master_logger:Epoch[096/800], Step[0600/0626], Avg Loss: 0.6824 +INFO:local_logger:----- Epoch[096/800], Train Loss: 0.6821, time: 868.69 +INFO:local_logger:Now training epoch 97. LR=0.000155 +INFO:local_logger:----- Epoch[096/800], Train Loss: 0.6821, time: 869.09 +INFO:local_logger:Now training epoch 97. LR=0.000155 +INFO:local_logger:----- Epoch[096/800], Train Loss: 0.6823, time: 868.99 +INFO:local_logger:Now training epoch 97. LR=0.000155 +INFO:local_logger:----- Epoch[096/800], Train Loss: 0.6826, time: 868.81 +INFO:local_logger:Now training epoch 97. LR=0.000155 +INFO:local_logger:----- Epoch[096/800], Train Loss: 0.6824, time: 868.69 +INFO:local_logger:----- Epoch[096/800], Train Loss: 0.6827, time: 864.96 +INFO:local_logger:Now training epoch 97. LR=0.000155 +INFO:master_logger:----- Epoch[096/800], Train Loss: 0.6824, time: 864.96 +INFO:local_logger:----- Epoch[096/800], Train Loss: 0.6828, time: 868.82 +INFO:local_logger:Now training epoch 97. LR=0.000155 +INFO:local_logger:----- Epoch[096/800], Train Loss: 0.6826, time: 868.66 +INFO:local_logger:Now training epoch 97. LR=0.000155 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-96-Loss-0.6826821214191926.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-96-Loss-0.6826821214191926.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-96-Loss-0.6826821214191926.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-96-Loss-0.6826821214191926.pdopt +INFO:local_logger:Now training epoch 97. LR=0.000155 +INFO:master_logger:Now training epoch 97. 
LR=0.000155 +INFO:local_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6880 +INFO:master_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6850 +INFO:local_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6940 +INFO:local_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6844 +INFO:local_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6755 +INFO:local_logger:Epoch[097/800], Step[0000/0626], Avg Loss: 0.6928 +INFO:local_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6822 +INFO:master_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6831 +INFO:local_logger:Epoch[097/800], Step[0100/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6823 +INFO:master_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[097/800], Step[0200/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6822 +INFO:master_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[097/800], Step[0300/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6818 +INFO:master_logger:Epoch[097/800], Step[0400/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6826 +INFO:master_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6826 
+INFO:local_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[097/800], Step[0500/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6825 +INFO:master_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[097/800], Step[0600/0626], Avg Loss: 0.6814 +INFO:local_logger:----- Epoch[097/800], Train Loss: 0.6822, time: 881.18 +INFO:local_logger:Now training epoch 98. LR=0.000155 +INFO:local_logger:----- Epoch[097/800], Train Loss: 0.6818, time: 882.30 +INFO:local_logger:Now training epoch 98. LR=0.000155 +INFO:local_logger:----- Epoch[097/800], Train Loss: 0.6822, time: 882.31 +INFO:local_logger:Now training epoch 98. LR=0.000155 +INFO:local_logger:----- Epoch[097/800], Train Loss: 0.6826, time: 882.32 +INFO:local_logger:Now training epoch 98. LR=0.000155 +INFO:local_logger:----- Epoch[097/800], Train Loss: 0.6813, time: 882.38 +INFO:local_logger:Now training epoch 98. LR=0.000155 +INFO:local_logger:----- Epoch[097/800], Train Loss: 0.6820, time: 882.40 +INFO:local_logger:Now training epoch 98. LR=0.000155 +INFO:local_logger:----- Epoch[097/800], Train Loss: 0.6818, time: 878.64 +INFO:master_logger:----- Epoch[097/800], Train Loss: 0.6820, time: 878.64 +INFO:local_logger:----- Epoch[097/800], Train Loss: 0.6823, time: 882.40 +INFO:local_logger:Now training epoch 98. LR=0.000155 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-97-Loss-0.6818455600972856.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-97-Loss-0.6818455600972856.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-97-Loss-0.6818455600972856.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-97-Loss-0.6818455600972856.pdopt +INFO:local_logger:Now training epoch 98. LR=0.000155 +INFO:master_logger:Now training epoch 98. 
LR=0.000155 +INFO:local_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.6930 +INFO:local_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.6821 +INFO:master_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.6860 +INFO:local_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.6802 +INFO:local_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.6848 +INFO:local_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.6756 +INFO:local_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.7011 +INFO:local_logger:Epoch[098/800], Step[0000/0626], Avg Loss: 0.6899 +INFO:local_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6824 +INFO:master_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[098/800], Step[0100/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6826 +INFO:master_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[098/800], Step[0200/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6822 +INFO:master_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[098/800], Step[0300/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6827 +INFO:local_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6817 +INFO:master_logger:Epoch[098/800], Step[0400/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6823 
+INFO:master_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[098/800], Step[0500/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6816 +INFO:master_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[098/800], Step[0600/0626], Avg Loss: 0.6820 +INFO:local_logger:----- Epoch[098/800], Train Loss: 0.6819, time: 870.20 +INFO:master_logger:----- Epoch[098/800], Train Loss: 0.6820, time: 870.20 +INFO:local_logger:----- Epoch[098/800], Train Loss: 0.6823, time: 875.16 +INFO:local_logger:Now training epoch 99. LR=0.000155 +INFO:local_logger:----- Epoch[098/800], Train Loss: 0.6820, time: 874.01 +INFO:local_logger:Now training epoch 99. LR=0.000155 +INFO:local_logger:----- Epoch[098/800], Train Loss: 0.6825, time: 874.00 +INFO:local_logger:Now training epoch 99. LR=0.000155 +INFO:local_logger:----- Epoch[098/800], Train Loss: 0.6817, time: 874.10 +INFO:local_logger:Now training epoch 99. LR=0.000155 +INFO:local_logger:----- Epoch[098/800], Train Loss: 0.6822, time: 874.02 +INFO:local_logger:Now training epoch 99. LR=0.000155 +INFO:local_logger:----- Epoch[098/800], Train Loss: 0.6815, time: 874.10 +INFO:local_logger:Now training epoch 99. LR=0.000155 +INFO:local_logger:----- Epoch[098/800], Train Loss: 0.6816, time: 874.02 +INFO:local_logger:Now training epoch 99. LR=0.000155 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-98-Loss-0.681889634827903.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-98-Loss-0.681889634827903.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-98-Loss-0.681889634827903.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-98-Loss-0.681889634827903.pdopt +INFO:local_logger:Now training epoch 99. LR=0.000155 +INFO:master_logger:Now training epoch 99. 
LR=0.000155 +INFO:local_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6968 +INFO:local_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6870 +INFO:local_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6865 +INFO:master_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6853 +INFO:local_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6660 +INFO:local_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6719 +INFO:local_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6836 +INFO:local_logger:Epoch[099/800], Step[0000/0626], Avg Loss: 0.6839 +INFO:local_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6834 +INFO:local_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6826 +INFO:master_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[099/800], Step[0100/0626], Avg Loss: 0.6826 +INFO:local_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6828 +INFO:local_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6822 +INFO:master_logger:Epoch[099/800], Step[0200/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6811 +INFO:master_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[099/800], Step[0300/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6825 +INFO:local_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6816 +INFO:master_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[099/800], Step[0400/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6815 
+INFO:master_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[099/800], Step[0500/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6817 +INFO:master_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[099/800], Step[0600/0626], Avg Loss: 0.6817 +INFO:local_logger:----- Epoch[099/800], Train Loss: 0.6811, time: 873.44 +INFO:local_logger:Now training epoch 100. LR=0.000155 +INFO:local_logger:----- Epoch[099/800], Train Loss: 0.6817, time: 874.12 +INFO:local_logger:Now training epoch 100. LR=0.000155 +INFO:local_logger:----- Epoch[099/800], Train Loss: 0.6814, time: 874.14 +INFO:local_logger:Now training epoch 100. LR=0.000155 +INFO:local_logger:----- Epoch[099/800], Train Loss: 0.6815, time: 874.31 +INFO:local_logger:Now training epoch 100. LR=0.000155 +INFO:local_logger:----- Epoch[099/800], Train Loss: 0.6816, time: 874.69 +INFO:local_logger:Now training epoch 100. LR=0.000155 +INFO:local_logger:----- Epoch[099/800], Train Loss: 0.6820, time: 874.75 +INFO:local_logger:Now training epoch 100. LR=0.000155 +INFO:local_logger:----- Epoch[099/800], Train Loss: 0.6814, time: 871.01 +INFO:master_logger:----- Epoch[099/800], Train Loss: 0.6816, time: 871.01 +INFO:local_logger:----- Epoch[099/800], Train Loss: 0.6820, time: 874.69 +INFO:local_logger:Now training epoch 100. LR=0.000155 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-99-Loss-0.6813920508235197.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-99-Loss-0.6813920508235197.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-99-Loss-0.6813920508235197.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-99-Loss-0.6813920508235197.pdopt +INFO:local_logger:Now training epoch 100. LR=0.000155 +INFO:master_logger:Now training epoch 100. 
LR=0.000155 +INFO:local_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6874 +INFO:local_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6858 +INFO:local_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6763 +INFO:master_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6794 +INFO:local_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6727 +INFO:local_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6920 +INFO:local_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6764 +INFO:local_logger:Epoch[100/800], Step[0000/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6824 +INFO:local_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6801 +INFO:local_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6821 +INFO:master_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[100/800], Step[0100/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6804 +INFO:local_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6812 +INFO:master_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[100/800], Step[0200/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6812 +INFO:master_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[100/800], Step[0300/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6815 +INFO:master_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[100/800], Step[0400/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6812 +INFO:master_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6814 
+INFO:local_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[100/800], Step[0500/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6814 +INFO:master_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[100/800], Step[0600/0626], Avg Loss: 0.6814 +INFO:local_logger:----- Epoch[100/800], Train Loss: 0.6815, time: 871.01 +INFO:local_logger:Now training epoch 101. LR=0.000156 +INFO:local_logger:----- Epoch[100/800], Train Loss: 0.6814, time: 870.82 +INFO:local_logger:Now training epoch 101. LR=0.000156 +INFO:local_logger:----- Epoch[100/800], Train Loss: 0.6814, time: 871.44 +INFO:local_logger:Now training epoch 101. LR=0.000156 +INFO:local_logger:----- Epoch[100/800], Train Loss: 0.6812, time: 871.50 +INFO:local_logger:Now training epoch 101. LR=0.000156 +INFO:local_logger:----- Epoch[100/800], Train Loss: 0.6812, time: 867.05 +INFO:master_logger:----- Epoch[100/800], Train Loss: 0.6813, time: 867.05 +INFO:local_logger:----- Epoch[100/800], Train Loss: 0.6811, time: 872.20 +INFO:local_logger:Now training epoch 101. LR=0.000156 +INFO:local_logger:----- Epoch[100/800], Train Loss: 0.6814, time: 870.95 +INFO:local_logger:Now training epoch 101. LR=0.000156 +INFO:local_logger:----- Epoch[100/800], Train Loss: 0.6814, time: 871.00 +INFO:local_logger:Now training epoch 101. LR=0.000156 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-100-Loss-0.6812047341083004.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-100-Loss-0.6812047341083004.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-100-Loss-0.6812047341083004.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-100-Loss-0.6812047341083004.pdopt +INFO:local_logger:Now training epoch 101. LR=0.000156 +INFO:master_logger:Now training epoch 101. 
LR=0.000156 +INFO:local_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6922 +INFO:local_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6755 +INFO:local_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6834 +INFO:master_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6772 +INFO:local_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6966 +INFO:local_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6690 +INFO:local_logger:Epoch[101/800], Step[0000/0626], Avg Loss: 0.6717 +INFO:local_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6798 +INFO:local_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6819 +INFO:master_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[101/800], Step[0100/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6813 +INFO:master_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[101/800], Step[0200/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6807 +INFO:master_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[101/800], Step[0300/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6804 +INFO:local_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6810 +INFO:master_logger:Epoch[101/800], Step[0400/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6811 
+INFO:local_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6810 +INFO:master_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[101/800], Step[0500/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6808 +INFO:master_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[101/800], Step[0600/0626], Avg Loss: 0.6809 +INFO:local_logger:----- Epoch[101/800], Train Loss: 0.6805, time: 867.69 +INFO:local_logger:Now training epoch 102. LR=0.000156 +INFO:local_logger:----- Epoch[101/800], Train Loss: 0.6806, time: 868.18 +INFO:local_logger:Now training epoch 102. LR=0.000156 +INFO:local_logger:----- Epoch[101/800], Train Loss: 0.6806, time: 868.58 +INFO:local_logger:Now training epoch 102. LR=0.000156 +INFO:local_logger:----- Epoch[101/800], Train Loss: 0.6808, time: 864.53 +INFO:master_logger:----- Epoch[101/800], Train Loss: 0.6808, time: 864.53 +INFO:local_logger:----- Epoch[101/800], Train Loss: 0.6807, time: 868.26 +INFO:local_logger:Now training epoch 102. LR=0.000156 +INFO:local_logger:----- Epoch[101/800], Train Loss: 0.6810, time: 868.29 +INFO:local_logger:Now training epoch 102. LR=0.000156 +INFO:local_logger:----- Epoch[101/800], Train Loss: 0.6811, time: 868.42 +INFO:local_logger:Now training epoch 102. LR=0.000156 +INFO:local_logger:----- Epoch[101/800], Train Loss: 0.6812, time: 868.25 +INFO:local_logger:Now training epoch 102. LR=0.000156 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-101-Loss-0.6808065795139766.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-101-Loss-0.6808065795139766.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-101-Loss-0.6808065795139766.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-101-Loss-0.6808065795139766.pdopt +INFO:local_logger:Now training epoch 102. LR=0.000156 +INFO:master_logger:Now training epoch 102. 
LR=0.000156 +INFO:local_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6861 +INFO:master_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6794 +INFO:local_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6910 +INFO:local_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6753 +INFO:local_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6669 +INFO:local_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6722 +INFO:local_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6901 +INFO:local_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6797 +INFO:local_logger:Epoch[102/800], Step[0000/0626], Avg Loss: 0.6743 +INFO:local_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6830 +INFO:local_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6822 +INFO:local_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6813 +INFO:local_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6817 +INFO:master_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[102/800], Step[0100/0626], Avg Loss: 0.6804 +INFO:local_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6807 +INFO:master_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6819 +INFO:local_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[102/800], Step[0200/0626], Avg Loss: 0.6803 +INFO:local_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6814 +INFO:master_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6821 +INFO:local_logger:Epoch[102/800], Step[0300/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6816 +INFO:local_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6804 +INFO:local_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6810 +INFO:master_logger:Epoch[102/800], Step[0400/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6807 
+INFO:local_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6820 +INFO:local_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6811 +INFO:master_logger:Epoch[102/800], Step[0500/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6817 +INFO:local_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6810 +INFO:master_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6811 +INFO:local_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[102/800], Step[0600/0626], Avg Loss: 0.6813 +INFO:local_logger:----- Epoch[102/800], Train Loss: 0.6812, time: 872.04 +INFO:local_logger:Now training epoch 103. LR=0.000156 +INFO:local_logger:----- Epoch[102/800], Train Loss: 0.6812, time: 868.78 +INFO:master_logger:----- Epoch[102/800], Train Loss: 0.6811, time: 868.78 +INFO:local_logger:----- Epoch[102/800], Train Loss: 0.6809, time: 872.56 +INFO:local_logger:Now training epoch 103. LR=0.000156 +INFO:local_logger:----- Epoch[102/800], Train Loss: 0.6805, time: 872.77 +INFO:local_logger:Now training epoch 103. LR=0.000156 +INFO:local_logger:----- Epoch[102/800], Train Loss: 0.6808, time: 873.71 +INFO:local_logger:Now training epoch 103. LR=0.000156 +INFO:local_logger:----- Epoch[102/800], Train Loss: 0.6809, time: 873.10 +INFO:local_logger:Now training epoch 103. LR=0.000156 +INFO:local_logger:----- Epoch[102/800], Train Loss: 0.6815, time: 873.19 +INFO:local_logger:Now training epoch 103. LR=0.000156 +INFO:local_logger:----- Epoch[102/800], Train Loss: 0.6815, time: 873.09 +INFO:local_logger:Now training epoch 103. LR=0.000156 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-102-Loss-0.681159841605351.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-102-Loss-0.681159841605351.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-102-Loss-0.681159841605351.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-102-Loss-0.681159841605351.pdopt +INFO:local_logger:Now training epoch 103. LR=0.000156 +INFO:master_logger:Now training epoch 103. 
LR=0.000156 +INFO:local_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6787 +INFO:master_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6646 +INFO:local_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6794 +INFO:local_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6850 +INFO:local_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6838 +INFO:local_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6909 +INFO:local_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6721 +INFO:local_logger:Epoch[103/800], Step[0000/0626], Avg Loss: 0.6917 +INFO:local_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6811 +INFO:master_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6829 +INFO:local_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6810 +INFO:local_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6795 +INFO:local_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6823 +INFO:local_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6818 +INFO:local_logger:Epoch[103/800], Step[0100/0626], Avg Loss: 0.6802 +INFO:local_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6799 +INFO:local_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6810 +INFO:master_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6815 +INFO:local_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6804 +INFO:local_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6797 +INFO:local_logger:Epoch[103/800], Step[0200/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6803 +INFO:local_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6799 +INFO:local_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6803 +INFO:local_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6801 +INFO:local_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6806 +INFO:master_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[103/800], Step[0300/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6802 +INFO:local_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6800 +INFO:local_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6805 +INFO:master_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6814 +INFO:local_logger:Epoch[103/800], Step[0400/0626], Avg Loss: 0.6803 +INFO:local_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6802 +INFO:local_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6812 +INFO:local_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6802 +INFO:local_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6807 +INFO:local_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6808 +INFO:local_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6809 
+INFO:local_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6804 +INFO:master_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[103/800], Step[0500/0626], Avg Loss: 0.6804 +INFO:local_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6806 +INFO:local_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6801 +INFO:local_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6802 +INFO:master_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6805 +INFO:local_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6802 +INFO:local_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6809 +INFO:local_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6802 +INFO:local_logger:Epoch[103/800], Step[0600/0626], Avg Loss: 0.6806 +INFO:local_logger:----- Epoch[103/800], Train Loss: 0.6801, time: 859.70 +INFO:local_logger:Now training epoch 104. LR=0.000156 +INFO:local_logger:----- Epoch[103/800], Train Loss: 0.6803, time: 859.89 +INFO:local_logger:Now training epoch 104. LR=0.000156 +INFO:local_logger:----- Epoch[103/800], Train Loss: 0.6805, time: 859.89 +INFO:local_logger:Now training epoch 104. LR=0.000156 +INFO:local_logger:----- Epoch[103/800], Train Loss: 0.6809, time: 856.80 +INFO:master_logger:----- Epoch[103/800], Train Loss: 0.6805, time: 856.80 +INFO:local_logger:----- Epoch[103/800], Train Loss: 0.6806, time: 860.28 +INFO:local_logger:Now training epoch 104. LR=0.000156 +INFO:local_logger:----- Epoch[103/800], Train Loss: 0.6801, time: 859.89 +INFO:local_logger:Now training epoch 104. LR=0.000156 +INFO:local_logger:----- Epoch[103/800], Train Loss: 0.6802, time: 859.91 +INFO:local_logger:Now training epoch 104. LR=0.000156 +INFO:local_logger:----- Epoch[103/800], Train Loss: 0.6809, time: 860.42 +INFO:local_logger:Now training epoch 104. LR=0.000156 +INFO:local_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-103-Loss-0.6808819352382769.pdparams +INFO:local_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-103-Loss-0.6808819352382769.pdopt +INFO:master_logger:----- Save model: ./output/train-20211219-17-07-40/MAE-Epoch-103-Loss-0.6808819352382769.pdparams +INFO:master_logger:----- Save optim: ./output/train-20211219-17-07-40/MAE-Epoch-103-Loss-0.6808819352382769.pdopt +INFO:local_logger:Now training epoch 104. LR=0.000156 +INFO:master_logger:Now training epoch 104. LR=0.000156 +INFO:local_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6583 +INFO:local_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6660 +INFO:local_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6799 +INFO:master_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6722 +INFO:local_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6668 +INFO:local_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6885 +INFO:local_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6790 +INFO:local_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6707 +INFO:local_logger:Epoch[104/800], Step[0000/0626], Avg Loss: 0.6680 + + +-------------------------------------- +C++ Traceback (most recent call last): +-------------------------------------- +0 paddle::platform::GpuMemcpySync(void*, void const*, unsigned long, cudaMemcpyKind) + +---------------------- +Error Message Summary: +---------------------- +FatalError: `Termination signal` is detected by the operating system. 
+ [TimeInfo: *** Aborted at 1639995159 (unix time) try "date -d @1639995159" if you are using GNU date ***] + [SignalInfo: *** SIGTERM (@0x84e5) received by PID 25456 (TID 0x7f771efbe700) from PID 34021 ***] + + + +-------------------------------------- +C++ Traceback (most recent call last): +-------------------------------------- +0 paddle::platform::GpuMemcpySync(void*, void const*, unsigned long, cudaMemcpyKind) + +---------------------- +Error Message Summary: +---------------------- +FatalError: `Termination signal` is detected by the operating system. + [TimeInfo: *** Aborted at 1639995171 (unix time) try "date -d @1639995171" if you are using GNU date ***] + [SignalInfo: *** SIGTERM (@0x84e5) received by PID 25537 (TID 0x7fcf37fc6700) from PID 34021 ***] + +Traceback (most recent call last): + File "main_multi_gpu_pretrain.py", line 416, in + main() + File "main_multi_gpu_pretrain.py", line 412, in main + dist.spawn(main_worker, args=(config, dataset_train, ), nprocs=config.NGPUS) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 502, in spawn + while not context.join(): + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 312, in join + self._throw_exception(error_index) + File "/opt/conda/envs/py36/lib/python3.6/site-packages/paddle/distributed/spawn.py", line 320, in _throw_exception + (error_index, name)) +Exception: Process 7 terminated with signal SIGTERM. +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 14 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 14 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 14 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 14 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 14 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 14 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 14 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 14 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 20 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 20 leaked semaphores to clean up at shutdown + len(cache)) +/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 20 leaked semaphores to clean up at shutdown + len(cache)) 
+/opt/conda/envs/py36/lib/python3.6/multiprocessing/semaphore_tracker.py:143: UserWarning: semaphore_tracker: There appear to be 20 leaked semaphores to clean up at shutdown + len(cache)) diff --git a/image_classification/MLP-Mixer/augment.py b/image_classification/MLP-Mixer/augment.py new file mode 100644 index 00000000..7a7f081c --- /dev/null +++ b/image_classification/MLP-Mixer/augment.py @@ -0,0 +1,285 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Augmentation""" +""" Rand Augmentation """ +# reference: RandAugment: Practical automated data augmentation with a reduced search space +# https://arxiv.org/abs/1909.13719 + +""" Auto Augmentation """ +# reference: AutoAugment: Learning Augmentation Policies from Data +# https://arxiv.org/abs/1805.09501 + +import random +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def auto_augment_policy_original(): + """25 types of augment policies in original paper""" + policy = [ + [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + policy = [[SubPolicy(*args) for args in subpolicy] for subpolicy in policy] + return policy + + +def rand_augment_policy_original(magnitude_idx=9): + """ + 14 types of augment policies in original paper + Args: + magnitude_idx: M + """ + policy = [ + ('Posterize', 1, magnitude_idx), ('Rotate', 1, magnitude_idx), + ('Solarize', 1, magnitude_idx), ('AutoContrast', 1, magnitude_idx), + ('Equalize', 1, magnitude_idx), ('Contrast', 1, magnitude_idx), + ('Color', 1, magnitude_idx), ('Invert', 1, magnitude_idx), + ('Sharpness', 1, magnitude_idx), ('Brightness', 1, magnitude_idx), + ('ShearX', 1, magnitude_idx), ('ShearY', 1, magnitude_idx), + ('TranslateX', 1, magnitude_idx), ('TranslateY', 1, magnitude_idx), + ] + policy = [SubPolicy(*args) for args in policy] + return policy + + +class AutoAugment(): + """Auto Augment + Randomly choose a 
tuple of augment ops from a list of policy + Then apply the tuple of augment ops to input image + + Examples: + policy = auto_augment_policy_original() + augment = AutoAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy): + self.policy = policy + + def __call__(self, image, policy_idx=None): + if policy_idx is None: + policy_idx = random.randint(0, len(self.policy) - 1) + + sub_policy = self.policy[policy_idx] + for op in sub_policy: + image = op(image) + return image + + +class RandAugment(): + """Rand Augment + Randomly choose N augment ops from a list of K policies + Then apply the N ops to input image + + Examples: + policy = rand_augment_policy_original(magnitude_idx) + augment = RandAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy, num_layers=2): + """ + Args: + policy: list of SubPolicy + num_layers: int + """ + self.policy = policy + self.num_layers = num_layers + + def __call__(self, image): + selected_idx = np.random.choice(len(self.policy), self.num_layers) + + for policy_idx in selected_idx: + sub_policy = self.policy[policy_idx] + image = sub_policy(image) + return image + + +class SubPolicy: + """Subpolicy + Read augment name and magnitude, apply augment with probability + Args: + op_name: str, augment operation name + prob: float, if prob > random prob, apply augment + magnitude_idx: int, index of magnitude in preset magnitude ranges + """ + + def __init__(self, op_name, prob, magnitude_idx): + # ranges of operations' magnitude + ranges = { + 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'TranslateX': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'TranslateY': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'Rotate': np.linspace(0, 30, 10), # [-30, 30] (by random negative) + 'Color': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), # [0, 4] + 'Solarize': np.linspace(256, 0, 10), # [0, 256] + 'Contrast': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Sharpness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Brightness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'AutoContrast': [0] * 10, # no range + 'Equalize': [0] * 10, # no range + 'Invert': [0] * 10, # no range + } + + # augmentation operations + # Lambda is not pickleable for DDP + # image_ops = { + # 'ShearX': lambda image, magnitude: shear_x(image, magnitude), + # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), + # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), + # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), + # 'Rotate': lambda image, magnitude: rotate(image, magnitude), + # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), + # 'Invert': lambda image, magnitude: invert(image, magnitude), + # 'Equalize': lambda image, magnitude: equalize(image, magnitude), + # 'Solarize': lambda image, magnitude: solarize(image, magnitude), + # 'Posterize': lambda image, magnitude: posterize(image, magnitude), + # 'Contrast': lambda image, magnitude: contrast(image, magnitude), + # 'Color': lambda image, magnitude: color(image, magnitude), + # 'Brightness': lambda image, magnitude: brightness(image, magnitude), + # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), + # } + image_ops = { + 
'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_relative, + 'TranslateY': translate_y_relative, + 'Rotate': rotate, + 'AutoContrast': auto_contrast, + 'Invert': invert, + 'Equalize': equalize, + 'Solarize': solarize, + 'Posterize': posterize, + 'Contrast': contrast, + 'Color': color, + 'Brightness': brightness, + 'Sharpness': sharpness, + } + + self.prob = prob + self.magnitude = ranges[op_name][magnitude_idx] + self.op = image_ops[op_name] + + def __call__(self, image): + if self.prob > random.random(): + image = self.op(image, self.magnitude) + return image + + +# PIL Image transforms +# https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.transform +def shear_x(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), fillcolor=fillcolor) + + +def shear_y(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), fillcolor=fillcolor) + + +def translate_x_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), fillcolor=fillcolor) + + +def translate_x_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, magnitude, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude), fillcolor=fillcolor) + + +def rotate(image, magnitude): + rot = image.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new('RGBA', rot.size, (128,) * 4), + rot).convert(image.mode) + + +def auto_contrast(image, magnitude=None): + return ImageOps.autocontrast(image) + + +def invert(image, magnitude=None): + return ImageOps.invert(image) + + +def equalize(image, magnitude=None): + return ImageOps.equalize(image) + + +def solarize(image, magnitude): + return ImageOps.solarize(image, magnitude) + + +def posterize(image, magnitude): + return ImageOps.posterize(image, magnitude) + + +def contrast(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Contrast(image).enhance(1 + magnitude) + + +def color(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Color(image).enhance(1 + magnitude) + + +def brightness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Brightness(image).enhance(1 + magnitude) + + +def sharpness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Sharpness(image).enhance(1 + magnitude) + diff --git a/image_classification/MLP-Mixer/config.py b/image_classification/MLP-Mixer/config.py index a2903bf9..86a91247 100644 --- 
a/image_classification/MLP-Mixer/config.py +++ b/image_classification/MLP-Mixer/config.py @@ -45,8 +45,9 @@ _C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None _C.MODEL.NUM_CLASSES = 1000 -_C.MODEL.DROPOUT = 0.1 -_C.MODEL.DROPPATH = 0.1 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 # transformer settings _C.MODEL.MIXER = CN() @@ -58,13 +59,14 @@ _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 _C.TRAIN.NUM_EPOCHS = 300 -_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size -_C.TRAIN.WEIGHT_DECAY = 0.01 #0.3 # 0.0 for finetune -_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune -_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 -_C.TRAIN.END_LR = 1e-5 -_C.TRAIN.GRAD_CLIP = 1.0 -_C.TRAIN.ACCUM_ITER = 2 #1 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.WARMUP_START_LR = 5e-7 +_C.TRAIN.END_LR = 5e-6 +_C.TRAIN.GRAD_CLIP = 5.0 +_C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -78,6 +80,24 @@ _C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW _C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 +# train augmentation +_C.TRAIN.MIXUP_ALPHA = 0.8 +_C.TRAIN.CUTMIX_ALPHA = 1.0 +_C.TRAIN.CUTMIX_MINMAX = None +_C.TRAIN.MIXUP_PROB = 1.0 +_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 +_C.TRAIN.MIXUP_MODE = 'batch' + +_C.TRAIN.SMOOTHING = 0.1 +_C.TRAIN.COLOR_JITTER = 0.4 +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False + +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' +_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_SPLIT = False + # misc _C.SAVE = "./output" _C.TAG = "default" @@ -120,6 +140,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/MLP-Mixer/datasets.py b/image_classification/MLP-Mixer/datasets.py index fc3e8bad..304df9a3 100644 --- a/image_classification/MLP-Mixer/datasets.py +++ b/image_classification/MLP-Mixer/datasets.py @@ -19,8 +19,20 @@ import os import math -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from PIL import Image +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from augment import auto_augment_policy_original +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from transforms import RandomHorizontalFlip +from random_erasing import RandomErasing + class ImageNet2012Dataset(Dataset): """Build ImageNet2012 dataset @@ -80,12 +92,36 @@ def get_train_transforms(config): transforms_train: training transforms """ - transforms_train = transforms.Compose([ + aug_op_list = [] + # STEP1: random crop and resize + aug_op_list.append( transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) + scale=(0.05, 1.0), interpolation='bicubic')) + # STEP2: auto_augment or color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = 
auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # STEP3: other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, + std=config.DATA.IMAGENET_STD)) + # STEP4: random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + # Final: compose transforms and return + transforms_train = transforms.Compose(aug_op_list) return transforms_train @@ -105,7 +141,7 @@ def get_val_transforms(config): scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) transforms_val = transforms.Compose([ - transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.Resize(scale_size, interpolation='bicubic'), transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), transforms.ToTensor(), transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), @@ -123,6 +159,7 @@ def get_dataset(config, mode='train'): Returns: dataset: dataset object """ + assert mode in ['train', 'val'] if config.DATA.DATASET == "cifar10": if mode == 'train': diff --git a/image_classification/MLP-Mixer/losses.py b/image_classification/MLP-Mixer/losses.py new file mode 100644 index 00000000..082467a3 --- /dev/null +++ b/image_classification/MLP-Mixer/losses.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
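For reference, the train/val pipelines that the updated get_train_transforms/get_val_transforms above compose with the default config (AUTO_AUGMENT and RAND_AUGMENT left False) look roughly like the sketch below; the image size, crop percentage and ImageNet mean/std values are assumed defaults rather than values quoted from this patch, and the optional RandomErasing step from random_erasing.py is omitted.

import math
from paddle.vision import transforms

IMAGE_SIZE = 224                          # assumed config.DATA.IMAGE_SIZE
CROP_PCT = 0.875                          # assumed config.DATA.CROP_PCT
IMAGENET_MEAN = [0.485, 0.456, 0.406]     # assumed config.DATA.IMAGENET_MEAN
IMAGENET_STD = [0.229, 0.224, 0.225]      # assumed config.DATA.IMAGENET_STD
COLOR_JITTER = 0.4                        # config.TRAIN.COLOR_JITTER default above

# training: random resized crop -> color jitter -> tensor -> normalize
# (RandomErasing is appended after Normalize when RANDOM_ERASE_PROB > 0)
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop((IMAGE_SIZE, IMAGE_SIZE),
                                 scale=(0.05, 1.0),
                                 interpolation='bicubic'),
    transforms.ColorJitter(COLOR_JITTER, COLOR_JITTER, COLOR_JITTER),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# validation: resize the shorter side to IMAGE_SIZE / CROP_PCT, then center crop
scale_size = int(math.floor(IMAGE_SIZE / CROP_PCT))
val_transforms = transforms.Compose([
    transforms.Resize(scale_size, interpolation='bicubic'),
    transforms.CenterCrop((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])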
+ +""" Implement Loss functions """ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class LabelSmoothingCrossEntropyLoss(nn.Layer): + """ cross entropy loss for label smoothing + Args: + smoothing: float, smoothing rate + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, target label with shape [N] + Return: + loss: float, cross entropy loss value + """ + def __init__(self, smoothing=0.1): + super().__init__() + assert 0 <= smoothing < 1.0 + self.smoothing = smoothing + self.confidence = 1 - smoothing + + def forward(self, x, target): + log_probs = F.log_softmax(x) # [N, num_classes] + # target_index is used to get prob for each of the N samples + target_index = paddle.zeros([x.shape[0], 2], dtype='int64') # [N, 2] + target_index[:, 0] = paddle.arange(x.shape[0]) + target_index[:, 1] = target + + nll_loss = -log_probs.gather_nd(index=target_index) # index: [N] + smooth_loss = -log_probs.mean(axis=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + + +class SoftTargetCrossEntropyLoss(nn.Layer): + """ cross entropy loss for soft target + Args: + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, soft target with shape [N, num_classes] + Returns: + loss: float, the mean loss value + """ + def __init__(self): + super().__init__() + + def forward(self, x, target): + loss = paddle.sum(-target * F.log_softmax(x, axis=-1), axis=-1) + return loss.mean() + + +class DistillationLoss(nn.Layer): + """Distillation loss function + This layer includes the orginal loss (criterion) and a extra + distillation loss (criterion), which computes the loss with + different type options, between current model and + a teacher model as its supervision. 
+class DistillationLoss(nn.Layer): + """Distillation loss function + This layer includes the original loss (criterion) and an extra + distillation loss (criterion), which computes the loss with + different type options, between current model and + a teacher model as its supervision. + + Args: + base_criterion: nn.Layer, the original criterion + teacher_model: nn.Layer, the teacher model as supervision + distillation_type: str, one of ['none', 'soft', 'hard'] + alpha: float, ratio of base loss (* (1-alpha)) + and distillation loss (* alpha) + tau: float, temperature in distillation + """ + def __init__(self, + base_criterion, + teacher_model, + distillation_type, + alpha, + tau): + super().__init__() + assert distillation_type in ['none', 'soft', 'hard'] + self.base_criterion = base_criterion + self.teacher_model = teacher_model + self.type = distillation_type + self.alpha = alpha + self.tau = tau + + def forward(self, inputs, outputs, targets): + """ + Args: + inputs: tensor, the original model inputs + outputs: tensor, the outputs of the model + outputs_kd: tensor, the distillation outputs of the model, + this is usually obtained by a separate branch + in the last layer of the model + targets: tensor, the labels for the base criterion + """ + outputs, outputs_kd = outputs[0], outputs[1] + base_loss = self.base_criterion(outputs, targets) + if self.type == 'none': + return base_loss + + with paddle.no_grad(): + teacher_outputs = self.teacher_model(inputs) + + if self.type == 'soft': + distillation_loss = F.kl_div( + F.log_softmax(outputs_kd / self.tau, axis=1), + F.log_softmax(teacher_outputs / self.tau, axis=1), + reduction='sum') * (self.tau * self.tau) / outputs_kd.numel() + elif self.type == 'hard': + distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(axis=1)) + + loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha + return loss + + diff --git a/image_classification/MLP-Mixer/main_multi_gpu.py b/image_classification/MLP-Mixer/main_multi_gpu.py index 0f055b56..e856e496 100644 --- a/image_classification/MLP-Mixer/main_multi_gpu.py +++ b/image_classification/MLP-Mixer/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
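The hunks below restructure the multi-GPU entry point; the heart of the new training setup is how the Mixup function and the loss are chosen together (STEP 3/4). A condensed sketch of that logic, pulled together from the code added below, is shown here; build_mixup_and_criteria is only an illustrative wrapper, not a function in this patch, and Mixup comes from the mixup.py added alongside it.

import paddle.nn as nn
from mixup import Mixup
from losses import LabelSmoothingCrossEntropyLoss
from losses import SoftTargetCrossEntropyLoss

def build_mixup_and_criteria(config):
    """Mirror of STEP 3/4 below: pick mixup fn, train criterion and val criterion."""
    mixup_fn = None
    if (config.TRAIN.MIXUP_PROB > 0
            or config.TRAIN.CUTMIX_ALPHA > 0
            or config.TRAIN.CUTMIX_MINMAX is not None):
        mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA,
                         cutmix_alpha=config.TRAIN.CUTMIX_ALPHA,
                         cutmix_minmax=config.TRAIN.CUTMIX_MINMAX,
                         prob=config.TRAIN.MIXUP_PROB,
                         switch_prob=config.TRAIN.MIXUP_SWITCH_PROB,
                         mode=config.TRAIN.MIXUP_MODE,
                         label_smoothing=config.TRAIN.SMOOTHING,
                         num_classes=config.MODEL.NUM_CLASSES)

    if config.TRAIN.MIXUP_PROB > 0.:
        criterion = SoftTargetCrossEntropyLoss()      # mixup yields soft targets
    elif config.TRAIN.SMOOTHING:
        criterion = LabelSmoothingCrossEntropyLoss()  # hard labels + smoothing
    else:
        criterion = nn.CrossEntropyLoss()
    criterion_val = nn.CrossEntropyLoss()             # validation always uses plain CE
    return mixup_fn, criterion, criterion_val

Validation keeps plain CrossEntropyLoss because the val loader yields hard labels and no mixing or smoothing is applied at eval time.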
@@ -27,53 +27,53 @@ import paddle.distributed as dist from datasets import get_dataloader from datasets import get_dataset -from mlp_mixer import build_mlp_mixer as build_model from utils import AverageMeter from utils import WarmupCosineScheduler from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from mlp_mixer import build_mlp_mixer as build_model -parser = argparse.ArgumentParser('MLP-Mixer') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -arguments = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, arguments) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('MLP-Mixer') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = 
logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -81,20 +81,28 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + mixup_fn=None, + amp=False, + local_logger=None, + master_logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch debug_steps: int, num of iters to log info, default: 100 accum_iter: int, num of iters for accumulating gradients, default: 1 + mixup_fn: Mixup, mixup instance, default: None amp: bool, if True, use mix precision training, default: False + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: train_loss_meter.avg train_acc_meter.avg @@ -103,6 +111,9 @@ def train(dataloader, model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + master_train_loss_meter = AverageMeter() + master_train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() @@ -110,24 +121,26 @@ def train(dataloader, for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - else: + else: # full precision training output = model(image) loss = criterion(output, label) #NOTE: division may be needed depending on the loss function # Here no division is needed: # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' - # #loss = loss / accum_iter loss.backward() @@ -136,41 +149,82 @@ def train(dataloader, optimizer.clear_grad() pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) - batch_size = image.shape[0] - train_loss_meter.update(loss.numpy()[0], batch_size) - train_acc_meter.update(acc.numpy()[0], batch_size) + batch_size = paddle.to_tensor(image.shape[0]) - if batch_id % debug_steps == 0: - logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {train_loss_meter.avg:.4f}, " + - f"Avg Acc: {train_acc_meter.avg:.4f}") + # sync from other gpus for overall loss and acc + master_loss = loss.clone() + master_acc = acc.clone() + master_batch_size = batch_size.clone() + dist.all_reduce(master_loss) + dist.all_reduce(master_acc) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc = master_acc / dist.get_world_size() + master_train_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_train_acc_meter.update(master_acc.numpy()[0], master_batch_size.numpy()[0]) - train_time = time.time() - time_st - return train_loss_meter.avg, 
train_acc_meter.avg, train_time + train_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + train_acc_meter.update(acc.numpy()[0], batch_size.numpy()[0]) + if batch_id % debug_steps == 0: + if local_logger: + local_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_train_loss_meter.avg:.4f}, " + + f"Avg Acc: {master_train_acc_meter.avg:.4f}") -def validate(dataloader, model, criterion, total_batch, debug_steps=100): + train_time = time.time() - time_st + return (train_loss_meter.avg, + train_acc_meter.avg, + master_train_loss_meter.avg, + master_train_acc_meter.avg, + train_time) + + +def validate(dataloader, + model, + criterion, + total_batch, + debug_steps=100, + local_logger=None, + master_logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + debug_steps: int, num of iters to log info, default: 100 + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + master_val_loss_meter.avg: float, average loss on all processes/gpus + master_val_acc1_meter.avg: float, average top1 accuracy on all processes/gpus + master_val_acc5_meter.avg: float, average top5 accuracy on all processes/gpus + val_time: float, validation time """ model.eval() val_loss_meter = AverageMeter() val_acc1_meter = AverageMeter() val_acc5_meter = AverageMeter() + master_val_loss_meter = AverageMeter() + master_val_acc1_meter = AverageMeter() + master_val_acc5_meter = AverageMeter() time_st = time.time() with paddle.no_grad(): @@ -185,63 +239,140 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) - dist.all_reduce(loss) - dist.all_reduce(acc1) - dist.all_reduce(acc5) - loss = loss / dist.get_world_size() - acc1 = acc1 / dist.get_world_size() - acc5 = acc5 / dist.get_world_size() - batch_size = paddle.to_tensor(image.shape[0]) - dist.all_reduce(batch_size) + + master_loss = loss.clone() + master_acc1 = acc1.clone() + master_acc5 = acc5.clone() + master_batch_size = batch_size.clone() + + dist.all_reduce(master_loss) + dist.all_reduce(master_acc1) + dist.all_reduce(master_acc5) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc1 = master_acc1 / dist.get_world_size() + master_acc5 = master_acc5 / dist.get_world_size() + + master_val_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc1_meter.update(master_acc1.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc5_meter.update(master_acc5.numpy()[0], master_batch_size.numpy()[0]) val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) 
val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) if batch_id % debug_steps == 0: - logger.info( - f"Val Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {val_loss_meter.avg:.4f}, " + - f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + - f"Avg Acc@5: {val_acc5_meter.avg:.4f}") - + if local_logger: + local_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {master_val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {master_val_acc5_meter.avg:.4f}") val_time = time.time() - time_st - return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + return (val_loss_meter.avg, + val_acc1_meter.avg, + val_acc5_meter.avg, + master_val_loss_meter.avg, + master_val_acc1_meter.avg, + master_val_acc5_meter.avg, + val_time) def main_worker(*args): - # 0. Preparation + # STEP 0: Preparation + config = args[0] dist.init_parallel_env() last_epoch = config.TRAIN.LAST_EPOCH - world_size = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + world_size = dist.get_world_size() + local_rank = dist.get_rank() seed = config.SEED + local_rank paddle.seed(seed) np.random.seed(seed) random.seed(seed) - # 1. Create model + # logger for each process/gpu + local_logger = get_logger( + filename=os.path.join(config.SAVE, 'log_{}.txt'.format(local_rank)), + logger_name='local_logger') + # overall logger + if local_rank == 0: + master_logger = get_logger( + filename=os.path.join(config.SAVE, 'log.txt'), + logger_name='master_logger') + master_logger.info(f'\n{config}') + else: + master_logger = None + local_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + if local_rank == 0: + master_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + + # STEP 1: Create model model = build_model(config) model = paddle.DataParallel(model) - # 2. Create train and val dataloader + + # STEP 2: Create train and val dataloader dataset_train, dataset_val = args[1], args[2] # Create training dataloader if not config.EVAL: dataloader_train = get_dataloader(config, dataset_train, 'train', True) total_batch_train = len(dataloader_train) - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + local_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') if local_rank == 0: - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + master_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') # Create validation dataloader dataloader_val = get_dataloader(config, dataset_val, 'test', True) total_batch_val = len(dataloader_val) - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + local_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') if local_rank == 0: - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. 
Define optimizer and lr_scheduler + master_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -263,7 +394,9 @@ def main_worker(*args): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + local_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") if config.TRAIN.OPTIMIZER.NAME == "SGD": @@ -294,77 +427,120 @@ def main_worker(*args): # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 5. 
Load pretrained model / load resumt model and optimizer states + # STEP 6: Load pretrained model / load resumt model and optimizer states if config.MODEL.PRETRAINED: if (config.MODEL.PRETRAINED).endswith('.pdparams'): raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) - logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + local_logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + if local_rank == 0: + master_logger.info( + f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) - logger.info( - f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + local_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + if local_rank == 0: + master_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") - # 6. Validation + # STEP 7: Validation (eval mode) if config.EVAL: - logger.info('----- Start Validating') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info('----- Start Validating') + if local_rank == 0: + master_logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") return - # 6. Start training and validation - logging.info(f"Start training from epoch {last_epoch+1}.") + # STEP 8: Start training and validation (train mode) + local_logger.info(f"Start training from epoch {last_epoch+1}.") + if local_rank == 0: + master_logger.info(f"Start training from epoch {last_epoch+1}.") for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") - train_loss, train_acc, train_time = train(dataloader=dataloader_train, - model=model, - criterion=criterion, - optimizer=optimizer, - epoch=epoch, - total_batch=total_batch_train, - debug_steps=config.REPORT_FREQ, - accum_iter=config.TRAIN.ACCUM_ITER, - amp=config.AMP) + local_logger.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + if local_rank == 0: + master_logger.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, avg_loss, avg_acc, train_time = train( + dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + mixup_fn=mixup_fn, + amp=config.AMP, + local_logger=local_logger, + master_logger=master_logger) + scheduler.step() - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Train Loss: {train_loss:.4f}, " + - f"Train Acc: {train_acc:.4f}, " + - f"time: {train_time:.2f}") + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {avg_loss:.4f}, " + + f"Train Acc: {avg_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: - logger.info(f'----- Validation after Epoch: {epoch}') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info(f'----- Validation after Epoch: {epoch}') + if local_rank == 0: + master_logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") # model save if local_rank == 0: if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: @@ -372,18 +548,33 @@ def main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - logger.info(f"----- Save model: {model_path}.pdparams") - logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") def main(): + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + + # set output folder + if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + + if not os.path.exists(config.SAVE): + 
os.makedirs(config.SAVE, exist_ok=True) + + # get dataset and start DDP if not config.EVAL: dataset_train = get_dataset(config, mode='train') else: dataset_train = None dataset_val = get_dataset(config, mode='val') config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS - dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + dist.spawn(main_worker, args=(config, dataset_train, dataset_val, ), nprocs=config.NGPUS) if __name__ == "__main__": diff --git a/image_classification/MLP-Mixer/main_single_gpu.py b/image_classification/MLP-Mixer/main_single_gpu.py index dff2c21e..e4a82077 100644 --- a/image_classification/MLP-Mixer/main_single_gpu.py +++ b/image_classification/MLP-Mixer/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,55 +26,54 @@ import paddle.nn.functional as F from datasets import get_dataloader from datasets import get_dataset -from mlp_mixer import build_mlp_mixer as build_model from utils import AverageMeter from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from mlp_mixer import build_mlp_mixer as build_model -parser = argparse.ArgumentParser('MLP-Mixer') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -args = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, args) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) - -#config.freeze() - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('MLP-Mixer') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + 
parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -83,49 +81,57 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + mixup_fn=None, + amp=False, + logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info - accum_iter: int, num of iters for accumulating gradients - amp: bool, if True, use mix precision training + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + accum_iter: int, num of iters for accumulating gradients, default: 1 + mixup_fn: Mixup, mixup instance, default: None + amp: bool, if True, use mix precision training, default: False + logger: logger for logging, default: None Returns: - train_loss_meter.avg - train_acc_meter.avg - train_time + train_loss_meter.avg: float, average loss on current process/gpu + train_acc_meter.avg: float, average top1 accuracy on current process/gpu + train_time: float, training time """ model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() - for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - - else: + else: # full precision training output = model(image) loss = criterion(output, label) #NOTE: division may be needed depending on the loss function @@ -139,15 +145,18 @@ def train(dataloader, optimizer.clear_grad() pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, 
label_orig.unsqueeze(1)) batch_size = image.shape[0] train_loss_meter.update(loss.numpy()[0], batch_size) train_acc_meter.update(acc.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + f"Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {train_loss_meter.avg:.4f}, " + f"Avg Acc: {train_acc_meter.avg:.4f}") @@ -156,19 +165,20 @@ def train(dataloader, return train_loss_meter.avg, train_acc_meter.avg, train_time -def validate(dataloader, model, criterion, total_batch, debug_steps=100): +def validate(dataloader, model, criterion, total_batch, debug_steps=100, logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + logger: logger for logging, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + val_time: float, valitaion time """ model.eval() val_loss_meter = AverageMeter() @@ -193,7 +203,7 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): val_acc1_meter.update(acc1.numpy()[0], batch_size) val_acc5_meter.update(acc5.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( f"Val Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {val_loss_meter.avg:.4f}, " + @@ -205,25 +215,77 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): def main(): - # 0. Preparation + # STEP 0: Preparation + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + # set output folder + if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) last_epoch = config.TRAIN.LAST_EPOCH seed = config.SEED paddle.seed(seed) np.random.seed(seed) random.seed(seed) - #paddle.set_device('gpu:0') - # 1. Create model + logger = get_logger(filename=os.path.join(config.SAVE, 'log.txt')) + logger.info(f'\n{config}') + + # STEP 1: Create model model = build_model(config) - # 2. Create train and val dataloader + + # STEP 2: Create train and val dataloader if not config.EVAL: dataset_train = get_dataset(config, mode='train') dataloader_train = get_dataloader(config, dataset_train, 'train', False) dataset_val = get_dataset(config, mode='val') dataloader_val = get_dataloader(config, dataset_val, 'val', False) - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. 
Define lr_scheduler + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -232,8 +294,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, T_max=config.TRAIN.NUM_EPOCHS, @@ -245,9 +306,9 @@ def main(): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") - # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": if config.TRAIN.GRAD_CLIP: clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) @@ -267,18 +328,21 @@ def main(): optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, - weight_decay=config.TRAIN.WEIGHT_DECAY, beta1=config.TRAIN.OPTIMIZER.BETAS[0], beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 6. 
Load pretrained model or load resume model and optimizer states + + # STEP 6: Load pretrained model or load resume model and optimizer states if config.MODEL.PRETRAINED: - assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') - model_state = paddle.load(config.MODEL.PRETRAINED + '.pdparams') + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") @@ -291,35 +355,40 @@ def main(): optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") - # 7. Validation + + # STEP 7: Validation (eval mode) if config.EVAL: logger.info('----- Start Validating') val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + f"Validation Acc@5: {val_acc5:.4f}, " + f"time: {val_time:.2f}") return - # 8. Start training and validation - logging.info(f"Start training from epoch {last_epoch + 1}.") - for epoch in range(last_epoch + 1, config.TRAIN.NUM_EPOCHS + 1): + + # STEP 8: Start training and validation (train mode) + logger.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + logger.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") train_loss, train_acc, train_time = train(dataloader=dataloader_train, model=model, criterion=criterion, optimizer=optimizer, epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, total_batch=len(dataloader_train), debug_steps=config.REPORT_FREQ, accum_iter=config.TRAIN.ACCUM_ITER, + mixup_fn=mixup_fn, amp=config.AMP, - ) + logger=logger) scheduler.step() logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Train Loss: {train_loss:.4f}, " + @@ -331,9 +400,10 @@ def main(): val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + diff --git a/image_classification/MLP-Mixer/mixup.py b/image_classification/MLP-Mixer/mixup.py new file mode 100644 index 00000000..1d2db493 --- /dev/null +++ b/image_classification/MLP-Mixer/mixup.py @@ -0,0 +1,225 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""mixup and cutmix for batch data""" +import numpy as np +import paddle + + +def rand_bbox(image_shape, lam, count=None): + """ CutMix bbox by lam value + Generate 1 random bbox by value lam. lam is the cut size rate. + The cut_size is computed by sqrt(1-lam) * image_size. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + count: int, number of bbox to generate + """ + image_h, image_w = image_shape[-2:] + cut_rate = np.sqrt(1. - lam) + cut_h = int(cut_rate * image_h) + cut_w = int(cut_rate * image_w) + + # get random bbox center + cy = np.random.randint(0, image_h, size=count) + cx = np.random.randint(0, image_w, size=count) + + # get bbox coords + bbox_x1 = np.clip(cx - cut_w // 2, 0, image_w) + bbox_y1 = np.clip(cy - cut_h // 2, 0, image_h) + bbox_x2 = np.clip(cx + cut_w // 2, 0, image_w) + bbox_y2 = np.clip(cy + cut_h // 2, 0, image_h) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # while in pytorch, it will return [] tensor + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def rand_bbox_minmax(image_shape, minmax, count=None): + """ CutMix bbox by min and max value + Generate 1 random bbox by min and max percentage values. + Minmax is a tuple/list of min and max percentage vlaues + applied to the image width and height. + + Args: + image_shape: tuple/list, image height and width + minmax: tuple/list, min and max percentage values of image size + count: int, number of bbox to generate + """ + assert len(minmax) == 2 + image_h, image_w = image_shape[-2:] + min_ratio = minmax[0] + max_ratio = minmax[1] + cut_h = np.random.randint(int(image_h * min_ratio), int(image_h * max_ratio), size=count) + cut_w = np.random.randint(int(image_w * min_ratio), int(image_w * max_ratio), size=count) + + bbox_x1 = np.random.randint(0, image_w - cut_w, size=count) + bbox_y1 = np.random.randint(0, image_h - cut_h, size=count) + bbox_x2 = bbox_x1 + cut_w + bbox_y2 = bbox_y1 + cut_h + + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def cutmix_generate_bbox_adjust_lam(image_shape, lam, minmax=None, correct_lam=True, count=None): + """Generate bbox and apply correction for lambda + If the mimmax is None, apply the standard cutmix by lam value, + If the minmax is set, apply the cutmix by min and max percentage values. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + minmax: tuple/list, min and max percentage values of image size + correct_lam: bool, if True, correct the lam value by the generated bbox + count: int, number of bbox to generate + """ + if minmax is not None: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox_minmax(image_shape, minmax, count) + else: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox(image_shape, lam, count) + + if correct_lam or minmax is not None: + image_h, image_w = image_shape[-2:] + bbox_area = (bbox_y2 - bbox_y1) * (bbox_x2 - bbox_x1) + lam = 1. - bbox_area / float(image_h * image_w) + return (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam + + +def one_hot(x, num_classes, on_value=1., off_value=0.): + """ Generate one-hot vector for label smoothing + Args: + x: tensor, contains label/class indices + num_classes: int, num of classes (len of the one-hot vector) + on_value: float, the vector value at label index, default=1. + off_value: float, the vector value at non-label indices, default=0. 
+ Returns: + one_hot: tensor, tensor with on value at label index and off value + at non-label indices. + """ + x = x.reshape_([-1, 1]) + x_smoothed = paddle.full((x.shape[0], num_classes), fill_value=off_value) + for i in range(x.shape[0]): + x_smoothed[i, x[i]] = on_value + return x_smoothed + + +def mixup_one_hot(label, num_classes, lam=1., smoothing=0.): + """ mixup and label smoothing in batch + label smoothing is firstly applied, then + mixup is applied by mixing the bacth and its flip, + with a mixup rate. + + Args: + label: tensor, label tensor with shape [N], contains the class indices + num_classes: int, num of all classes + lam: float, mixup rate, default=1.0 + smoothing: float, label smoothing rate + """ + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + y1 = one_hot(label, num_classes, on_value, off_value) + y2 = one_hot(label.flip(axis=[0]), num_classes, on_value, off_value) + return y2 * (1 - lam) + y1 * lam + + +class Mixup: + """Mixup class + Args: + mixup_alpha: float, mixup alpha for beta distribution, default=1.0, + cutmix_alpha: float, cutmix alpha for beta distribution, default=0.0, + cutmix_minmax: list/tuple, min and max value for cutmix ratio, default=None, + prob: float, if random prob < prob, do not use mixup, default=1.0, + switch_prob: float, prob of switching mixup and cutmix, default=0.5, + mode: string, mixup up, now only 'batch' is supported, default='batch', + correct_lam: bool, if True, apply correction of lam, default=True, + label_smoothing: float, label smoothing rate, default=0.1, + num_classes: int, num of classes, default=1000 + """ + def __init__(self, + mixup_alpha=1.0, + cutmix_alpha=0.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + correct_lam=True, + label_smoothing=0.1, + num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if cutmix_minmax is not None: + assert len(cutmix_minmax) == 2 + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam + assert mode == 'batch', 'Now only batch mode is supported!' + + def __call__(self, x, target): + assert x.shape[0] % 2 == 0, "Batch size should be even" + lam = self._mix_batch(x) + target = mixup_one_hot(target, self.num_classes, lam, self.label_smoothing) + return x, target + + def get_params(self): + """Decide to use cutmix or regular mixup by sampling and + sample lambda for mixup + """ + lam = 1. + use_cutmix = False + use_mixup = np.random.rand() < self.mix_prob + if use_mixup: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + alpha = self.cutmix_alpha if use_cutmix else self.mixup_alpha + lam_mix = np.random.beta(alpha, alpha) + elif self.mixup_alpha == 0. and self.cutmix_alpha > 0.: + use_cutmix=True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + elif self.mixup_alpha > 0. 
and self.cutmix_alpha == 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + else: + raise ValueError('mixup_alpha and cutmix_alpha cannot be all 0') + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_batch(self, x): + """mixup/cutmix by adding batch data and its flipped version""" + lam, use_cutmix = self.get_params() + if lam == 1.: + return lam + if use_cutmix: + (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam = cutmix_generate_bbox_adjust_lam( + x.shape, + lam, + minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # but in pytorch, it will return [] tensor without errors + if int(bbox_x1) != int(bbox_x2) and int(bbox_y1) != int(bbox_y2): + x[:, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] = x.flip(axis=[0])[ + :, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] + else: + x_flipped = x.flip(axis=[0]) + x_flipped = x_flipped * (1 - lam) + x.set_value(x * (lam) + x_flipped) + return lam diff --git a/image_classification/MLP-Mixer/mlp_mixer.py b/image_classification/MLP-Mixer/mlp_mixer.py index 287ff846..9985c8f1 100644 --- a/image_classification/MLP-Mixer/mlp_mixer.py +++ b/image_classification/MLP-Mixer/mlp_mixer.py @@ -239,5 +239,5 @@ def build_mlp_mixer(config): embed_dim=config.MODEL.MIXER.HIDDEN_SIZE, mlp_ratio=(0.5, 4.0), dropout=config.MODEL.DROPOUT, - droppath=config.MODEL.DROPPATH) + droppath=config.MODEL.DROP_PATH) return model diff --git a/image_classification/MLP-Mixer/random_erasing.py b/image_classification/MLP-Mixer/random_erasing.py new file mode 100644 index 00000000..31eea465 --- /dev/null +++ b/image_classification/MLP-Mixer/random_erasing.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Random Erasing for image tensor""" + +import random +import math +import paddle + + +def _get_pixels(per_pixel, rand_color, patch_size, dtype="float32"): + if per_pixel: + return paddle.normal(shape=patch_size).astype(dtype) + if rand_color: + return paddle.normal(shape=(patch_size[0], 1, 1)).astype(dtype) + return paddle.zeros((patch_size[0], 1, 1)).astype(dtype) + + +class RandomErasing(object): + """ + Args: + prob: probability of performing random erasing + min_area: Minimum percentage of erased area wrt input image area + max_area: Maximum percentage of erased area wrt input image area + min_aspect: Minimum aspect ratio of earsed area + max_aspect: Maximum aspect ratio of earsed area + mode: pixel color mode, in ['const', 'rand', 'pixel'] + 'const' - erase block is constant valued 0 for all channels + 'rand' - erase block is valued random color (same per-channel) + 'pixel' - erase block is vauled random color per pixel + min_count: Minimum # of ereasing blocks per image. + max_count: Maximum # of ereasing blocks per image. 
Area per box is scaled by count + per-image count is randomly chosen between min_count to max_count + """ + def __init__(self, prob=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0): + self.prob = prob + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == "rand": + self.rand_color = True + elif mode == "pixel": + self.per_pixel = True + else: + assert not mode or mode == "const" + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.prob: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top+h, left:left+w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + + +#def main(): +# re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='rand') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='const') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='pixel') +# import PIL.Image as Image +# import numpy as np +# paddle.set_device('cpu') +# img = paddle.to_tensor(np.asarray(Image.open('./lenna.png'))).astype('float32') +# img = img / 255.0 +# img = paddle.transpose(img, [2, 0, 1]) +# new_img = re(img) +# new_img = new_img * 255.0 +# new_img = paddle.transpose(new_img, [1, 2, 0]) +# new_img = new_img.cpu().numpy() +# new_img = Image.fromarray(new_img.astype('uint8')) +# new_img.save('./res.png') +# +# +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/MLP-Mixer/transforms.py b/image_classification/MLP-Mixer/transforms.py new file mode 100644 index 00000000..5a046912 --- /dev/null +++ b/image_classification/MLP-Mixer/transforms.py @@ -0,0 +1,14 @@ +import random +import paddle +import paddle.nn +import paddle.vision.transforms as T + + +class RandomHorizontalFlip(): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, image): + if random.random() < self.p: + return T.hflip(image) + return image diff --git a/image_classification/MobileViT/augment.py b/image_classification/MobileViT/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/MobileViT/augment.py +++ b/image_classification/MobileViT/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in 
original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/MobileViT/datasets.py b/image_classification/MobileViT/datasets.py index 2a0ed6e4..00fb9294 100644 --- a/image_classification/MobileViT/datasets.py +++ b/image_classification/MobileViT/datasets.py @@ -28,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing from multi_scale_sampler import MultiScaleSamplerDDP @@ -110,6 +112,10 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 aug_op_list.append(transforms.ColorJitter(*jitter)) diff --git a/image_classification/MobileViT/main_multi_gpu.py b/image_classification/MobileViT/main_multi_gpu.py index 6a8a6766..d51022fb 100644 --- a/image_classification/MobileViT/main_multi_gpu.py +++ b/image_classification/MobileViT/main_multi_gpu.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""CSwin training/validation using multiple GPU """ +"""MobileViT training/validation using multiple GPU """ import sys import os @@ -42,7 +42,7 @@ def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('CSwin') + parser = argparse.ArgumentParser('MobileViT') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) diff --git a/image_classification/PVTv2/augment.py b/image_classification/PVTv2/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/PVTv2/augment.py +++ b/image_classification/PVTv2/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/PVTv2/config.py b/image_classification/PVTv2/config.py index 91e92a86..18b609b8 100644 --- a/image_classification/PVTv2/config.py +++ b/image_classification/PVTv2/config.py @@ -69,11 +69,12 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 5 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.BASE_LR = 0.0005 _C.TRAIN.WARMUP_START_LR = 1e-6 _C.TRAIN.END_LR = 1e-5 _C.TRAIN.GRAD_CLIP = None _C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.MODEL_EMA = False _C.TRAIN.MODEL_EMA_DECAY = 0.99992 @@ -99,27 +100,14 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = True 
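# A minimal sketch of how the AUTO_AUGMENT / RAND_AUGMENT flags above are consumed
# (it mirrors the if/elif branch added to get_train_transforms() elsewhere in this
# patch; the helper name below is hypothetical, the imports come from this repo's
# augment.py). AUTO_AUGMENT takes precedence when both flags are True.
from augment import AutoAugment, auto_augment_policy_original
from augment import RandAugment, rand_augment_policy_original

def select_policy_augment(config):
    """Return the policy-based augment op chosen by the config, or None to fall back to ColorJitter."""
    if config.TRAIN.AUTO_AUGMENT:
        return AutoAugment(auto_augment_policy_original())
    if config.TRAIN.RAND_AUGMENT:
        # defaults introduced in this patch: magnitude_idx=9, num_layers=2
        return RandAugment(rand_augment_policy_original())
    return None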
_C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" diff --git a/image_classification/PVTv2/configs/pvtv2_b0.yaml b/image_classification/PVTv2/configs/pvtv2_b0.yaml index c8854b95..69ab355c 100644 --- a/image_classification/PVTv2/configs/pvtv2_b0.yaml +++ b/image_classification/PVTv2/configs/pvtv2_b0.yaml @@ -12,7 +12,7 @@ MODEL: MLP_RATIO: [8, 8, 4, 4] SR_RATIO: [8, 4, 2, 1] QKV_BIAS: True - DROP_PATH: 0.1 + DROPPATH: 0.1 TRAIN: GRAD_CLIP: None diff --git a/image_classification/PVTv2/configs/pvtv2_b1.yaml b/image_classification/PVTv2/configs/pvtv2_b1.yaml index 95135935..69a7a1ba 100644 --- a/image_classification/PVTv2/configs/pvtv2_b1.yaml +++ b/image_classification/PVTv2/configs/pvtv2_b1.yaml @@ -12,7 +12,7 @@ MODEL: MLP_RATIO: [8, 8, 4, 4] SR_RATIO: [8, 4, 2, 1] QKV_BIAS: True - DROP_PATH: 0.1 + DROPPATH: 0.1 TRAIN: GRAD_CLIP: None diff --git a/image_classification/PVTv2/configs/pvtv2_b2.yaml b/image_classification/PVTv2/configs/pvtv2_b2.yaml index 5102f3d3..b6871317 100644 --- a/image_classification/PVTv2/configs/pvtv2_b2.yaml +++ b/image_classification/PVTv2/configs/pvtv2_b2.yaml @@ -12,7 +12,7 @@ MODEL: MLP_RATIO: [8, 8, 4, 4] SR_RATIO: [8, 4, 2, 1] QKV_BIAS: True - DROP_PATH: 0.1 + DROPPATH: 0.1 TRAIN: GRAD_CLIP: None diff --git a/image_classification/PVTv2/configs/pvtv2_b2_linear.yaml b/image_classification/PVTv2/configs/pvtv2_b2_linear.yaml index 10e8384c..82bcd1b3 100644 --- a/image_classification/PVTv2/configs/pvtv2_b2_linear.yaml +++ b/image_classification/PVTv2/configs/pvtv2_b2_linear.yaml @@ -13,7 +13,7 @@ MODEL: SR_RATIO: [8, 4, 2, 1] LINEAR: True QKV_BIAS: True - DROP_PATH: 0.1 + DROPPATH: 0.1 TRAIN: GRAD_CLIP: None diff --git a/image_classification/PVTv2/configs/pvtv2_b3.yaml b/image_classification/PVTv2/configs/pvtv2_b3.yaml index 823a1889..75a21f47 100644 --- a/image_classification/PVTv2/configs/pvtv2_b3.yaml +++ b/image_classification/PVTv2/configs/pvtv2_b3.yaml @@ -12,7 +12,7 @@ MODEL: MLP_RATIO: [8, 8, 4, 4] SR_RATIO: [8, 4, 2, 1] QKV_BIAS: True - DROP_PATH: 0.3 + DROPPATH: 0.3 TRAIN: GRAD_CLIP: 1.0 diff --git a/image_classification/PVTv2/configs/pvtv2_b4.yaml b/image_classification/PVTv2/configs/pvtv2_b4.yaml index f8f3472e..ce0aef13 100644 --- a/image_classification/PVTv2/configs/pvtv2_b4.yaml +++ b/image_classification/PVTv2/configs/pvtv2_b4.yaml @@ -12,7 +12,7 @@ MODEL: MLP_RATIO: [8, 8, 4, 4] SR_RATIO: [8, 4, 2, 1] QKV_BIAS: True - DROP_PATH: 0.3 + DROPPATH: 0.3 TRAIN: GRAD_CLIP: 1.0 diff --git a/image_classification/PVTv2/configs/pvtv2_b5.yaml b/image_classification/PVTv2/configs/pvtv2_b5.yaml index fea21eb1..0c2a9766 100644 --- a/image_classification/PVTv2/configs/pvtv2_b5.yaml +++ b/image_classification/PVTv2/configs/pvtv2_b5.yaml @@ -12,7 +12,7 
@@ MODEL: MLP_RATIO: [4, 4, 4, 4] SR_RATIO: [8, 4, 2, 1] QKV_BIAS: True - DROP_PATH: 0.3 + DROPPATH: 0.3 TRAIN: GRAD_CLIP: 1.0 diff --git a/image_classification/PVTv2/datasets.py b/image_classification/PVTv2/datasets.py index ed6a8450..23c0b1f3 100644 --- a/image_classification/PVTv2/datasets.py +++ b/image_classification/PVTv2/datasets.py @@ -28,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -100,9 +102,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: - jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/PVTv2/pvtv2.py b/image_classification/PVTv2/pvtv2.py index 68b98ae9..3896978d 100644 --- a/image_classification/PVTv2/pvtv2.py +++ b/image_classification/PVTv2/pvtv2.py @@ -97,11 +97,13 @@ def __init__(self, image_size=224, patch_size=7, stride=4, in_channels=3, embed_ kernel_size=patch_size, stride=stride, padding=(patch_size[0] // 2, patch_size[1] // 2)) - self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) + + w_attr_1, b_attr_1 = self._init_weights() + self.norm = nn.LayerNorm(embed_dim, weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6) def _init_weights(self): - weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) - bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -254,12 +256,12 @@ def __init__(self, def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) return weight_attr, bias_attr def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) return weight_attr, bias_attr def _init_weights_conv(self): @@ -335,15 +337,15 @@ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() w_attr_2, b_attr_2 = self._init_weights_layernorm() # init for layernorm - self.norm2 = nn.LayerNorm(dim, epsilon=1e-6, weight_attr=w_attr_2, bias_atrr=b_attr_2) + self.norm2 = nn.LayerNorm(dim, epsilon=1e-6, weight_attr=w_attr_2, bias_attr=b_attr_2) self.mlp = Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), dropout=dropout, linear=linear) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x, H, W): @@ -456,12 +458,12 @@ def __init__(self, def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) return weight_attr, bias_attr def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) return weight_attr, bias_attr def freeze_patch_embedding(self): @@ -506,6 +508,6 @@ def build_pvtv2(config): qk_scale=config.MODEL.TRANS.QK_SCALE, dropout=config.MODEL.DROPOUT, attention_dropout=config.MODEL.ATTENTION_DROPOUT, - drop_path=config.MODEL.DROP_PATH, + drop_path=config.MODEL.DROPPATH, linear=config.MODEL.TRANS.LINEAR) return model diff --git a/image_classification/PiT/augment.py b/image_classification/PiT/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/PiT/augment.py +++ b/image_classification/PiT/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/PiT/config.py b/image_classification/PiT/config.py index 6e040543..051fe2ad 100644 --- a/image_classification/PiT/config.py +++ b/image_classification/PiT/config.py @@ -96,12 +96,13 @@ _C.TRAIN.MIXUP_MODE = 'batch' _C.TRAIN.SMOOTHING = 0.1 -_C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.COLOR_JITTER = 0.4 # color jitter factor +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = True -_C.TRAIN.RANDOM_ERASE_PROB = 0.25 -_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' -_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 # random erase prob +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' # random erase mode +_C.TRAIN.RANDOM_ERASE_COUNT = 1 # random erase count _C.TRAIN.RANDOM_ERASE_SPLIT = False _C.TRAIN.DISTILLATION_TYPE = 'hard' # hard, soft, none @@ -109,23 +110,6 @@ _C.TRAIN.DISTILLATION_TAU = 1.0 _C.TRAIN.TEACHER_MODEL = './regnety_160' # no ext is needed -_C.TRAIN.MODEL_EMA = True -_C.TRAIN.MODEL_EMA_DECAY = 0.99996 - -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' 
-_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" diff --git a/image_classification/PiT/datasets.py b/image_classification/PiT/datasets.py index 2f0d6e61..7e178b57 100644 --- a/image_classification/PiT/datasets.py +++ b/image_classification/PiT/datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ import os import math +from PIL import Image from paddle.io import Dataset from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -27,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -69,7 +72,7 @@ def __len__(self): return len(self.label_list) def __getitem__(self, index): - data = image_load(self.img_path_list[index]).convert('RGB') + data = Image.open(self.img_path_list[index]).convert('RGB') data = self.transform(data) label = self.label_list[index] @@ -99,6 +102,10 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 aug_op_list.append(transforms.ColorJitter(*jitter)) @@ -158,11 +165,13 @@ def get_dataset(config, mode='train'): if mode == 'train': dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) else: + mode = 'test' dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) elif config.DATA.DATASET == "cifar100": if mode == 'train': dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) else: + mode = 'test' dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) elif config.DATA.DATASET == "imagenet2012": if mode == 'train': diff --git a/image_classification/PiT/main_multi_gpu.py b/image_classification/PiT/main_multi_gpu.py index 6f393901..ea327f22 100644 --- a/image_classification/PiT/main_multi_gpu.py +++ b/image_classification/PiT/main_multi_gpu.py @@ -357,7 +357,8 @@ def main_worker(*args): prob=config.TRAIN.MIXUP_PROB, switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, mode=config.TRAIN.MIXUP_MODE, - label_smoothing=config.TRAIN.SMOOTHING) + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) # STEP 4: Define criterion if config.TRAIN.MIXUP_PROB > 0.: diff --git a/image_classification/PiT/main_single_gpu.py b/image_classification/PiT/main_single_gpu.py index ba81e5f8..67ba96d6 100644 --- 
a/image_classification/PiT/main_single_gpu.py +++ b/image_classification/PiT/main_single_gpu.py @@ -269,7 +269,8 @@ def main(): prob=config.TRAIN.MIXUP_PROB, switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, mode=config.TRAIN.MIXUP_MODE, - label_smoothing=config.TRAIN.SMOOTHING) + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) # STEP 4: Define criterion if config.TRAIN.MIXUP_PROB > 0.: @@ -304,19 +305,23 @@ def main(): # STEP 6: Define optimizer and lr_scheduler # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -386,12 +391,12 @@ def main(): assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") # load ema model - if model_ema is not None and os.path.isfidile(config.MODEL.RESUME + '-EMA.pdparams'): + if model_ema is not None and os.path.isfile(config.MODEL.RESUME + '-EMA.pdparams'): model_ema_state = paddle.load(config.MODEL.RESUME + '-EMA.pdparams') model_ema.module.set_state_dict(model_ema_state) logger.info(f'----- Load model ema from {config.MODEL.RESUME}-EMA.pdparams') diff --git a/image_classification/PoolFormer/augment.py b/image_classification/PoolFormer/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/PoolFormer/augment.py +++ b/image_classification/PoolFormer/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, 
policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/PoolFormer/config.py b/image_classification/PoolFormer/config.py index 8124da13..551c2114 100644 --- a/image_classification/PoolFormer/config.py +++ b/image_classification/PoolFormer/config.py @@ -64,12 +64,14 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 5 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.BASE_LR = 4e-3 _C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 _C.TRAIN.END_LR = 5e-4 -_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.GRAD_CLIP = None _C.TRAIN.ACCUM_ITER = 1 -_C.TRAIN.LINEAR_SCALED_LR = None +_C.TRAIN.MODEL_EMA = False +_C.TRAIN.MODEL_EMA_DECAY = 0.99992 +_C.TRAIN.LINEAR_SCALED_LR = 1024 _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -94,6 +96,7 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 _C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' @@ -101,19 +104,6 @@ _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' # misc _C.SAVE = "./output" _C.TAG = "default" diff --git a/image_classification/PoolFormer/datasets.py b/image_classification/PoolFormer/datasets.py index 48523f6f..241f81b7 100644 --- a/image_classification/PoolFormer/datasets.py +++ b/image_classification/PoolFormer/datasets.py @@ -20,10 +20,16 @@ import os import math from PIL import Image -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from random_erasing import RandomErasing @@ -94,9 +100,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/PoolFormer/main_multi_gpu.py b/image_classification/PoolFormer/main_multi_gpu.py index 131c2d9e..cb4e2de3 100644 --- a/image_classification/PoolFormer/main_multi_gpu.py +++ 
b/image_classification/PoolFormer/main_multi_gpu.py @@ -555,11 +555,8 @@ def main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - local_logger.info(f"----- Save model: {model_path}.pdparams") - local_logger.info(f"----- Save optim: {model_path}.pdopt") - if local_rank == 0: - master_logger.info(f"----- Save model: {model_path}.pdparams") - master_logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") def main(): diff --git a/image_classification/PoolFormer/main_single_gpu.py b/image_classification/PoolFormer/main_single_gpu.py index 7b489e84..71a4fcf8 100644 --- a/image_classification/PoolFormer/main_single_gpu.py +++ b/image_classification/PoolFormer/main_single_gpu.py @@ -266,11 +266,14 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - if config.TRAIN.LINEAR_SCALED_LR: - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 1024.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 1024.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 1024.0 + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR if config.TRAIN.ACCUM_ITER > 1: linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER @@ -280,7 +283,7 @@ def main(): config.TRAIN.BASE_LR = linear_scaled_lr config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr config.TRAIN.END_LR = linear_scaled_end_lr - + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, diff --git a/image_classification/PoolFormer/run_train_multi.sh b/image_classification/PoolFormer/run_train_multi.sh index 02ed04cc..06cbfa13 100644 --- a/image_classification/PoolFormer/run_train_multi.sh +++ b/image_classification/PoolFormer/run_train_multi.sh @@ -4,4 +4,4 @@ python main_multi_gpu.py \ -dataset='imagenet2012' \ -batch_size=16 \ -data_path='/dataset/imagenet' \ - -amp +# -amp diff --git a/image_classification/RepMLP/augment.py b/image_classification/RepMLP/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/RepMLP/augment.py +++ b/image_classification/RepMLP/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/RepMLP/config.py b/image_classification/RepMLP/config.py index c4408404..cf2e580d 100644 --- 
a/image_classification/RepMLP/config.py +++ b/image_classification/RepMLP/config.py @@ -46,7 +46,9 @@ _C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None _C.MODEL.NUM_CLASSES = 1000 - +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 # transformer settings _C.MODEL.MIXER = CN() @@ -67,11 +69,12 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 20 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.BASE_LR = 0.001 _C.TRAIN.WARMUP_START_LR = 5e-7 _C.TRAIN.END_LR = 5e-6 _C.TRAIN.GRAD_CLIP = 5.0 _C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -95,33 +98,20 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" -_C.SAVE_FREQ = 1 # freq to save chpt +_C.SAVE_FREQ = 20 # freq to save chpt _C.REPORT_FREQ = 50 # freq to logging info -_C.VALIDATE_FREQ = 10 # freq to do validation +_C.VALIDATE_FREQ = 20 # freq to do validation _C.SEED = 0 _C.EVAL = False # run evaluation only _C.AMP = False @@ -158,6 +148,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/RepMLP/datasets.py b/image_classification/RepMLP/datasets.py index b120fa00..304df9a3 100644 --- a/image_classification/RepMLP/datasets.py +++ b/image_classification/RepMLP/datasets.py @@ -19,6 +19,7 @@ import os import math +from PIL import Image from paddle.io import Dataset from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -27,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -99,9 +102,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops 
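# Note on the ColorJitter change just above (the same fix appears in the PVTv2 and
# PoolFormer datasets.py hunks): `jitter` is a 3-tuple of the same factor, so it must
# be unpacked with *jitter to fill the brightness/contrast/saturation arguments
# separately; passing the tuple as a single positional argument only set the first
# parameter.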
aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/RepMLP/droppath.py b/image_classification/RepMLP/droppath.py new file mode 100644 index 00000000..c8fe8048 --- /dev/null +++ b/image_classification/RepMLP/droppath.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + +def drop_path(inputs, drop_prob=0., training=False): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if drop_prob == 0. or not training: + return inputs + keep_prob = 1 - drop_prob + keep_prob = paddle.to_tensor(keep_prob) + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor # divide is to keep same output expectation + return output + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, inputs): + return drop_path(inputs, self.drop_prob, self.training) diff --git a/image_classification/RepMLP/main_multi_gpu.py b/image_classification/RepMLP/main_multi_gpu.py index e1e4b69f..09ca1426 100644 --- a/image_classification/RepMLP/main_multi_gpu.py +++ b/image_classification/RepMLP/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
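# A minimal usage sketch for the DropPath layer introduced in droppath.py above
# (illustrative only; the tensor shape below is an arbitrary example). During
# training each sample is kept with probability 1 - drop_prob and the survivors
# are rescaled by 1 / keep_prob, so the expected output matches the input; in
# eval mode (or with drop_prob=0) the layer is the identity.
import paddle
from droppath import DropPath

drop = DropPath(drop_prob=0.1)
x = paddle.randn([8, 196, 384])           # (batch, tokens, embed_dim)

drop.train()
y = drop(x)                               # ~10% of samples zeroed, rest scaled by 1/0.9

drop.eval()
assert bool(paddle.allclose(drop(x), x))  # identity at inference time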
@@ -29,19 +29,17 @@ from datasets import get_dataset from utils import AverageMeter from utils import WarmupCosineScheduler -from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from repmlp_resnet import build_repmlp_resnet as build_model def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('Swin') + parser = argparse.ArgumentParser('RepMLP') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -105,11 +104,9 @@ def train(dataloader, local_logger: logger for local process/gpu, default: None master_logger: logger for main process, default: None Returns: - train_loss_meter.avg: float, average loss on current process/gpu - train_acc_meter.avg: float, average top1 accuracy on current process/gpu - master_train_loss_meter.avg: float, average loss on all processes/gpus - master_train_acc_meter.avg: float, average top1 accuracy on all processes/gpus - train_time: float, training time + train_loss_meter.avg + train_acc_meter.avg + train_time """ model.train() train_loss_meter = AverageMeter() @@ -132,7 +129,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -358,22 +355,23 @@ def main_worker(*args): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR - config.TRAIN.BASE_LR = 
linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -425,8 +423,8 @@ def main_worker(*args): weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), + #apply_decay_param_fun=get_exclude_from_weight_decay_fn([ + # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -447,9 +445,9 @@ def main_worker(*args): f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) diff --git a/image_classification/RepMLP/main_single_gpu.py b/image_classification/RepMLP/main_single_gpu.py index b142ce07..2e919da1 100644 --- a/image_classification/RepMLP/main_single_gpu.py +++ b/image_classification/RepMLP/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
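A note on the LINEAR_SCALED_LR switch used in main_multi_gpu.py above (main_single_gpu.py below applies the same rule without the world_size factor): when it is set, the base, warmup-start and end learning rates are rescaled by the effective batch size divided by that reference value, and multiplied by ACCUM_ITER when gradients are accumulated; when it is left as None, the values from the config are used unchanged. A small sketch of the rule, with 512 as an assumed reference value (the yaml config decides the real one):
def scale_lr(base_lr, batch_size, world_size, accum_iter, linear_scaled_lr):
    # mirrors the scaling logic above; None disables scaling entirely
    if linear_scaled_lr is None:
        return base_lr
    lr = base_lr * batch_size * world_size / linear_scaled_lr
    return lr * accum_iter             # gradient accumulation enlarges the effective batch

# e.g. BASE_LR=0.001, 128 images per GPU on 8 GPUs, ACCUM_ITER=1, reference 512:
print(scale_lr(0.001, 128, 8, 1, 512)) # -> 0.002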
@@ -35,13 +34,12 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from repmlp_resnet import build_repmlp_resnet as build_model def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('Swin') + parser = argparse.ArgumentParser('RepMLP') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -126,7 +125,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -190,7 +189,7 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100, logger= with paddle.no_grad(): for batch_id, data in enumerate(dataloader): image = data[0] - label = data[1].astype('int64') + label = data[1] output = model(image) loss = criterion(output, label) @@ -269,19 +268,23 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -291,8 +294,7 @@ def main(): end_lr=config.TRAIN.END_LR, 
warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, T_max=config.TRAIN.NUM_EPOCHS, @@ -330,10 +332,7 @@ def main(): beta2=config.TRAIN.OPTIMIZER.BETAS[1], weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, - grad_clip=clip, - apply_decay_param_fun=get_exclude_from_weight_decay_fn([ - 'absolute_pos_embed', 'relative_position_bias_table']), - ) + grad_clip=clip) else: logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") @@ -348,11 +347,11 @@ def main(): logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") diff --git a/image_classification/RepMLP/transforms.py b/image_classification/RepMLP/transforms.py index 676fe1ff..5a046912 100644 --- a/image_classification/RepMLP/transforms.py +++ b/image_classification/RepMLP/transforms.py @@ -1,3 +1,4 @@ +import random import paddle import paddle.nn import paddle.vision.transforms as T diff --git a/image_classification/ResMLP/augment.py b/image_classification/ResMLP/augment.py new file mode 100644 index 00000000..7a7f081c --- /dev/null +++ b/image_classification/ResMLP/augment.py @@ -0,0 +1,285 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Augmentation""" +""" Rand Augmentation """ +# reference: RandAugment: Practical automated data augmentation with a reduced search space +# https://arxiv.org/abs/1909.13719 + +""" Auto Augmentation """ +# reference: AutoAugment: Learning Augmentation Policies from Data +# https://arxiv.org/abs/1805.09501 + +import random +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def auto_augment_policy_original(): + """25 types of augment policies in original paper""" + policy = [ + [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + policy = [[SubPolicy(*args) for args in subpolicy] for subpolicy in policy] + return policy + + +def rand_augment_policy_original(magnitude_idx=9): + """ + 14 types of augment policies in original paper + Args: + magnitude_idx: M + """ + policy = [ + ('Posterize', 1, magnitude_idx), ('Rotate', 1, magnitude_idx), + ('Solarize', 1, magnitude_idx), ('AutoContrast', 1, magnitude_idx), + ('Equalize', 1, magnitude_idx), ('Contrast', 1, magnitude_idx), + ('Color', 1, magnitude_idx), ('Invert', 1, magnitude_idx), + ('Sharpness', 1, magnitude_idx), ('Brightness', 1, magnitude_idx), + ('ShearX', 1, magnitude_idx), ('ShearY', 1, magnitude_idx), + ('TranslateX', 1, magnitude_idx), ('TranslateY', 1, magnitude_idx), + ] + policy = [SubPolicy(*args) for args in policy] + return policy + + +class AutoAugment(): + """Auto Augment + Randomly choose a tuple of augment ops from a list of policy + Then apply the tuple of augment ops to input image + + Examples: + policy = auto_augment_policy_original() + augment = AutoAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy): + self.policy = policy + + def __call__(self, image, policy_idx=None): + if policy_idx is None: + policy_idx = random.randint(0, len(self.policy) - 1) + + sub_policy = self.policy[policy_idx] + for op in sub_policy: + image = op(image) + return image + + +class RandAugment(): + """Rand Augment + Randomly choose N augment ops from a list of K policies + Then apply the N ops to input image + + Examples: + policy = rand_augment_policy_original(magnitude_idx) + augment = RandAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy, num_layers=2): + """ + Args: + policy: list of SubPolicy + num_layers: int + """ + self.policy = policy + self.num_layers = num_layers + + def __call__(self, image): + selected_idx = 
np.random.choice(len(self.policy), self.num_layers) + + for policy_idx in selected_idx: + sub_policy = self.policy[policy_idx] + image = sub_policy(image) + return image + + +class SubPolicy: + """Subpolicy + Read augment name and magnitude, apply augment with probability + Args: + op_name: str, augment operation name + prob: float, if prob > random prob, apply augment + magnitude_idx: int, index of magnitude in preset magnitude ranges + """ + + def __init__(self, op_name, prob, magnitude_idx): + # ranges of operations' magnitude + ranges = { + 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'TranslateX': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'TranslateY': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'Rotate': np.linspace(0, 30, 10), # [-30, 30] (by random negative) + 'Color': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), # [0, 4] + 'Solarize': np.linspace(256, 0, 10), # [0, 256] + 'Contrast': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Sharpness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Brightness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'AutoContrast': [0] * 10, # no range + 'Equalize': [0] * 10, # no range + 'Invert': [0] * 10, # no range + } + + # augmentation operations + # Lambda is not pickleable for DDP + # image_ops = { + # 'ShearX': lambda image, magnitude: shear_x(image, magnitude), + # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), + # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), + # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), + # 'Rotate': lambda image, magnitude: rotate(image, magnitude), + # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), + # 'Invert': lambda image, magnitude: invert(image, magnitude), + # 'Equalize': lambda image, magnitude: equalize(image, magnitude), + # 'Solarize': lambda image, magnitude: solarize(image, magnitude), + # 'Posterize': lambda image, magnitude: posterize(image, magnitude), + # 'Contrast': lambda image, magnitude: contrast(image, magnitude), + # 'Color': lambda image, magnitude: color(image, magnitude), + # 'Brightness': lambda image, magnitude: brightness(image, magnitude), + # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), + # } + image_ops = { + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_relative, + 'TranslateY': translate_y_relative, + 'Rotate': rotate, + 'AutoContrast': auto_contrast, + 'Invert': invert, + 'Equalize': equalize, + 'Solarize': solarize, + 'Posterize': posterize, + 'Contrast': contrast, + 'Color': color, + 'Brightness': brightness, + 'Sharpness': sharpness, + } + + self.prob = prob + self.magnitude = ranges[op_name][magnitude_idx] + self.op = image_ops[op_name] + + def __call__(self, image): + if self.prob > random.random(): + image = self.op(image, self.magnitude) + return image + + +# PIL Image transforms +# https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.transform +def shear_x(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), fillcolor=fillcolor) + + +def shear_y(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * 
random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), fillcolor=fillcolor) + + +def translate_x_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), fillcolor=fillcolor) + + +def translate_x_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, magnitude, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude), fillcolor=fillcolor) + + +def rotate(image, magnitude): + rot = image.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new('RGBA', rot.size, (128,) * 4), + rot).convert(image.mode) + + +def auto_contrast(image, magnitude=None): + return ImageOps.autocontrast(image) + + +def invert(image, magnitude=None): + return ImageOps.invert(image) + + +def equalize(image, magnitude=None): + return ImageOps.equalize(image) + + +def solarize(image, magnitude): + return ImageOps.solarize(image, magnitude) + + +def posterize(image, magnitude): + return ImageOps.posterize(image, magnitude) + + +def contrast(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Contrast(image).enhance(1 + magnitude) + + +def color(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Color(image).enhance(1 + magnitude) + + +def brightness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Brightness(image).enhance(1 + magnitude) + + +def sharpness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Sharpness(image).enhance(1 + magnitude) + diff --git a/image_classification/ResMLP/config.py b/image_classification/ResMLP/config.py index 7e9b16bf..3643d233 100644 --- a/image_classification/ResMLP/config.py +++ b/image_classification/ResMLP/config.py @@ -45,8 +45,9 @@ _C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None _C.MODEL.NUM_CLASSES = 1000 -_C.MODEL.DROPOUT = 0.1 -_C.MODEL.DROPPATH = 0.1 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 # transformer settings _C.MODEL.MIXER = CN() @@ -58,13 +59,14 @@ _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 _C.TRAIN.NUM_EPOCHS = 300 -_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size -_C.TRAIN.WEIGHT_DECAY = 0.01 #0.3 # 0.0 for finetune -_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune -_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 -_C.TRAIN.END_LR = 1e-5 -_C.TRAIN.GRAD_CLIP = 1.0 -_C.TRAIN.ACCUM_ITER = 2 #1 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.WARMUP_START_LR = 5e-7 +_C.TRAIN.END_LR = 5e-6 +_C.TRAIN.GRAD_CLIP = 5.0 +_C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() 
_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -78,6 +80,24 @@ _C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW _C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 +# train augmentation +_C.TRAIN.MIXUP_ALPHA = 0.8 +_C.TRAIN.CUTMIX_ALPHA = 1.0 +_C.TRAIN.CUTMIX_MINMAX = None +_C.TRAIN.MIXUP_PROB = 1.0 +_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 +_C.TRAIN.MIXUP_MODE = 'batch' + +_C.TRAIN.SMOOTHING = 0.1 +_C.TRAIN.COLOR_JITTER = 0.4 +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False + +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' +_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_SPLIT = False + # misc _C.SAVE = "./output" _C.TAG = "default" @@ -120,6 +140,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/ResMLP/datasets.py b/image_classification/ResMLP/datasets.py index 761dd61a..304df9a3 100644 --- a/image_classification/ResMLP/datasets.py +++ b/image_classification/ResMLP/datasets.py @@ -19,8 +19,19 @@ import os import math -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from PIL import Image +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from augment import auto_augment_policy_original +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from transforms import RandomHorizontalFlip +from random_erasing import RandomErasing class ImageNet2012Dataset(Dataset): @@ -81,12 +92,36 @@ def get_train_transforms(config): transforms_train: training transforms """ - transforms_train = transforms.Compose([ + aug_op_list = [] + # STEP1: random crop and resize + aug_op_list.append( transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) + scale=(0.05, 1.0), interpolation='bicubic')) + # STEP2: auto_augment or color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # STEP3: other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, + std=config.DATA.IMAGENET_STD)) + # STEP4: random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + # Final: compose transforms and return + transforms_train = transforms.Compose(aug_op_list) return transforms_train @@ -106,7 +141,7 @@ def get_val_transforms(config): scale_size = int(math.floor(config.DATA.IMAGE_SIZE / 
config.DATA.CROP_PCT)) transforms_val = transforms.Compose([ - transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.Resize(scale_size, interpolation='bicubic'), transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), transforms.ToTensor(), transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), @@ -124,6 +159,7 @@ def get_dataset(config, mode='train'): Returns: dataset: dataset object """ + assert mode in ['train', 'val'] if config.DATA.DATASET == "cifar10": if mode == 'train': diff --git a/image_classification/ResMLP/droppath.py b/image_classification/ResMLP/droppath.py index fcff05e9..c8fe8048 100644 --- a/image_classification/ResMLP/droppath.py +++ b/image_classification/ResMLP/droppath.py @@ -32,6 +32,7 @@ def drop_path(inputs, drop_prob=0., training=False): if drop_prob == 0. or not training: return inputs keep_prob = 1 - drop_prob + keep_prob = paddle.to_tensor(keep_prob) shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) random_tensor = random_tensor.floor() # mask diff --git a/image_classification/ResMLP/losses.py b/image_classification/ResMLP/losses.py new file mode 100644 index 00000000..082467a3 --- /dev/null +++ b/image_classification/ResMLP/losses.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" Implement Loss functions """ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class LabelSmoothingCrossEntropyLoss(nn.Layer): + """ cross entropy loss for label smoothing + Args: + smoothing: float, smoothing rate + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, target label with shape [N] + Returns: + loss: float, cross entropy loss value + """ + def __init__(self, smoothing=0.1): + super().__init__() + assert 0 <= smoothing < 1.0 + self.smoothing = smoothing + self.confidence = 1 - smoothing + + def forward(self, x, target): + log_probs = F.log_softmax(x) # [N, num_classes] + # target_index is used to get prob for each of the N samples + target_index = paddle.zeros([x.shape[0], 2], dtype='int64') # [N, 2] + target_index[:, 0] = paddle.arange(x.shape[0]) + target_index[:, 1] = target + + nll_loss = -log_probs.gather_nd(index=target_index) # index: [N] + smooth_loss = -log_probs.mean(axis=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + + +class SoftTargetCrossEntropyLoss(nn.Layer): + """ cross entropy loss for soft target + Args: + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, soft target with shape [N, num_classes] + Returns: + loss: float, the mean loss value + """ + def __init__(self): + super().__init__() + + def forward(self, x, target): + loss = paddle.sum(-target * F.log_softmax(x, axis=-1), axis=-1) + return loss.mean() + + +class DistillationLoss(nn.Layer): + """Distillation loss function + This layer includes the original loss (criterion) and an extra + distillation loss (criterion), which computes the loss with + different type options, between the current model and + a teacher model as its supervision.
+ + Args: + base_criterion: nn.Layer, the original criterion + teacher_model: nn.Layer, the teacher model as supervision + distillation_type: str, one of ['none', 'soft', 'hard'] + alpha: float, ratio of base loss (* (1-alpha)) + and distillation loss( * alpha) + tao: float, temperature in distillation + """ + def __init__(self, + base_criterion, + teacher_model, + distillation_type, + alpha, + tau): + super().__init__() + assert distillation_type in ['none', 'soft', 'hard'] + self.base_criterion = base_criterion + self.teacher_model = teacher_model + self.type = distillation_type + self.alpha = alpha + self.tau = tau + + def forward(self, inputs, outputs, targets): + """ + Args: + inputs: tensor, the orginal model inputs + outputs: tensor, the outputs of the model + outputds_kd: tensor, the distillation outputs of the model, + this is usually obtained by a separate branch + in the last layer of the model + targets: tensor, the labels for the base criterion + """ + outputs, outputs_kd = outputs[0], outputs[1] + base_loss = self.base_criterion(outputs, targets) + if self.type == 'none': + return base_loss + + with paddle.no_grad(): + teacher_outputs = self.teacher_model(inputs) + + if self.type == 'soft': + distillation_loss = F.kl_div( + F.log_softmax(outputs_kd / self.tau, axis=1), + F.log_softmax(teacher_outputs / self.tau, axis=1), + reduction='sum') * (self.tau * self.tau) / outputs_kd.numel() + elif self.type == 'hard': + distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(axis=1)) + + loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha + return loss + + diff --git a/image_classification/ResMLP/main_multi_gpu.py b/image_classification/ResMLP/main_multi_gpu.py index 3b9dd14f..4f83a949 100644 --- a/image_classification/ResMLP/main_multi_gpu.py +++ b/image_classification/ResMLP/main_multi_gpu.py @@ -27,53 +27,53 @@ import paddle.distributed as dist from datasets import get_dataloader from datasets import get_dataset -from resmlp import build_res_mlp as build_model from utils import AverageMeter from utils import WarmupCosineScheduler from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from resmlp import build_res_mlp as build_model -parser = argparse.ArgumentParser('ResMLP') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -arguments = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, arguments) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, 
time.strftime('%Y%m%d-%H-%M-%S')) - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('ResMLP') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -81,20 +81,28 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + mixup_fn=None, + amp=False, + local_logger=None, + master_logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch debug_steps: int, num of iters to log info, default: 100 accum_iter: int, num of iters for accumulating gradients, default: 1 + mixup_fn: Mixup, mixup instance, default: None amp: bool, if True, use mix precision training, default: False + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: train_loss_meter.avg train_acc_meter.avg @@ -103,6 +111,9 @@ def train(dataloader, model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + master_train_loss_meter = AverageMeter() + master_train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() @@ -110,24 +121,26 @@ def train(dataloader, for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = 
model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - else: + else: # full precision training output = model(image) loss = criterion(output, label) #NOTE: division may be needed depending on the loss function # Here no division is needed: # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' - # #loss = loss / accum_iter loss.backward() @@ -136,41 +149,82 @@ def train(dataloader, optimizer.clear_grad() pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) - batch_size = image.shape[0] - train_loss_meter.update(loss.numpy()[0], batch_size) - train_acc_meter.update(acc.numpy()[0], batch_size) + batch_size = paddle.to_tensor(image.shape[0]) - if batch_id % debug_steps == 0: - logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {train_loss_meter.avg:.4f}, " + - f"Avg Acc: {train_acc_meter.avg:.4f}") + # sync from other gpus for overall loss and acc + master_loss = loss.clone() + master_acc = acc.clone() + master_batch_size = batch_size.clone() + dist.all_reduce(master_loss) + dist.all_reduce(master_acc) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc = master_acc / dist.get_world_size() + master_train_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_train_acc_meter.update(master_acc.numpy()[0], master_batch_size.numpy()[0]) - train_time = time.time() - time_st - return train_loss_meter.avg, train_acc_meter.avg, train_time + train_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + train_acc_meter.update(acc.numpy()[0], batch_size.numpy()[0]) + if batch_id % debug_steps == 0: + if local_logger: + local_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_train_loss_meter.avg:.4f}, " + + f"Avg Acc: {master_train_acc_meter.avg:.4f}") -def validate(dataloader, model, criterion, total_batch, debug_steps=100): + train_time = time.time() - time_st + return (train_loss_meter.avg, + train_acc_meter.avg, + master_train_loss_meter.avg, + master_train_acc_meter.avg, + train_time) + + +def validate(dataloader, + model, + criterion, + total_batch, + debug_steps=100, + local_logger=None, + master_logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + debug_steps: int, num of iters to log info, default: 100 + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average 
top5 accuracy on current process/gpu + master_val_loss_meter.avg: float, average loss on all processes/gpus + master_val_acc1_meter.avg: float, average top1 accuracy on all processes/gpus + master_val_acc5_meter.avg: float, average top5 accuracy on all processes/gpus + val_time: float, validation time """ model.eval() val_loss_meter = AverageMeter() val_acc1_meter = AverageMeter() val_acc5_meter = AverageMeter() + master_val_loss_meter = AverageMeter() + master_val_acc1_meter = AverageMeter() + master_val_acc5_meter = AverageMeter() time_st = time.time() with paddle.no_grad(): @@ -185,63 +239,140 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) - dist.all_reduce(loss) - dist.all_reduce(acc1) - dist.all_reduce(acc5) - loss = loss / dist.get_world_size() - acc1 = acc1 / dist.get_world_size() - acc5 = acc5 / dist.get_world_size() - batch_size = paddle.to_tensor(image.shape[0]) - dist.all_reduce(batch_size) + + master_loss = loss.clone() + master_acc1 = acc1.clone() + master_acc5 = acc5.clone() + master_batch_size = batch_size.clone() + + dist.all_reduce(master_loss) + dist.all_reduce(master_acc1) + dist.all_reduce(master_acc5) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc1 = master_acc1 / dist.get_world_size() + master_acc5 = master_acc5 / dist.get_world_size() + + master_val_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc1_meter.update(master_acc1.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc5_meter.update(master_acc5.numpy()[0], master_batch_size.numpy()[0]) val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) if batch_id % debug_steps == 0: - logger.info( - f"Val Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {val_loss_meter.avg:.4f}, " + - f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + - f"Avg Acc@5: {val_acc5_meter.avg:.4f}") - + if local_logger: + local_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {master_val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {master_val_acc5_meter.avg:.4f}") val_time = time.time() - time_st - return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + return (val_loss_meter.avg, + val_acc1_meter.avg, + val_acc5_meter.avg, + master_val_loss_meter.avg, + master_val_acc1_meter.avg, + master_val_acc5_meter.avg, + val_time) def main_worker(*args): - # 0. Preparation + # STEP 0: Preparation + config = args[0] dist.init_parallel_env() last_epoch = config.TRAIN.LAST_EPOCH - world_size = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + world_size = dist.get_world_size() + local_rank = dist.get_rank() seed = config.SEED + local_rank paddle.seed(seed) np.random.seed(seed) random.seed(seed) - # 1. 
Create model + # logger for each process/gpu + local_logger = get_logger( + filename=os.path.join(config.SAVE, 'log_{}.txt'.format(local_rank)), + logger_name='local_logger') + # overall logger + if local_rank == 0: + master_logger = get_logger( + filename=os.path.join(config.SAVE, 'log.txt'), + logger_name='master_logger') + master_logger.info(f'\n{config}') + else: + master_logger = None + local_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + if local_rank == 0: + master_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + + # STEP 1: Create model model = build_model(config) model = paddle.DataParallel(model) - # 2. Create train and val dataloader + + # STEP 2: Create train and val dataloader dataset_train, dataset_val = args[1], args[2] # Create training dataloader if not config.EVAL: dataloader_train = get_dataloader(config, dataset_train, 'train', True) total_batch_train = len(dataloader_train) - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + local_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') if local_rank == 0: - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + master_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') # Create validation dataloader dataloader_val = get_dataloader(config, dataset_val, 'test', True) total_batch_val = len(dataloader_val) - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + local_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') if local_rank == 0: - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. 
Define optimizer and lr_scheduler + master_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -263,7 +394,9 @@ def main_worker(*args): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + local_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") if config.TRAIN.OPTIMIZER.NAME == "SGD": @@ -294,77 +427,120 @@ def main_worker(*args): # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 5. 
Load pretrained model / load resumt model and optimizer states + # STEP 6: Load pretrained model / load resumt model and optimizer states if config.MODEL.PRETRAINED: if (config.MODEL.PRETRAINED).endswith('.pdparams'): raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) - logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + local_logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + if local_rank == 0: + master_logger.info( + f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) - logger.info( - f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + local_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + if local_rank == 0: + master_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") - # 6. Validation + # STEP 7: Validation (eval mode) if config.EVAL: - logger.info('----- Start Validating') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info('----- Start Validating') + if local_rank == 0: + master_logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") return - # 6. Start training and validation - logging.info(f"Start training from epoch {last_epoch+1}.") + # STEP 8: Start training and validation (train mode) + local_logger.info(f"Start training from epoch {last_epoch+1}.") + if local_rank == 0: + master_logger.info(f"Start training from epoch {last_epoch+1}.") for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") - train_loss, train_acc, train_time = train(dataloader=dataloader_train, - model=model, - criterion=criterion, - optimizer=optimizer, - epoch=epoch, - total_batch=total_batch_train, - debug_steps=config.REPORT_FREQ, - accum_iter=config.TRAIN.ACCUM_ITER, - amp=config.AMP) + local_logger.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + if local_rank == 0: + master_logger.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, avg_loss, avg_acc, train_time = train( + dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + mixup_fn=mixup_fn, + amp=config.AMP, + local_logger=local_logger, + master_logger=master_logger) + scheduler.step() - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Train Loss: {train_loss:.4f}, " + - f"Train Acc: {train_acc:.4f}, " + - f"time: {train_time:.2f}") + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {avg_loss:.4f}, " + + f"Train Acc: {avg_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: - logger.info(f'----- Validation after Epoch: {epoch}') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info(f'----- Validation after Epoch: {epoch}') + if local_rank == 0: + master_logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") # model save if local_rank == 0: if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: @@ -372,18 +548,33 @@ def main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - logger.info(f"----- Save model: {model_path}.pdparams") - logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") def main(): + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + + # set output folder + if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + + if not os.path.exists(config.SAVE): + 
os.makedirs(config.SAVE, exist_ok=True) + + # get dataset and start DDP if not config.EVAL: dataset_train = get_dataset(config, mode='train') else: dataset_train = None dataset_val = get_dataset(config, mode='val') config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS - dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + dist.spawn(main_worker, args=(config, dataset_train, dataset_val, ), nprocs=config.NGPUS) if __name__ == "__main__": diff --git a/image_classification/ResMLP/main_single_gpu.py b/image_classification/ResMLP/main_single_gpu.py index 3e0d3624..ded94338 100644 --- a/image_classification/ResMLP/main_single_gpu.py +++ b/image_classification/ResMLP/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,55 +26,54 @@ import paddle.nn.functional as F from datasets import get_dataloader from datasets import get_dataset -from resmlp import build_res_mlp as build_model from utils import AverageMeter from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from resmlp import build_res_mlp as build_model -parser = argparse.ArgumentParser('ResMLP') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -args = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, args) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) - -#config.freeze() - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('ResMLP') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + 
parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -83,49 +81,57 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + mixup_fn=None, + amp=False, + logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info - accum_iter: int, num of iters for accumulating gradients - amp: bool, if True, use mix precision training + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + accum_iter: int, num of iters for accumulating gradients, default: 1 + mixup_fn: Mixup, mixup instance, default: None + amp: bool, if True, use mix precision training, default: False + logger: logger for logging, default: None Returns: - train_loss_meter.avg - train_acc_meter.avg - train_time + train_loss_meter.avg: float, average loss on current process/gpu + train_acc_meter.avg: float, average top1 accuracy on current process/gpu + train_time: float, training time """ model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() - for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - - else: + else: # full precision training output = model(image) loss = criterion(output, label) #NOTE: division may be needed depending on the loss function @@ -139,15 +145,18 @@ def train(dataloader, optimizer.clear_grad() pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) batch_size = image.shape[0] 
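# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the train() function above mixes
# three concerns that interact: mixup (soft targets feed the loss while the
# original hard labels feed the accuracy metric), mixed precision through
# paddle.amp.GradScaler, and gradient accumulation over `accum_iter` batches.
# A stripped-down version of that inner loop, using only calls that already
# appear in this patch, could look like this:
def _train_loop_sketch(dataloader, model, criterion, optimizer, mixup_fn=None, accum_iter=1):
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
    model.train()
    for batch_id, (image, label) in enumerate(dataloader):
        label_orig = label.clone()                 # keep hard labels for metrics
        if mixup_fn is not None:
            image, label = mixup_fn(image, label_orig)  # label becomes a soft target
        with paddle.amp.auto_cast():               # AMP forward pass
            output = model(image)
            loss = criterion(output, label)
        scaled = scaler.scale(loss)
        scaled.backward()
        # step the optimizer only every accum_iter batches (or on the last batch)
        if (batch_id + 1) % accum_iter == 0 or (batch_id + 1) == len(dataloader):
            scaler.minimize(optimizer, scaled)
            optimizer.clear_grad()
# ---------------------------------------------------------------------------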
train_loss_meter.update(loss.numpy()[0], batch_size) train_acc_meter.update(acc.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + f"Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {train_loss_meter.avg:.4f}, " + f"Avg Acc: {train_acc_meter.avg:.4f}") @@ -156,19 +165,20 @@ def train(dataloader, return train_loss_meter.avg, train_acc_meter.avg, train_time -def validate(dataloader, model, criterion, total_batch, debug_steps=100): +def validate(dataloader, model, criterion, total_batch, debug_steps=100, logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + logger: logger for logging, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + val_time: float, valitaion time """ model.eval() val_loss_meter = AverageMeter() @@ -193,7 +203,7 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): val_acc1_meter.update(acc1.numpy()[0], batch_size) val_acc5_meter.update(acc5.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( f"Val Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {val_loss_meter.avg:.4f}, " + @@ -205,25 +215,77 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): def main(): - # 0. Preparation + # STEP 0: Preparation + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + # set output folder + if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) last_epoch = config.TRAIN.LAST_EPOCH seed = config.SEED paddle.seed(seed) np.random.seed(seed) random.seed(seed) - #paddle.set_device('gpu:0') - # 1. Create model + logger = get_logger(filename=os.path.join(config.SAVE, 'log.txt')) + logger.info(f'\n{config}') + + # STEP 1: Create model model = build_model(config) - # 2. Create train and val dataloader + + # STEP 2: Create train and val dataloader if not config.EVAL: dataset_train = get_dataset(config, mode='train') dataloader_train = get_dataloader(config, dataset_train, 'train', False) dataset_val = get_dataset(config, mode='val') dataloader_val = get_dataloader(config, dataset_val, 'val', False) - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. 
Define lr_scheduler + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -232,8 +294,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, T_max=config.TRAIN.NUM_EPOCHS, @@ -245,9 +306,9 @@ def main(): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") - # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": if config.TRAIN.GRAD_CLIP: clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) @@ -267,18 +328,21 @@ def main(): optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, - weight_decay=config.TRAIN.WEIGHT_DECAY, beta1=config.TRAIN.OPTIMIZER.BETAS[0], beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 6. 
Load pretrained model or load resume model and optimizer states + + # STEP 6: Load pretrained model or load resume model and optimizer states if config.MODEL.PRETRAINED: - assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') - model_state = paddle.load(config.MODEL.PRETRAINED + '.pdparams') + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") @@ -291,35 +355,40 @@ def main(): optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") - # 7. Validation + + # STEP 7: Validation (eval mode) if config.EVAL: logger.info('----- Start Validating') val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + f"Validation Acc@5: {val_acc5:.4f}, " + f"time: {val_time:.2f}") return - # 8. Start training and validation - logging.info(f"Start training from epoch {last_epoch + 1}.") - for epoch in range(last_epoch + 1, config.TRAIN.NUM_EPOCHS + 1): + + # STEP 8: Start training and validation (train mode) + logger.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + logger.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") train_loss, train_acc, train_time = train(dataloader=dataloader_train, model=model, criterion=criterion, optimizer=optimizer, epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, total_batch=len(dataloader_train), debug_steps=config.REPORT_FREQ, accum_iter=config.TRAIN.ACCUM_ITER, + mixup_fn=mixup_fn, amp=config.AMP, - ) + logger=logger) scheduler.step() logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Train Loss: {train_loss:.4f}, " + @@ -331,9 +400,10 @@ def main(): val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + diff --git a/image_classification/ResMLP/mixup.py b/image_classification/ResMLP/mixup.py new file mode 100644 index 00000000..1d2db493 --- /dev/null +++ b/image_classification/ResMLP/mixup.py @@ -0,0 +1,225 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""mixup and cutmix for batch data""" +import numpy as np +import paddle + + +def rand_bbox(image_shape, lam, count=None): + """ CutMix bbox by lam value + Generate 1 random bbox by value lam. lam is the cut size rate. + The cut_size is computed by sqrt(1-lam) * image_size. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + count: int, number of bbox to generate + """ + image_h, image_w = image_shape[-2:] + cut_rate = np.sqrt(1. - lam) + cut_h = int(cut_rate * image_h) + cut_w = int(cut_rate * image_w) + + # get random bbox center + cy = np.random.randint(0, image_h, size=count) + cx = np.random.randint(0, image_w, size=count) + + # get bbox coords + bbox_x1 = np.clip(cx - cut_w // 2, 0, image_w) + bbox_y1 = np.clip(cy - cut_h // 2, 0, image_h) + bbox_x2 = np.clip(cx + cut_w // 2, 0, image_w) + bbox_y2 = np.clip(cy + cut_h // 2, 0, image_h) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # while in pytorch, it will return [] tensor + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def rand_bbox_minmax(image_shape, minmax, count=None): + """ CutMix bbox by min and max value + Generate 1 random bbox by min and max percentage values. + Minmax is a tuple/list of min and max percentage vlaues + applied to the image width and height. + + Args: + image_shape: tuple/list, image height and width + minmax: tuple/list, min and max percentage values of image size + count: int, number of bbox to generate + """ + assert len(minmax) == 2 + image_h, image_w = image_shape[-2:] + min_ratio = minmax[0] + max_ratio = minmax[1] + cut_h = np.random.randint(int(image_h * min_ratio), int(image_h * max_ratio), size=count) + cut_w = np.random.randint(int(image_w * min_ratio), int(image_w * max_ratio), size=count) + + bbox_x1 = np.random.randint(0, image_w - cut_w, size=count) + bbox_y1 = np.random.randint(0, image_h - cut_h, size=count) + bbox_x2 = bbox_x1 + cut_w + bbox_y2 = bbox_y1 + cut_h + + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def cutmix_generate_bbox_adjust_lam(image_shape, lam, minmax=None, correct_lam=True, count=None): + """Generate bbox and apply correction for lambda + If the mimmax is None, apply the standard cutmix by lam value, + If the minmax is set, apply the cutmix by min and max percentage values. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + minmax: tuple/list, min and max percentage values of image size + correct_lam: bool, if True, correct the lam value by the generated bbox + count: int, number of bbox to generate + """ + if minmax is not None: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox_minmax(image_shape, minmax, count) + else: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox(image_shape, lam, count) + + if correct_lam or minmax is not None: + image_h, image_w = image_shape[-2:] + bbox_area = (bbox_y2 - bbox_y1) * (bbox_x2 - bbox_x1) + lam = 1. - bbox_area / float(image_h * image_w) + return (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam + + +def one_hot(x, num_classes, on_value=1., off_value=0.): + """ Generate one-hot vector for label smoothing + Args: + x: tensor, contains label/class indices + num_classes: int, num of classes (len of the one-hot vector) + on_value: float, the vector value at label index, default=1. + off_value: float, the vector value at non-label indices, default=0. 
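# ---------------------------------------------------------------------------
# Worked example (not part of the patch): how the bbox helpers above interact
# with lam. rand_bbox() cuts a patch whose side is sqrt(1 - lam) of each image
# side, and cutmix_generate_bbox_adjust_lam() then re-derives lam from the
# clipped box so the label mix matches the pixels actually replaced. For a
# 224x224 image and a sampled lam of 0.64 (numbers chosen only for illustration):
#     cut_rate = sqrt(1 - 0.64) = 0.6  ->  cut_h = cut_w = int(0.6 * 224) = 134
#     unclipped bbox_area = 134 * 134 = 17956
#     corrected lam = 1 - 17956 / (224 * 224) = 1 - 0.358 ~= 0.642
# The corrected value stays close to the sampled one unless the random center
# pushes the box off the image border, in which case the label weight shrinks
# together with the cut area.
# ---------------------------------------------------------------------------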
+ Returns: + one_hot: tensor, tensor with on value at label index and off value + at non-label indices. + """ + x = x.reshape_([-1, 1]) + x_smoothed = paddle.full((x.shape[0], num_classes), fill_value=off_value) + for i in range(x.shape[0]): + x_smoothed[i, x[i]] = on_value + return x_smoothed + + +def mixup_one_hot(label, num_classes, lam=1., smoothing=0.): + """ mixup and label smoothing in batch + label smoothing is firstly applied, then + mixup is applied by mixing the bacth and its flip, + with a mixup rate. + + Args: + label: tensor, label tensor with shape [N], contains the class indices + num_classes: int, num of all classes + lam: float, mixup rate, default=1.0 + smoothing: float, label smoothing rate + """ + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + y1 = one_hot(label, num_classes, on_value, off_value) + y2 = one_hot(label.flip(axis=[0]), num_classes, on_value, off_value) + return y2 * (1 - lam) + y1 * lam + + +class Mixup: + """Mixup class + Args: + mixup_alpha: float, mixup alpha for beta distribution, default=1.0, + cutmix_alpha: float, cutmix alpha for beta distribution, default=0.0, + cutmix_minmax: list/tuple, min and max value for cutmix ratio, default=None, + prob: float, if random prob < prob, do not use mixup, default=1.0, + switch_prob: float, prob of switching mixup and cutmix, default=0.5, + mode: string, mixup up, now only 'batch' is supported, default='batch', + correct_lam: bool, if True, apply correction of lam, default=True, + label_smoothing: float, label smoothing rate, default=0.1, + num_classes: int, num of classes, default=1000 + """ + def __init__(self, + mixup_alpha=1.0, + cutmix_alpha=0.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + correct_lam=True, + label_smoothing=0.1, + num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if cutmix_minmax is not None: + assert len(cutmix_minmax) == 2 + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam + assert mode == 'batch', 'Now only batch mode is supported!' + + def __call__(self, x, target): + assert x.shape[0] % 2 == 0, "Batch size should be even" + lam = self._mix_batch(x) + target = mixup_one_hot(target, self.num_classes, lam, self.label_smoothing) + return x, target + + def get_params(self): + """Decide to use cutmix or regular mixup by sampling and + sample lambda for mixup + """ + lam = 1. + use_cutmix = False + use_mixup = np.random.rand() < self.mix_prob + if use_mixup: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + alpha = self.cutmix_alpha if use_cutmix else self.mixup_alpha + lam_mix = np.random.beta(alpha, alpha) + elif self.mixup_alpha == 0. and self.cutmix_alpha > 0.: + use_cutmix=True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + elif self.mixup_alpha > 0. 
and self.cutmix_alpha == 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + else: + raise ValueError('mixup_alpha and cutmix_alpha cannot be all 0') + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_batch(self, x): + """mixup/cutmix by adding batch data and its flipped version""" + lam, use_cutmix = self.get_params() + if lam == 1.: + return lam + if use_cutmix: + (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam = cutmix_generate_bbox_adjust_lam( + x.shape, + lam, + minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueError, + # but in pytorch, it will return [] tensor without errors + if int(bbox_x1) != int(bbox_x2) and int(bbox_y1) != int(bbox_y2): + x[:, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] = x.flip(axis=[0])[ + :, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] + else: + x_flipped = x.flip(axis=[0]) + x_flipped = x_flipped * (1 - lam) + x.set_value(x * (lam) + x_flipped) + return lam diff --git a/image_classification/ResMLP/random_erasing.py b/image_classification/ResMLP/random_erasing.py new file mode 100644 index 00000000..31eea465 --- /dev/null +++ b/image_classification/ResMLP/random_erasing.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Random Erasing for image tensor""" + +import random +import math +import paddle + + +def _get_pixels(per_pixel, rand_color, patch_size, dtype="float32"): + if per_pixel: + return paddle.normal(shape=patch_size).astype(dtype) + if rand_color: + return paddle.normal(shape=(patch_size[0], 1, 1)).astype(dtype) + return paddle.zeros((patch_size[0], 1, 1)).astype(dtype) + + +class RandomErasing(object): + """ + Args: + prob: probability of performing random erasing + min_area: Minimum percentage of erased area wrt input image area + max_area: Maximum percentage of erased area wrt input image area + min_aspect: Minimum aspect ratio of erased area + max_aspect: Maximum aspect ratio of erased area + mode: pixel color mode, in ['const', 'rand', 'pixel'] + 'const' - erase block is constant valued 0 for all channels + 'rand' - erase block is valued random color (same per-channel) + 'pixel' - erase block is valued random color per pixel + min_count: Minimum # of erasing blocks per image. + max_count: Maximum # of erasing blocks per image.
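# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the Mixup class above is what
# main() builds in STEP 3 and passes to train(); because mixup_one_hot() returns
# soft target vectors, the training loss has to be SoftTargetCrossEntropyLoss
# rather than plain CrossEntropyLoss. With label_smoothing=0.1 and
# num_classes=1000 the off-value is 0.1 / 1000 = 0.0001, the on-value is
# 1 - 0.1 + 0.0001 = 0.9001, and two such vectors are blended with weights lam
# and (1 - lam). The hyperparameter values below mirror the TRAIN.* defaults
# used elsewhere in this patch; the batch size must be even, as Mixup asserts.
def _mixup_step_sketch(model, image, label):
    mixup_fn = Mixup(mixup_alpha=0.8,
                     cutmix_alpha=1.0,
                     prob=1.0,
                     switch_prob=0.5,
                     label_smoothing=0.1,
                     num_classes=1000)
    criterion = SoftTargetCrossEntropyLoss()
    image, soft_target = mixup_fn(image, label)    # label: [N] ints -> target: [N, 1000]
    loss = criterion(model(image), soft_target)
    return loss
# ---------------------------------------------------------------------------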
Area per box is scaled by count + per-image count is randomly chosen between min_count to max_count + """ + def __init__(self, prob=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0): + self.prob = prob + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == "rand": + self.rand_color = True + elif mode == "pixel": + self.per_pixel = True + else: + assert not mode or mode == "const" + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.prob: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top+h, left:left+w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + + +#def main(): +# re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='rand') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='const') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='pixel') +# import PIL.Image as Image +# import numpy as np +# paddle.set_device('cpu') +# img = paddle.to_tensor(np.asarray(Image.open('./lenna.png'))).astype('float32') +# img = img / 255.0 +# img = paddle.transpose(img, [2, 0, 1]) +# new_img = re(img) +# new_img = new_img * 255.0 +# new_img = paddle.transpose(new_img, [1, 2, 0]) +# new_img = new_img.cpu().numpy() +# new_img = Image.fromarray(new_img.astype('uint8')) +# new_img.save('./res.png') +# +# +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/ResMLP/resmlp.py b/image_classification/ResMLP/resmlp.py index 867c0bfd..9ea3f200 100644 --- a/image_classification/ResMLP/resmlp.py +++ b/image_classification/ResMLP/resmlp.py @@ -227,5 +227,5 @@ def build_res_mlp(config): embed_dim=config.MODEL.MIXER.HIDDEN_SIZE, mlp_ratio=4, dropout=config.MODEL.DROPOUT, - droppath=config.MODEL.DROPPATH) + droppath=config.MODEL.DROP_PATH) return model diff --git a/image_classification/ResMLP/transforms.py b/image_classification/ResMLP/transforms.py new file mode 100644 index 00000000..5a046912 --- /dev/null +++ b/image_classification/ResMLP/transforms.py @@ -0,0 +1,14 @@ +import random +import paddle +import paddle.nn +import paddle.vision.transforms as T + + +class RandomHorizontalFlip(): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, image): + if random.random() < self.p: + return T.hflip(image) + return image diff --git 
a/image_classification/Shuffle_Transformer/.config.py.swp b/image_classification/Shuffle_Transformer/.config.py.swp deleted file mode 100644 index e144ebf1b9870d80e20353db8d33673348ff1f43..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 [binary swap-file payload omitted] diff --git a/image_classification/Shuffle_Transformer/augment.py b/image_classification/Shuffle_Transformer/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/Shuffle_Transformer/augment.py +++ b/image_classification/Shuffle_Transformer/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/Shuffle_Transformer/config.py b/image_classification/Shuffle_Transformer/config.py
index 9444e778..8910dcdf 100644 --- a/image_classification/Shuffle_Transformer/config.py +++ b/image_classification/Shuffle_Transformer/config.py @@ -70,11 +70,12 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 20 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.BASE_LR = 0.001 _C.TRAIN.WARMUP_START_LR = 5e-7 _C.TRAIN.END_LR = 5e-6 _C.TRAIN.GRAD_CLIP = 5.0 _C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -98,34 +99,21 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" _C.SAVE_FREQ = 1 # freq to save chpt _C.REPORT_FREQ = 50 # freq to logging info _C.VALIDATE_FREQ = 10 # freq to do validation -_C.SEED = 0 +_C.SEED = 42 _C.EVAL = False # run evaluation only _C.AMP = False _C.LOCAL_RANK = 0 @@ -145,6 +133,7 @@ def _update_config_from_file(config, cfg_file): config.merge_from_file(cfg_file) config.freeze() + def update_config(config, args): """Update config by ArgumentParser Args: diff --git a/image_classification/Shuffle_Transformer/datasets.py b/image_classification/Shuffle_Transformer/datasets.py index b120fa00..6406193a 100644 --- a/image_classification/Shuffle_Transformer/datasets.py +++ b/image_classification/Shuffle_Transformer/datasets.py @@ -27,6 +27,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -99,9 +101,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/Shuffle_Transformer/main_multi_gpu.py b/image_classification/Shuffle_Transformer/main_multi_gpu.py index 98b9a762..28b17520 100644 --- a/image_classification/Shuffle_Transformer/main_multi_gpu.py +++ b/image_classification/Shuffle_Transformer/main_multi_gpu.py @@ -35,7 +35,6 @@ from mixup import Mixup from losses 
import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from shuffle_transformer import build_shuffle_transformer as build_model @@ -132,7 +131,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -358,22 +357,23 @@ def main_worker(*args): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -447,9 +447,9 @@ def main_worker(*args): f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) @@ -550,11 +550,8 @@ def main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - local_logger.info(f"----- Save model: {model_path}.pdparams") - local_logger.info(f"----- Save optim: {model_path}.pdopt") - if local_rank == 0: - master_logger.info(f"----- Save model: {model_path}.pdparams") - 
master_logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") def main(): diff --git a/image_classification/Shuffle_Transformer/main_single_gpu.py b/image_classification/Shuffle_Transformer/main_single_gpu.py index 5953f224..87fb4feb 100644 --- a/image_classification/Shuffle_Transformer/main_single_gpu.py +++ b/image_classification/Shuffle_Transformer/main_single_gpu.py @@ -35,7 +35,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from shuffle_transformer import build_shuffle_transformer as build_model @@ -126,7 +125,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -269,19 +268,23 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -291,8 +294,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, T_max=config.TRAIN.NUM_EPOCHS, @@ -348,9 +350,9 @@ def main(): logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert 
os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) diff --git a/image_classification/Shuffle_Transformer/run_train_multi.sh b/image_classification/Shuffle_Transformer/run_train_multi.sh index 87e12acc..679fd0b2 100644 --- a/image_classification/Shuffle_Transformer/run_train_multi.sh +++ b/image_classification/Shuffle_Transformer/run_train_multi.sh @@ -4,4 +4,4 @@ python main_multi_gpu.py \ -dataset='imagenet2012' \ -batch_size=8 \ -data_path='/dataset/imagenet' \ --amp +#-amp diff --git a/image_classification/Shuffle_Transformer/shuffle_transformer.py b/image_classification/Shuffle_Transformer/shuffle_transformer.py index e9253b00..6f6287fd 100644 --- a/image_classification/Shuffle_Transformer/shuffle_transformer.py +++ b/image_classification/Shuffle_Transformer/shuffle_transformer.py @@ -70,8 +70,8 @@ def __init__(self, self.num_patches = (image_size // 4) * (image_size // 4) def _init_weights_batchnorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, inputs): @@ -527,6 +527,7 @@ def build_shuffle_transformer(config): """ build shuffle transformer using config""" model = ShuffleTransformer(image_size=config.DATA.IMAGE_SIZE, embed_dim=config.MODEL.TRANS.EMBED_DIM, + num_classes=config.MODEL.NUM_CLASSES, mlp_ratio=config.MODEL.TRANS.MLP_RATIO, layers=config.MODEL.TRANS.DEPTHS, num_heads=config.MODEL.TRANS.NUM_HEADS, diff --git a/image_classification/SwinTransformer/augment.py b/image_classification/SwinTransformer/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/SwinTransformer/augment.py +++ b/image_classification/SwinTransformer/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/SwinTransformer/config.py b/image_classification/SwinTransformer/config.py index 232771fb..6a041129 100644 --- a/image_classification/SwinTransformer/config.py +++ b/image_classification/SwinTransformer/config.py @@ -70,11 +70,12 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 20 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.BASE_LR = 0.001 _C.TRAIN.WARMUP_START_LR = 5e-7 _C.TRAIN.END_LR = 5e-6 _C.TRAIN.GRAD_CLIP = 5.0 _C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -98,27 +99,14 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False 
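# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the new TRAIN.RAND_AUGMENT flag
# (with AUTO_AUGMENT now defaulting to False) is consumed by
# get_train_transforms() as a three-way choice: AutoAugment if enabled,
# otherwise RandAugment, otherwise plain ColorJitter. Separately, when
# TRAIN.LINEAR_SCALED_LR is set, the learning rates are rescaled by
# BATCH_SIZE / LINEAR_SCALED_LR (times world size in the multi-GPU scripts),
# e.g. BASE_LR=0.001 with BATCH_SIZE=256 and LINEAR_SCALED_LR=512 gives an
# effective base LR of 0.0005 (batch size and divisor here are hypothetical).
# The transform selection, using the helpers added in augment.py and the
# transforms module imported in datasets.py:
def _pick_train_augment_sketch(config):
    if config.TRAIN.AUTO_AUGMENT:
        policy = auto_augment_policy_original()
        return AutoAugment(policy)
    if config.TRAIN.RAND_AUGMENT:
        policy = rand_augment_policy_original()    # magnitude_idx defaults to 9
        return RandAugment(policy)                 # num_layers defaults to 2
    jitter = (float(config.TRAIN.COLOR_JITTER),) * 3
    return transforms.ColorJitter(*jitter)
# ---------------------------------------------------------------------------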
_C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' - # misc _C.SAVE = "./output" _C.TAG = "default" @@ -145,6 +133,7 @@ def _update_config_from_file(config, cfg_file): config.merge_from_file(cfg_file) config.freeze() + def update_config(config, args): """Update config by ArgumentParser Args: @@ -161,6 +150,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/SwinTransformer/datasets.py b/image_classification/SwinTransformer/datasets.py index 5b862955..304df9a3 100644 --- a/image_classification/SwinTransformer/datasets.py +++ b/image_classification/SwinTransformer/datasets.py @@ -19,6 +19,7 @@ import os import math +from PIL import Image from paddle.io import Dataset from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -27,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -99,6 +102,10 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 aug_op_list.append(transforms.ColorJitter(*jitter)) diff --git a/image_classification/SwinTransformer/main_multi_gpu.py b/image_classification/SwinTransformer/main_multi_gpu.py index a98708e1..66de5514 100644 --- a/image_classification/SwinTransformer/main_multi_gpu.py +++ b/image_classification/SwinTransformer/main_multi_gpu.py @@ -35,7 +35,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from swin_transformer import build_swin as build_model @@ -49,6 +48,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -105,11 +105,9 @@ def train(dataloader, local_logger: logger for 
local process/gpu, default: None master_logger: logger for main process, default: None Returns: - train_loss_meter.avg: float, average loss on current process/gpu - train_acc_meter.avg: float, average top1 accuracy on current process/gpu - master_train_loss_meter.avg: float, average loss on all processes/gpus - master_train_acc_meter.avg: float, average top1 accuracy on all processes/gpus - train_time: float, training time + train_loss_meter.avg + train_acc_meter.avg + train_time """ model.train() train_loss_meter = AverageMeter() @@ -132,7 +130,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -358,22 +356,23 @@ def main_worker(*args): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * - config.DATA.BATCH_SIZE * dist.get_world_size()) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -447,9 +446,9 @@ def main_worker(*args): f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) diff --git a/image_classification/SwinTransformer/main_single_gpu.py 
b/image_classification/SwinTransformer/main_single_gpu.py index e7580764..922bee47 100644 --- a/image_classification/SwinTransformer/main_single_gpu.py +++ b/image_classification/SwinTransformer/main_single_gpu.py @@ -1,4 +1,3 @@ - # Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -35,7 +34,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from swin_transformer import build_swin as build_model @@ -49,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) @@ -126,7 +125,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -269,19 +268,23 @@ def main(): criterion_val = nn.CrossEntropyLoss() # STEP 5: Define optimizer and lr_scheduler - # set lr according to batch size and world size (hacked from official code) - linear_scaled_lr = (config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_warmup_start_lr = (config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / 512.0 - linear_scaled_end_lr = (config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / 512.0 - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": @@ -291,8 +294,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, 
T_max=config.TRAIN.NUM_EPOCHS, @@ -348,11 +350,11 @@ def main(): logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") diff --git a/image_classification/SwinTransformer/run_train_multi.sh b/image_classification/SwinTransformer/run_train_multi.sh index 3722db67..bb9b7f5b 100644 --- a/image_classification/SwinTransformer/run_train_multi.sh +++ b/image_classification/SwinTransformer/run_train_multi.sh @@ -1,7 +1,7 @@ -CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ python main_multi_gpu.py \ -cfg='./configs/swin_tiny_patch4_window7_224.yaml' \ -dataset='imagenet2012' \ --batch_size=100 \ +-batch_size=16 \ -data_path='/dataset/imagenet' \ --amp +#-amp diff --git a/image_classification/SwinTransformer/swin_transformer.py b/image_classification/SwinTransformer/swin_transformer.py index c7b0c16c..2dde9459 100644 --- a/image_classification/SwinTransformer/swin_transformer.py +++ b/image_classification/SwinTransformer/swin_transformer.py @@ -68,8 +68,8 @@ def __init__(self, image_size=224, patch_size=4, in_channels=3, embed_dim=96): bias_attr=b_attr) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -110,13 +110,13 @@ def __init__(self, input_resolution, dim): bias_attr=b_attr_2) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -169,7 +169,7 @@ def __init__(self, in_features, hidden_features, dropout): def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -250,7 +250,7 @@ def __init__(self, def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = 
paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def transpose_multihead(self, x): @@ -430,8 +430,8 @@ def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0 self.register_buffer("attn_mask", attn_mask) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -633,13 +633,13 @@ def __init__(self, bias_attr=b_attr_2) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward_features(self, x): diff --git a/image_classification/T2T_ViT/augment.py b/image_classification/T2T_ViT/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/T2T_ViT/augment.py +++ b/image_classification/T2T_ViT/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/T2T_ViT/config.py b/image_classification/T2T_ViT/config.py index 51ee65bd..c506e56c 100644 --- a/image_classification/T2T_ViT/config.py +++ b/image_classification/T2T_ViT/config.py @@ -62,16 +62,17 @@ # training settings _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 -_C.TRAIN.NUM_EPOCHS = 300 -_C.TRAIN.WARMUP_EPOCHS = 10 -_C.TRAIN.WEIGHT_DECAY = 0.03 -_C.TRAIN.BASE_LR = 0.003 +_C.TRAIN.NUM_EPOCHS = 310 +_C.TRAIN.WARMUP_EPOCHS = 5 +_C.TRAIN.WEIGHT_DECAY = 3e-2 +_C.TRAIN.BASE_LR = 1e-3 _C.TRAIN.WARMUP_START_LR = 1e-6 _C.TRAIN.END_LR = 1e-5 _C.TRAIN.GRAD_CLIP = None _C.TRAIN.ACCUM_ITER = 1 _C.TRAIN.MODEL_EMA = True _C.TRAIN.MODEL_EMA_DECAY = 0.99996 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -86,35 +87,23 @@ _C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 # train augmentation -_C.TRAIN.MIXUP_ALPHA = 0.8 -_C.TRAIN.CUTMIX_ALPHA = 1.0 -_C.TRAIN.CUTMIX_MINMAX = None -_C.TRAIN.MIXUP_PROB = 1.0 -_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 -_C.TRAIN.MIXUP_MODE = 'batch' +_C.TRAIN.MIXUP_ALPHA = 0.8 # mixup alpha, enabled if >0 +_C.TRAIN.CUTMIX_ALPHA = 1.0 # cutmix alpha, enabled if >0 +_C.TRAIN.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha +_C.TRAIN.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled +_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 # prob of 
switching cutmix when both mixup and cutmix enabled +_C.TRAIN.MIXUP_MODE = 'batch' # how to apply mixup/cutmix params, per 'batch', 'pair' or 'elem' _C.TRAIN.SMOOTHING = 0.1 -_C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.COLOR_JITTER = 0.4 # color jitter factor +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False -_C.TRAIN.RANDOM_ERASE_PROB = 0.25 -_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' -_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 # random erase prob +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' # random erase mode +_C.TRAIN.RANDOM_ERASE_COUNT = 1 # random erase count _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' # misc _C.SAVE = "./output" diff --git a/image_classification/T2T_ViT/datasets.py b/image_classification/T2T_ViT/datasets.py index b120fa00..7e178b57 100644 --- a/image_classification/T2T_ViT/datasets.py +++ b/image_classification/T2T_ViT/datasets.py @@ -19,6 +19,7 @@ import os import math +from PIL import Image from paddle.io import Dataset from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -27,6 +28,8 @@ from paddle.vision import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -69,7 +72,7 @@ def __len__(self): return len(self.label_list) def __getitem__(self, index): - data = image_load(self.img_path_list[index]).convert('RGB') + data = Image.open(self.img_path_list[index]).convert('RGB') data = self.transform(data) label = self.label_list[index] @@ -99,9 +102,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/T2T_ViT/main_single_gpu.py b/image_classification/T2T_ViT/main_single_gpu.py index ab0c96ec..4ba9753b 100644 --- a/image_classification/T2T_ViT/main_single_gpu.py +++ b/image_classification/T2T_ViT/main_single_gpu.py @@ -18,6 +18,7 @@ import os import time import logging +import copy import argparse import random import numpy as np diff --git a/image_classification/T2T_ViT/run_train_multi.sh b/image_classification/T2T_ViT/run_train_multi.sh index 3ee14a05..c6d502b5 100644 --- 
a/image_classification/T2T_ViT/run_train_multi.sh +++ b/image_classification/T2T_ViT/run_train_multi.sh @@ -4,4 +4,4 @@ python main_multi_gpu.py \ -dataset='imagenet2012' \ -batch_size=16 \ -data_path='/dataset/imagenet' \ --amp +#-amp diff --git a/image_classification/T2T_ViT/t2t_vit.py b/image_classification/T2T_ViT/t2t_vit.py index cbf723d2..549d13c8 100644 --- a/image_classification/T2T_ViT/t2t_vit.py +++ b/image_classification/T2T_ViT/t2t_vit.py @@ -130,7 +130,7 @@ def __init__(self, def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -196,7 +196,7 @@ def __init__(self, in_features, hidden_features=None, out_features=None, dropout def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -256,7 +256,7 @@ def __init__(self, def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def transpose_multihead(self, x): @@ -335,8 +335,8 @@ def __init__(self, dropout=dropout) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -411,13 +411,13 @@ def __init__(self, dim, in_dim, num_heads=1, kernel_ratio=0.5, dropout=0.1): default_initializer=nn.initializer.Assign(self.w / math.sqrt(self.m))) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def _init_weights(self): weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr # paddle version 2.1 does not support einsum @@ -512,8 +512,8 @@ def __init__(self, dropout=dropout) def _init_weights_layernorm(self): - weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1)) - bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0)) + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(1.0)) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(0.0)) return weight_attr, bias_attr def forward(self, x): @@ -641,11 +641,17 @@ def forward(self, x): def build_t2t_vit(config): """build t2t-vit model using 
config""" model = T2TViT(image_size=config.DATA.IMAGE_SIZE, + in_channels=3, + num_classes=config.MODEL.NUM_CLASSES, token_type=config.MODEL.TRANS.TOKEN_TYPE, embed_dim=config.MODEL.TRANS.EMBED_DIM, depth=config.MODEL.TRANS.DEPTH, num_heads=config.MODEL.TRANS.NUM_HEADS, mlp_ratio=config.MODEL.TRANS.MLP_RATIO, qk_scale=config.MODEL.TRANS.QK_SCALE, - qkv_bias=config.MODEL.TRANS.QKV_BIAS) + qkv_bias=config.MODEL.TRANS.QKV_BIAS, + dropout=config.MODEL.DROPOUT, + attention_dropout=config.MODEL.ATTENTION_DROPOUT, + droppath=config.MODEL.DROPPATH, + token_dim=64) return model diff --git a/image_classification/T2T_ViT/utils.py b/image_classification/T2T_ViT/utils.py index 3f020435..24313440 100644 --- a/image_classification/T2T_ViT/utils.py +++ b/image_classification/T2T_ViT/utils.py @@ -128,7 +128,8 @@ def orthogonal(t, gain=1.): gain = paddle.to_tensor(gain) rows = t.shape[0] - cols = t.numel() // rows + cols = np.size(t) // rows + #cols = paddle.numel(t) // rows flattened = paddle.normal(0, 1, [rows, cols]) if rows < cols: diff --git a/image_classification/ViP/augment.py b/image_classification/ViP/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/ViP/augment.py +++ b/image_classification/ViP/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/ViP/config.py b/image_classification/ViP/config.py index 30a2d4c9..e4b47b1c 100644 --- a/image_classification/ViP/config.py +++ b/image_classification/ViP/config.py @@ -63,7 +63,7 @@ _C.TRAIN.NUM_EPOCHS = 300 _C.TRAIN.WARMUP_EPOCHS = 20 _C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 2e-3 +_C.TRAIN.BASE_LR = 1e-3 _C.TRAIN.WARMUP_START_LR = 1e-6 _C.TRAIN.END_LR = 5e-6 _C.TRAIN.GRAD_CLIP = 5.0 @@ -93,25 +93,13 @@ _C.TRAIN.SMOOTHING = 0.1 _C.TRAIN.COLOR_JITTER = 0.4 _C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False #'rand-m9-mstd0.5-inc1' _C.TRAIN.RANDOM_ERASE_PROB = 0.25 _C.TRAIN.RANDOM_ERASE_MODE = 'pixel' _C.TRAIN.RANDOM_ERASE_COUNT = 1 _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' # misc _C.SAVE = "./output" diff --git a/image_classification/ViP/datasets.py b/image_classification/ViP/datasets.py index 064faebe..9b8cbd2d 100644 --- a/image_classification/ViP/datasets.py +++ b/image_classification/ViP/datasets.py @@ -19,6 +19,7 @@ import os import math +from PIL import Image from paddle.io import Dataset from paddle.io import DataLoader from paddle.io import DistributedBatchSampler @@ -27,6 +28,8 @@ from paddle.vision 
import image_load from augment import auto_augment_policy_original from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment from transforms import RandomHorizontalFlip from random_erasing import RandomErasing @@ -68,7 +71,7 @@ def __len__(self): return len(self.label_list) def __getitem__(self, index): - data = image_load(self.img_path_list[index]).convert('RGB') + data = Image.open(self.img_path_list[index]).convert('RGB') data = self.transform(data) label = self.label_list[index] @@ -98,9 +101,13 @@ def get_train_transforms(config): policy = auto_augment_policy_original() auto_augment = AutoAugment(policy) aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) else: - jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 - aug_op_list.append(transforms.ColorJitter(jitter)) + jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) # STEP3: other ops aug_op_list.append(transforms.ToTensor()) aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, diff --git a/image_classification/ViP/main_multi_gpu.py b/image_classification/ViP/main_multi_gpu.py index 5aca9f2e..30384fc0 100644 --- a/image_classification/ViP/main_multi_gpu.py +++ b/image_classification/ViP/main_multi_gpu.py @@ -132,7 +132,7 @@ def train(dataloader, if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) - loss = criterion(image, output, label) + loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): @@ -376,15 +376,6 @@ def main_worker(*args): config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr config.TRAIN.END_LR = linear_scaled_end_lr - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr - scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -457,9 +448,9 @@ def main_worker(*args): f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) diff --git a/image_classification/ViP/main_single_gpu.py b/image_classification/ViP/main_single_gpu.py index 843ae6d8..dc88e244 100644 --- a/image_classification/ViP/main_single_gpu.py +++ b/image_classification/ViP/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +18,7 @@ import os import time import logging +import copy import argparse import random import numpy as np @@ -286,16 +286,6 @@ def main(): config.TRAIN.BASE_LR = linear_scaled_lr config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr config.TRAIN.END_LR = linear_scaled_end_lr - - if config.TRAIN.ACCUM_ITER > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER - linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER - linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER - - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr - config.TRAIN.END_LR = linear_scaled_end_lr - scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -361,11 +351,11 @@ def main(): logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) - opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + opt_state = paddle.load(config.MODEL.RESUME + '.pdopt') optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") diff --git a/image_classification/ViT/config.py b/image_classification/ViT/config.py index a5da5aad..67eba161 100644 --- a/image_classification/ViT/config.py +++ b/image_classification/ViT/config.py @@ -13,12 +13,10 @@ # limitations under the License. """Configuration - Configuration for data, model archtecture, and training, etc. 
Config can be set by .yaml file or by argparser(limited usage) - - """ + import os from yacs.config import CfgNode as CN import yaml @@ -43,12 +41,12 @@ _C.MODEL = CN() _C.MODEL.TYPE = 'ViT' _C.MODEL.NAME = 'ViT' -_C.MODEL.RESUME = None -_C.MODEL.PRETRAINED = None -_C.MODEL.NUM_CLASSES = 1000 -_C.MODEL.DROPOUT = 0.1 -_C.MODEL.DROPPATH = 0.1 -_C.MODEL.ATTENTION_DROPOUT = 0.1 +_C.MODEL.RESUME = None # model path for resume training +_C.MODEL.PRETRAINED = None # model path for loading pretrained weights +_C.MODEL.NUM_CLASSES = 1000 # num of classes +_C.MODEL.DROPOUT = 0.1 # dropout rate +_C.MODEL.DROPPATH = 0.1 # drop path rate +_C.MODEL.ATTENTION_DROPOUT = 0.1 # dropout rate for attention # transformer settings _C.MODEL.TRANS = CN() @@ -62,15 +60,15 @@ # training settings _C.TRAIN = CN() -_C.TRAIN.LAST_EPOCH = 0 -_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.LAST_EPOCH = 0 # set this for resuming training +_C.TRAIN.NUM_EPOCHS = 300 # total num of epochs _C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size _C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune -_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.BASE_LR = 0.003 #0.003 for pretrain # 0.03 for finetune _C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 -_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.END_LR = 5e-4 # ending lr _C.TRAIN.GRAD_CLIP = 1.0 -_C.TRAIN.ACCUM_ITER = 2 #1 +_C.TRAIN.ACCUM_ITER = 1 _C.TRAIN.LR_SCHEDULER = CN() _C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -90,11 +88,11 @@ _C.SAVE_FREQ = 10 # freq to save chpt _C.REPORT_FREQ = 100 # freq to logging info _C.VALIDATE_FREQ = 100 # freq to do validation -_C.SEED = 0 +_C.SEED = 0 # random seed for paddle, numpy and python _C.EVAL = False # run evaluation only _C.AMP = False # mix precision training _C.LOCAL_RANK = 0 -_C.NGPUS = -1 +_C.NGPUS = -1 # usually set to -1, use CUDA_VISIBLE_DEVICES for GPU selections def _update_config_from_file(config, cfg_file): @@ -147,7 +145,6 @@ def update_config(config, args): config.AMP = False else: config.AMP = True - #config.freeze() return config diff --git a/image_classification/ViT/configs/vit_base_patch16_224.yaml b/image_classification/ViT/configs/vit_base_patch16_224.yaml index eff0fc29..82408aec 100644 --- a/image_classification/ViT/configs/vit_base_patch16_224.yaml +++ b/image_classification/ViT/configs/vit_base_patch16_224.yaml @@ -18,4 +18,4 @@ TRAIN: BASE_LR: 0.003 WARMUP_START_LR: 1e-6 END_LR: 5e-4 - ACCUM_ITER: 2 + ACCUM_ITER: 1 diff --git a/image_classification/ViT/configs/vit_base_patch16_384.yaml b/image_classification/ViT/configs/vit_base_patch16_384.yaml index 04cdfaee..cd449950 100644 --- a/image_classification/ViT/configs/vit_base_patch16_384.yaml +++ b/image_classification/ViT/configs/vit_base_patch16_384.yaml @@ -11,4 +11,3 @@ MODEL: DEPTH: 12 NUM_HEADS: 12 QKV_BIAS: true - diff --git a/image_classification/ViT/main_multi_gpu.py b/image_classification/ViT/main_multi_gpu.py index 1c12c029..5377e0fc 100644 --- a/image_classification/ViT/main_multi_gpu.py +++ b/image_classification/ViT/main_multi_gpu.py @@ -36,7 +36,7 @@ def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('Swin') + parser = argparse.ArgumentParser('ViT') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) diff --git a/image_classification/ViT/main_single_gpu.py b/image_classification/ViT/main_single_gpu.py index cf444df0..692e703f 100644 --- 
a/image_classification/ViT/main_single_gpu.py +++ b/image_classification/ViT/main_single_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ def get_arguments(): """return argumeents, this will overwrite the config after loading yaml file""" - parser = argparse.ArgumentParser('Swin') + parser = argparse.ArgumentParser('ViT') parser.add_argument('-cfg', type=str, default=None) parser.add_argument('-dataset', type=str, default=None) parser.add_argument('-batch_size', type=int, default=None) diff --git a/image_classification/XCiT/augment.py b/image_classification/XCiT/augment.py index 19276756..7a7f081c 100644 --- a/image_classification/XCiT/augment.py +++ b/image_classification/XCiT/augment.py @@ -58,7 +58,7 @@ def auto_augment_policy_original(): return policy -def rand_augment_policy_original(magnitude_idx): +def rand_augment_policy_original(magnitude_idx=9): """ 14 types of augment policies in original paper Args: @@ -112,7 +112,7 @@ class RandAugment(): transformed_image = augment(image) """ - def __init__(self, policy, num_layers): + def __init__(self, policy, num_layers=2): """ Args: policy: list of SubPolicy diff --git a/image_classification/XCiT/config.py b/image_classification/XCiT/config.py index 91d13dc7..dddb4f1a 100644 --- a/image_classification/XCiT/config.py +++ b/image_classification/XCiT/config.py @@ -62,7 +62,7 @@ # training settings _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 -_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.NUM_EPOCHS = 400 _C.TRAIN.WARMUP_EPOCHS = 20 _C.TRAIN.WEIGHT_DECAY = 0.05 _C.TRAIN.BASE_LR = 0.001 @@ -93,27 +93,15 @@ _C.TRAIN.MIXUP_MODE = 'batch' _C.TRAIN.SMOOTHING = 0.1 -_C.TRAIN.COLOR_JITTER = 0.4 -_C.TRAIN.AUTO_AUGMENT = True #'rand-m9-mstd0.5-inc1' +_C.TRAIN.COLOR_JITTER = 0.4 # color jitter factor +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = True -_C.TRAIN.RANDOM_ERASE_PROB = 0.25 -_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' -_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 # random erase prob +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' # random erase mode +_C.TRAIN.RANDOM_ERASE_COUNT = 1 # random erase count _C.TRAIN.RANDOM_ERASE_SPLIT = False -# augmentation -_C.AUG = CN() -_C.AUG.COLOR_JITTER = 0.4 # color jitter factor -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -_C.AUG.RE_PROB = 0.25 # random earse prob -_C.AUG.RE_MODE = 'pixel' # random earse mode -_C.AUG.RE_COUNT = 1 # random earse count -_C.AUG.MIXUP = 0.8 # mixup alpha, enabled if >0 -_C.AUG.CUTMIX = 1.0 # cutmix alpha, enabled if >0 -_C.AUG.CUTMIX_MINMAX = None # cutmix min/max ratio, overrides alpha -_C.AUG.MIXUP_PROB = 1.0 # prob of mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 # prob of switching cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_MODE = 'batch' #how to apply mixup/curmix params, per 'batch', 'pair', or 'elem' # misc _C.SAVE = "./output" diff --git a/image_classification/XCiT/datasets.py b/image_classification/XCiT/datasets.py index 6eb1bd62..052de4ef 100644 --- a/image_classification/XCiT/datasets.py +++ b/image_classification/XCiT/datasets.py @@ -19,8 +19,19 @@ import os import math -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from PIL import Image +from paddle.io import Dataset +from 
paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from augment import auto_augment_policy_original +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from random_erasing import RandomErasing + class ImageNet2012Dataset(Dataset): """Build ImageNet2012 dataset @@ -60,7 +71,7 @@ def __len__(self): return len(self.label_list) def __getitem__(self, index): - data = image_load(self.img_path_list[index]).convert('RGB') + data = Image.open(self.img_path_list[index]).convert('RGB') data = self.transform(data) label = self.label_list[index] @@ -79,13 +90,36 @@ def get_train_transforms(config): Returns: transforms_train: training transforms """ - - transforms_train = transforms.Compose([ + aug_op_list = [] + # random crop and resize + aug_op_list.append( transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) + scale=(0.05, 1.0))) + # auto_augment / color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER),) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, + std=config.DATA.IMAGENET_STD)) + # random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + + transforms_train = transforms.Compose(aug_op_list) return transforms_train @@ -129,11 +163,13 @@ def get_dataset(config, mode='train'): if mode == 'train': dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) else: + mode = 'test' dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) elif config.DATA.DATASET == "cifar100": if mode == 'train': dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) else: + mode = 'test' dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) elif config.DATA.DATASET == "imagenet2012": if mode == 'train': diff --git a/image_classification/XCiT/main_multi_gpu.py b/image_classification/XCiT/main_multi_gpu.py index 3faa6b99..b817e9cf 100644 --- a/image_classification/XCiT/main_multi_gpu.py +++ b/image_classification/XCiT/main_multi_gpu.py @@ -35,7 +35,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from xcit import build_xcit as build_model @@ -344,7 +343,8 @@ def main_worker(*args): prob=config.TRAIN.MIXUP_PROB, switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, mode=config.TRAIN.MIXUP_MODE, - label_smoothing=config.TRAIN.SMOOTHING) + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) # STEP 4: Define criterion if config.TRAIN.MIXUP_PROB > 0.: @@ -550,11 +550,8 @@ def 
main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - local_logger.info(f"----- Save model: {model_path}.pdparams") - local_logger.info(f"----- Save optim: {model_path}.pdopt") - if local_rank == 0: - master_logger.info(f"----- Save model: {model_path}.pdparams") - master_logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") def main(): diff --git a/image_classification/XCiT/main_single_gpu.py b/image_classification/XCiT/main_single_gpu.py index 58b049b2..00aa8821 100644 --- a/image_classification/XCiT/main_single_gpu.py +++ b/image_classification/XCiT/main_single_gpu.py @@ -1,4 +1,3 @@ - # Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -35,7 +34,6 @@ from mixup import Mixup from losses import LabelSmoothingCrossEntropyLoss from losses import SoftTargetCrossEntropyLoss -from losses import DistillationLoss from xcit import build_xcit as build_model @@ -255,7 +253,8 @@ def main(): prob=config.TRAIN.MIXUP_PROB, switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, mode=config.TRAIN.MIXUP_MODE, - label_smoothing=config.TRAIN.SMOOTHING) + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) # STEP 4: Define criterion if config.TRAIN.MIXUP_PROB > 0.: diff --git a/image_classification/XCiT/run_train.sh b/image_classification/XCiT/run_train.sh index ae378344..b6badd06 100644 --- a/image_classification/XCiT/run_train.sh +++ b/image_classification/XCiT/run_train.sh @@ -1,5 +1,5 @@ -CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python main_multi_gpu.py \ +CUDA_VISIBLE_DEVICES=0 \ +python main_single_gpu.py \ -cfg='./configs/xcit_nano_12_p8_224.yaml' \ -dataset='imagenet2012' \ -batch_size=8 \ diff --git a/image_classification/XCiT/run_train_multi.sh b/image_classification/XCiT/run_train_multi.sh new file mode 100644 index 00000000..ae378344 --- /dev/null +++ b/image_classification/XCiT/run_train_multi.sh @@ -0,0 +1,6 @@ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +python main_multi_gpu.py \ +-cfg='./configs/xcit_nano_12_p8_224.yaml' \ +-dataset='imagenet2012' \ +-batch_size=8 \ +-data_path='/dataset/imagenet' \ diff --git a/image_classification/gMLP/augment.py b/image_classification/gMLP/augment.py new file mode 100644 index 00000000..7a7f081c --- /dev/null +++ b/image_classification/gMLP/augment.py @@ -0,0 +1,285 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
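Note on the RandAugment changes repeated across the augment.py files in this patch (T2T_ViT, ViP, XCiT, and the new gMLP copy that begins here): rand_augment_policy_original now defaults to magnitude_idx=9 and RandAugment to num_layers=2, i.e. the common RandAugment N=2, M=9 setting referenced by the 'rand-m9-mstd0.5-inc1' comments in the configs. A minimal usage sketch; the image path is a placeholder, not part of the patch:

    from PIL import Image
    from augment import rand_augment_policy_original, RandAugment

    # With the new defaults the policy can be built without arguments;
    # both magnitude_idx and num_layers remain overridable.
    policy = rand_augment_policy_original()   # 14 single-op policies at magnitude index 9
    augment = RandAugment(policy)             # applies 2 randomly chosen ops per image
    image = Image.open('example.jpg').convert('RGB')   # placeholder input path
    transformed = augment(image)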
+ +"""Augmentation""" +""" Rand Augmentation """ +# reference: RandAugment: Practical automated data augmentation with a reduced search space +# https://arxiv.org/abs/1909.13719 + +""" Auto Augmentation """ +# reference: AutoAugment: Learning Augmentation Policies from Data +# https://arxiv.org/abs/1805.09501 + +import random +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def auto_augment_policy_original(): + """25 types of augment policies in original paper""" + policy = [ + [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + policy = [[SubPolicy(*args) for args in subpolicy] for subpolicy in policy] + return policy + + +def rand_augment_policy_original(magnitude_idx=9): + """ + 14 types of augment policies in original paper + Args: + magnitude_idx: M + """ + policy = [ + ('Posterize', 1, magnitude_idx), ('Rotate', 1, magnitude_idx), + ('Solarize', 1, magnitude_idx), ('AutoContrast', 1, magnitude_idx), + ('Equalize', 1, magnitude_idx), ('Contrast', 1, magnitude_idx), + ('Color', 1, magnitude_idx), ('Invert', 1, magnitude_idx), + ('Sharpness', 1, magnitude_idx), ('Brightness', 1, magnitude_idx), + ('ShearX', 1, magnitude_idx), ('ShearY', 1, magnitude_idx), + ('TranslateX', 1, magnitude_idx), ('TranslateY', 1, magnitude_idx), + ] + policy = [SubPolicy(*args) for args in policy] + return policy + + +class AutoAugment(): + """Auto Augment + Randomly choose a tuple of augment ops from a list of policy + Then apply the tuple of augment ops to input image + + Examples: + policy = auto_augment_policy_original() + augment = AutoAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy): + self.policy = policy + + def __call__(self, image, policy_idx=None): + if policy_idx is None: + policy_idx = random.randint(0, len(self.policy) - 1) + + sub_policy = self.policy[policy_idx] + for op in sub_policy: + image = op(image) + return image + + +class RandAugment(): + """Rand Augment + Randomly choose N augment ops from a list of K policies + Then apply the N ops to input image + + Examples: + policy = rand_augment_policy_original(magnitude_idx) + augment = RandAugment(policy) + transformed_image = augment(image) + """ + + def __init__(self, policy, num_layers=2): + """ + Args: + policy: list of SubPolicy + num_layers: int + """ + self.policy = policy + self.num_layers = num_layers + + def __call__(self, image): + selected_idx = 
np.random.choice(len(self.policy), self.num_layers) + + for policy_idx in selected_idx: + sub_policy = self.policy[policy_idx] + image = sub_policy(image) + return image + + +class SubPolicy: + """Subpolicy + Read augment name and magnitude, apply augment with probability + Args: + op_name: str, augment operation name + prob: float, if prob > random prob, apply augment + magnitude_idx: int, index of magnitude in preset magnitude ranges + """ + + def __init__(self, op_name, prob, magnitude_idx): + # ranges of operations' magnitude + ranges = { + 'ShearX': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'ShearY': np.linspace(0, 0.3, 10), # [-0.3, 0.3] (by random negative) + 'TranslateX': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'TranslateY': np.linspace(0, 150 / 331, 10), # [-0.45, 0.45] (by random negative) + 'Rotate': np.linspace(0, 30, 10), # [-30, 30] (by random negative) + 'Color': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Posterize': np.round(np.linspace(8, 4, 10), 0).astype(np.int), # [0, 4] + 'Solarize': np.linspace(256, 0, 10), # [0, 256] + 'Contrast': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Sharpness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'Brightness': np.linspace(0, 0.9, 10), # [-0.9, 0.9] (by random negative) + 'AutoContrast': [0] * 10, # no range + 'Equalize': [0] * 10, # no range + 'Invert': [0] * 10, # no range + } + + # augmentation operations + # Lambda is not pickleable for DDP + # image_ops = { + # 'ShearX': lambda image, magnitude: shear_x(image, magnitude), + # 'ShearY': lambda image, magnitude: shear_y(image, magnitude), + # 'TranslateX': lambda image, magnitude: translate_x(image, magnitude), + # 'TranslateY': lambda image, magnitude: translate_y(image, magnitude), + # 'Rotate': lambda image, magnitude: rotate(image, magnitude), + # 'AutoContrast': lambda image, magnitude: auto_contrast(image, magnitude), + # 'Invert': lambda image, magnitude: invert(image, magnitude), + # 'Equalize': lambda image, magnitude: equalize(image, magnitude), + # 'Solarize': lambda image, magnitude: solarize(image, magnitude), + # 'Posterize': lambda image, magnitude: posterize(image, magnitude), + # 'Contrast': lambda image, magnitude: contrast(image, magnitude), + # 'Color': lambda image, magnitude: color(image, magnitude), + # 'Brightness': lambda image, magnitude: brightness(image, magnitude), + # 'Sharpness': lambda image, magnitude: sharpness(image, magnitude), + # } + image_ops = { + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_relative, + 'TranslateY': translate_y_relative, + 'Rotate': rotate, + 'AutoContrast': auto_contrast, + 'Invert': invert, + 'Equalize': equalize, + 'Solarize': solarize, + 'Posterize': posterize, + 'Contrast': contrast, + 'Color': color, + 'Brightness': brightness, + 'Sharpness': sharpness, + } + + self.prob = prob + self.magnitude = ranges[op_name][magnitude_idx] + self.op = image_ops[op_name] + + def __call__(self, image): + if self.prob > random.random(): + image = self.op(image, self.magnitude) + return image + + +# PIL Image transforms +# https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.transform +def shear_x(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), fillcolor=fillcolor) + + +def shear_y(image, magnitude, fillcolor=(128, 128, 128)): + factor = magnitude * 
random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), fillcolor=fillcolor) + + +def translate_x_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_relative(image, magnitude, fillcolor=(128, 128, 128)): + pixels = magnitude * image.size[0] + pixels = pixels * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), fillcolor=fillcolor) + + +def translate_x_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, magnitude, 0, 1, 0), fillcolor=fillcolor) + + +def translate_y_absolute(image, magnitude, fillcolor=(128, 128, 128)): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude), fillcolor=fillcolor) + + +def rotate(image, magnitude): + rot = image.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new('RGBA', rot.size, (128,) * 4), + rot).convert(image.mode) + + +def auto_contrast(image, magnitude=None): + return ImageOps.autocontrast(image) + + +def invert(image, magnitude=None): + return ImageOps.invert(image) + + +def equalize(image, magnitude=None): + return ImageOps.equalize(image) + + +def solarize(image, magnitude): + return ImageOps.solarize(image, magnitude) + + +def posterize(image, magnitude): + return ImageOps.posterize(image, magnitude) + + +def contrast(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Contrast(image).enhance(1 + magnitude) + + +def color(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Color(image).enhance(1 + magnitude) + + +def brightness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Brightness(image).enhance(1 + magnitude) + + +def sharpness(image, magnitude): + magnitude = magnitude * random.choice([-1, 1]) # random negative + return ImageEnhance.Sharpness(image).enhance(1 + magnitude) + diff --git a/image_classification/gMLP/config.py b/image_classification/gMLP/config.py index 27f7f23a..4c6e755d 100644 --- a/image_classification/gMLP/config.py +++ b/image_classification/gMLP/config.py @@ -45,8 +45,9 @@ _C.MODEL.RESUME = None _C.MODEL.PRETRAINED = None _C.MODEL.NUM_CLASSES = 1000 -_C.MODEL.DROPOUT = 0.1 -_C.MODEL.DROPPATH = 0.1 +_C.MODEL.DROPOUT = 0.0 +_C.MODEL.ATTENTION_DROPOUT = 0.0 +_C.MODEL.DROP_PATH = 0.1 # transformer settings _C.MODEL.MIXER = CN() @@ -59,13 +60,14 @@ _C.TRAIN = CN() _C.TRAIN.LAST_EPOCH = 0 _C.TRAIN.NUM_EPOCHS = 300 -_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size -_C.TRAIN.WEIGHT_DECAY = 0.01 #0.3 # 0.0 for finetune -_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune -_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 -_C.TRAIN.END_LR = 1e-5 -_C.TRAIN.GRAD_CLIP = 1.0 -_C.TRAIN.ACCUM_ITER = 2 #1 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 0.001 +_C.TRAIN.WARMUP_START_LR = 5e-7 +_C.TRAIN.END_LR = 5e-6 +_C.TRAIN.GRAD_CLIP = 5.0 +_C.TRAIN.ACCUM_ITER = 1 +_C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() 
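The new _C.TRAIN.LINEAR_SCALED_LR entry (added here for gMLP, and earlier for T2T_ViT and used by the Swin main scripts) replaces the hard-coded /512.0 divisor: when it is None the configured learning rates are used as-is, otherwise BASE_LR, WARMUP_START_LR and END_LR are rescaled by the batch size and, if enabled, by ACCUM_ITER. A minimal sketch of the single-GPU rule from main_single_gpu.py; the helper name is illustrative, and the multi-GPU scripts additionally account for world size:

    def linear_scale_lr(base_lr, batch_size, linear_scaled_lr, accum_iter=1):
        """Rescale a base LR by the effective batch size (illustrative helper)."""
        if linear_scaled_lr is None:        # new default: keep the configured LR unchanged
            return base_lr
        scaled = base_lr * batch_size / linear_scaled_lr
        if accum_iter > 1:                  # gradient accumulation enlarges the effective batch
            scaled = scaled * accum_iter
        return scaled

    # e.g. BASE_LR=1e-3, batch_size=128, LINEAR_SCALED_LR=512, ACCUM_ITER=2 -> 5.0e-4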
_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' @@ -79,6 +81,24 @@ _C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for adamW _C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 +# train augmentation +_C.TRAIN.MIXUP_ALPHA = 0.8 +_C.TRAIN.CUTMIX_ALPHA = 1.0 +_C.TRAIN.CUTMIX_MINMAX = None +_C.TRAIN.MIXUP_PROB = 1.0 +_C.TRAIN.MIXUP_SWITCH_PROB = 0.5 +_C.TRAIN.MIXUP_MODE = 'batch' + +_C.TRAIN.SMOOTHING = 0.1 +_C.TRAIN.COLOR_JITTER = 0.4 +_C.TRAIN.AUTO_AUGMENT = False #'rand-m9-mstd0.5-inc1' +_C.TRAIN.RAND_AUGMENT = False + +_C.TRAIN.RANDOM_ERASE_PROB = 0.25 +_C.TRAIN.RANDOM_ERASE_MODE = 'pixel' +_C.TRAIN.RANDOM_ERASE_COUNT = 1 +_C.TRAIN.RANDOM_ERASE_SPLIT = False + # misc _C.SAVE = "./output" _C.TAG = "default" @@ -121,6 +141,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/gMLP/datasets.py b/image_classification/gMLP/datasets.py index 761dd61a..304df9a3 100644 --- a/image_classification/gMLP/datasets.py +++ b/image_classification/gMLP/datasets.py @@ -19,8 +19,19 @@ import os import math -from paddle.io import Dataset, DataLoader, DistributedBatchSampler -from paddle.vision import transforms, datasets, image_load +from PIL import Image +from paddle.io import Dataset +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from paddle.vision import transforms +from paddle.vision import datasets +from paddle.vision import image_load +from augment import auto_augment_policy_original +from augment import AutoAugment +from augment import rand_augment_policy_original +from augment import RandAugment +from transforms import RandomHorizontalFlip +from random_erasing import RandomErasing class ImageNet2012Dataset(Dataset): @@ -81,12 +92,36 @@ def get_train_transforms(config): transforms_train: training transforms """ - transforms_train = transforms.Compose([ + aug_op_list = [] + # STEP1: random crop and resize + aug_op_list.append( transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), - scale=(0.05, 1.0)), - transforms.ToTensor(), - transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), - ]) + scale=(0.05, 1.0), interpolation='bicubic')) + # STEP2: auto_augment or color jitter + if config.TRAIN.AUTO_AUGMENT: + policy = auto_augment_policy_original() + auto_augment = AutoAugment(policy) + aug_op_list.append(auto_augment) + elif config.TRAIN.RAND_AUGMENT: + policy = rand_augment_policy_original() + rand_augment = RandAugment(policy) + aug_op_list.append(rand_augment) + else: + jitter = (float(config.TRAIN.COLOR_JITTER), ) * 3 + aug_op_list.append(transforms.ColorJitter(*jitter)) + # STEP3: other ops + aug_op_list.append(transforms.ToTensor()) + aug_op_list.append(transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, + std=config.DATA.IMAGENET_STD)) + # STEP4: random erasing + if config.TRAIN.RANDOM_ERASE_PROB > 0.: + random_erasing = RandomErasing(prob=config.TRAIN.RANDOM_ERASE_PROB, + mode=config.TRAIN.RANDOM_ERASE_MODE, + max_count=config.TRAIN.RANDOM_ERASE_COUNT, + num_splits=config.TRAIN.RANDOM_ERASE_SPLIT) + aug_op_list.append(random_erasing) + # Final: compose transforms and return + transforms_train = transforms.Compose(aug_op_list) return transforms_train @@ -106,7 +141,7 @@ def get_val_transforms(config): scale_size = int(math.floor(config.DATA.IMAGE_SIZE / 
config.DATA.CROP_PCT)) transforms_val = transforms.Compose([ - transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.Resize(scale_size, interpolation='bicubic'), transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), transforms.ToTensor(), transforms.Normalize(mean=config.DATA.IMAGENET_MEAN, std=config.DATA.IMAGENET_STD), @@ -124,6 +159,7 @@ def get_dataset(config, mode='train'): Returns: dataset: dataset object """ + assert mode in ['train', 'val'] if config.DATA.DATASET == "cifar10": if mode == 'train': diff --git a/image_classification/gMLP/gmlp.py b/image_classification/gMLP/gmlp.py index bbbcaaed..25d8c5d8 100644 --- a/image_classification/gMLP/gmlp.py +++ b/image_classification/gMLP/gmlp.py @@ -200,5 +200,5 @@ def build_gated_mlp(config): embed_dim=config.MODEL.MIXER.HIDDEN_SIZE, mlp_ratio=config.MODEL.MIXER.MLP_RATIO, dropout=config.MODEL.DROPOUT, - droppath=config.MODEL.DROPPATH) + droppath=config.MODEL.DROP_PATH) return model diff --git a/image_classification/gMLP/losses.py b/image_classification/gMLP/losses.py new file mode 100644 index 00000000..082467a3 --- /dev/null +++ b/image_classification/gMLP/losses.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
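The get_train_transforms rewrites in this patch (T2T_ViT, ViP, XCiT, gMLP) also change transforms.ColorJitter(jitter) to transforms.ColorJitter(*jitter). A short sketch of why the unpacking matters; the 0.4 value mirrors the COLOR_JITTER config default:

    from paddle.vision import transforms

    # COLOR_JITTER = 0.4 expands to (0.4, 0.4, 0.4); unpacking maps the three
    # values to brightness, contrast and saturation instead of passing the
    # whole tuple as the first argument.
    jitter = (0.4,) * 3
    color_jitter = transforms.ColorJitter(*jitter)   # ColorJitter(0.4, 0.4, 0.4)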
+ +""" Implement Loss functions """ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class LabelSmoothingCrossEntropyLoss(nn.Layer): + """ cross entropy loss for label smoothing + Args: + smoothing: float, smoothing rate + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, target label with shape [N] + Return: + loss: float, cross entropy loss value + """ + def __init__(self, smoothing=0.1): + super().__init__() + assert 0 <= smoothing < 1.0 + self.smoothing = smoothing + self.confidence = 1 - smoothing + + def forward(self, x, target): + log_probs = F.log_softmax(x) # [N, num_classes] + # target_index is used to get prob for each of the N samples + target_index = paddle.zeros([x.shape[0], 2], dtype='int64') # [N, 2] + target_index[:, 0] = paddle.arange(x.shape[0]) + target_index[:, 1] = target + + nll_loss = -log_probs.gather_nd(index=target_index) # index: [N] + smooth_loss = -log_probs.mean(axis=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + + +class SoftTargetCrossEntropyLoss(nn.Layer): + """ cross entropy loss for soft target + Args: + x: tensor, predictions (before softmax) with shape [N, num_classes] + target: tensor, soft target with shape [N, num_classes] + Returns: + loss: float, the mean loss value + """ + def __init__(self): + super().__init__() + + def forward(self, x, target): + loss = paddle.sum(-target * F.log_softmax(x, axis=-1), axis=-1) + return loss.mean() + + +class DistillationLoss(nn.Layer): + """Distillation loss function + This layer includes the orginal loss (criterion) and a extra + distillation loss (criterion), which computes the loss with + different type options, between current model and + a teacher model as its supervision. 
+ + Args: + base_criterion: nn.Layer, the original criterion + teacher_model: nn.Layer, the teacher model as supervision + distillation_type: str, one of ['none', 'soft', 'hard'] + alpha: float, ratio of base loss (* (1-alpha)) + and distillation loss( * alpha) + tao: float, temperature in distillation + """ + def __init__(self, + base_criterion, + teacher_model, + distillation_type, + alpha, + tau): + super().__init__() + assert distillation_type in ['none', 'soft', 'hard'] + self.base_criterion = base_criterion + self.teacher_model = teacher_model + self.type = distillation_type + self.alpha = alpha + self.tau = tau + + def forward(self, inputs, outputs, targets): + """ + Args: + inputs: tensor, the orginal model inputs + outputs: tensor, the outputs of the model + outputds_kd: tensor, the distillation outputs of the model, + this is usually obtained by a separate branch + in the last layer of the model + targets: tensor, the labels for the base criterion + """ + outputs, outputs_kd = outputs[0], outputs[1] + base_loss = self.base_criterion(outputs, targets) + if self.type == 'none': + return base_loss + + with paddle.no_grad(): + teacher_outputs = self.teacher_model(inputs) + + if self.type == 'soft': + distillation_loss = F.kl_div( + F.log_softmax(outputs_kd / self.tau, axis=1), + F.log_softmax(teacher_outputs / self.tau, axis=1), + reduction='sum') * (self.tau * self.tau) / outputs_kd.numel() + elif self.type == 'hard': + distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(axis=1)) + + loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha + return loss + + diff --git a/image_classification/gMLP/main_multi_gpu.py b/image_classification/gMLP/main_multi_gpu.py index 3003557e..436ce98b 100644 --- a/image_classification/gMLP/main_multi_gpu.py +++ b/image_classification/gMLP/main_multi_gpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
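The losses.py added above supplies LabelSmoothingCrossEntropyLoss and SoftTargetCrossEntropyLoss, and Mixup is now constructed with num_classes=config.MODEL.NUM_CLASSES so it can build soft targets of the right width. The main scripts then choose the training criterion from the augmentation settings; a minimal sketch, where the label-smoothing branch is inferred from the class added above rather than shown verbatim in this hunk:

    import paddle.nn as nn
    from losses import LabelSmoothingCrossEntropyLoss, SoftTargetCrossEntropyLoss

    def build_criterion(mixup_prob, smoothing):
        """Pick the training loss the way the updated main scripts do (sketch)."""
        if mixup_prob > 0.:
            # Mixup/CutMix produce soft [N, num_classes] targets
            return SoftTargetCrossEntropyLoss()
        if smoothing:
            # label-smoothing branch inferred from the class added above
            return LabelSmoothingCrossEntropyLoss(smoothing)
        return nn.CrossEntropyLoss()

    # validation keeps plain cross entropy on integer labels:
    # criterion_val = nn.CrossEntropyLoss()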
@@ -27,53 +27,53 @@ import paddle.distributed as dist from datasets import get_dataloader from datasets import get_dataset -from gmlp import build_gated_mlp as build_model from utils import AverageMeter from utils import WarmupCosineScheduler from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from gmlp import build_gated_mlp as build_model -parser = argparse.ArgumentParser('gMLP') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -arguments = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, arguments) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('gMLP') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) 
+ fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -81,20 +81,28 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + mixup_fn=None, + amp=False, + local_logger=None, + master_logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch debug_steps: int, num of iters to log info, default: 100 accum_iter: int, num of iters for accumulating gradients, default: 1 + mixup_fn: Mixup, mixup instance, default: None amp: bool, if True, use mix precision training, default: False + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: train_loss_meter.avg train_acc_meter.avg @@ -103,31 +111,36 @@ def train(dataloader, model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + master_train_loss_meter = AverageMeter() + master_train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - + time_st = time.time() for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - else: + else: # full precision training output = model(image) loss = criterion(output, label) #NOTE: division may be needed depending on the loss function # Here no division is needed: # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' - # #loss = loss / accum_iter loss.backward() @@ -136,41 +149,82 @@ def train(dataloader, optimizer.clear_grad() pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) - batch_size = image.shape[0] - train_loss_meter.update(loss.numpy()[0], batch_size) - train_acc_meter.update(acc.numpy()[0], batch_size) + batch_size = paddle.to_tensor(image.shape[0]) - if batch_id % debug_steps == 0: - logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {train_loss_meter.avg:.4f}, " + - f"Avg Acc: {train_acc_meter.avg:.4f}") + # sync from other gpus for overall loss and acc + master_loss = loss.clone() + master_acc = acc.clone() + master_batch_size = batch_size.clone() + dist.all_reduce(master_loss) + dist.all_reduce(master_acc) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc = master_acc / dist.get_world_size() + master_train_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_train_acc_meter.update(master_acc.numpy()[0], master_batch_size.numpy()[0]) - train_time = time.time() - time_st - return train_loss_meter.avg, train_acc_meter.avg, train_time + train_loss_meter.update(loss.numpy()[0], 
batch_size.numpy()[0]) + train_acc_meter.update(acc.numpy()[0], batch_size.numpy()[0]) + if batch_id % debug_steps == 0: + if local_logger: + local_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_train_loss_meter.avg:.4f}, " + + f"Avg Acc: {master_train_acc_meter.avg:.4f}") -def validate(dataloader, model, criterion, total_batch, debug_steps=100): + train_time = time.time() - time_st + return (train_loss_meter.avg, + train_acc_meter.avg, + master_train_loss_meter.avg, + master_train_acc_meter.avg, + train_time) + + +def validate(dataloader, + model, + criterion, + total_batch, + debug_steps=100, + local_logger=None, + master_logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + debug_steps: int, num of iters to log info, default: 100 + local_logger: logger for local process/gpu, default: None + master_logger: logger for main process, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + master_val_loss_meter.avg: float, average loss on all processes/gpus + master_val_acc1_meter.avg: float, average top1 accuracy on all processes/gpus + master_val_acc5_meter.avg: float, average top5 accuracy on all processes/gpus + val_time: float, validation time """ model.eval() val_loss_meter = AverageMeter() val_acc1_meter = AverageMeter() val_acc5_meter = AverageMeter() + master_val_loss_meter = AverageMeter() + master_val_acc1_meter = AverageMeter() + master_val_acc5_meter = AverageMeter() time_st = time.time() with paddle.no_grad(): @@ -185,63 +239,140 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) - dist.all_reduce(loss) - dist.all_reduce(acc1) - dist.all_reduce(acc5) - loss = loss / dist.get_world_size() - acc1 = acc1 / dist.get_world_size() - acc5 = acc5 / dist.get_world_size() - batch_size = paddle.to_tensor(image.shape[0]) - dist.all_reduce(batch_size) + + master_loss = loss.clone() + master_acc1 = acc1.clone() + master_acc5 = acc5.clone() + master_batch_size = batch_size.clone() + + dist.all_reduce(master_loss) + dist.all_reduce(master_acc1) + dist.all_reduce(master_acc5) + dist.all_reduce(master_batch_size) + master_loss = master_loss / dist.get_world_size() + master_acc1 = master_acc1 / dist.get_world_size() + master_acc5 = master_acc5 / dist.get_world_size() + + master_val_loss_meter.update(master_loss.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc1_meter.update(master_acc1.numpy()[0], master_batch_size.numpy()[0]) + master_val_acc5_meter.update(master_acc5.numpy()[0], master_batch_size.numpy()[0]) val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) val_acc5_meter.update(acc5.numpy()[0], 
batch_size.numpy()[0]) if batch_id % debug_steps == 0: - logger.info( - f"Val Step[{batch_id:04d}/{total_batch:04d}], " + - f"Avg Loss: {val_loss_meter.avg:.4f}, " + - f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + - f"Avg Acc@5: {val_acc5_meter.avg:.4f}") - + if local_logger: + local_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + if master_logger and dist.get_rank() == 0: + master_logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {master_val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {master_val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {master_val_acc5_meter.avg:.4f}") val_time = time.time() - time_st - return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + return (val_loss_meter.avg, + val_acc1_meter.avg, + val_acc5_meter.avg, + master_val_loss_meter.avg, + master_val_acc1_meter.avg, + master_val_acc5_meter.avg, + val_time) def main_worker(*args): - # 0. Preparation + # STEP 0: Preparation + config = args[0] dist.init_parallel_env() last_epoch = config.TRAIN.LAST_EPOCH - world_size = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + world_size = dist.get_world_size() + local_rank = dist.get_rank() seed = config.SEED + local_rank paddle.seed(seed) np.random.seed(seed) random.seed(seed) - # 1. Create model + # logger for each process/gpu + local_logger = get_logger( + filename=os.path.join(config.SAVE, 'log_{}.txt'.format(local_rank)), + logger_name='local_logger') + # overall logger + if local_rank == 0: + master_logger = get_logger( + filename=os.path.join(config.SAVE, 'log.txt'), + logger_name='master_logger') + master_logger.info(f'\n{config}') + else: + master_logger = None + local_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + if local_rank == 0: + master_logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + + # STEP 1: Create model model = build_model(config) model = paddle.DataParallel(model) - # 2. Create train and val dataloader + + # STEP 2: Create train and val dataloader dataset_train, dataset_val = args[1], args[2] # Create training dataloader if not config.EVAL: dataloader_train = get_dataloader(config, dataset_train, 'train', True) total_batch_train = len(dataloader_train) - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + local_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') if local_rank == 0: - logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + master_logger.info(f'----- Total # of train batch (single gpu): {total_batch_train}') # Create validation dataloader dataloader_val = get_dataloader(config, dataset_val, 'test', True) total_batch_val = len(dataloader_val) - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + local_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') if local_rank == 0: - logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. 
Define optimizer and lr_scheduler + master_logger.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE * world_size) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -263,7 +394,9 @@ def main_worker(*args): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + local_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") if config.TRAIN.OPTIMIZER.NAME == "SGD": @@ -294,77 +427,120 @@ def main_worker(*args): # 'absolute_pos_embed', 'relative_position_bias_table']), ) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + local_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + if local_rank == 0: + master_logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 5. 
Load pretrained model / load resumt model and optimizer states + # STEP 6: Load pretrained model / load resumt model and optimizer states if config.MODEL.PRETRAINED: if (config.MODEL.PRETRAINED).endswith('.pdparams'): raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) - logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + local_logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + if local_rank == 0: + master_logger.info( + f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") if config.MODEL.RESUME: - assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True - assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True - model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + assert os.path.isfile(config.MODEL.RESUME + '.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME + '.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME + '.pdparams') model.set_dict(model_state) opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') optimizer.set_state_dict(opt_state) - logger.info( - f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + local_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") + if local_rank == 0: + master_logger.info( + f"----- Resume Training: Load model and optmizer from {config.MODEL.RESUME}") - # 6. Validation + # STEP 7: Validation (eval mode) if config.EVAL: - logger.info('----- Start Validating') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info('----- Start Validating') + if local_rank == 0: + master_logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") return - # 6. Start training and validation - logging.info(f"Start training from epoch {last_epoch+1}.") + # STEP 8: Start training and validation (train mode) + local_logger.info(f"Start training from epoch {last_epoch+1}.") + if local_rank == 0: + master_logger.info(f"Start training from epoch {last_epoch+1}.") for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") - train_loss, train_acc, train_time = train(dataloader=dataloader_train, - model=model, - criterion=criterion, - optimizer=optimizer, - epoch=epoch, - total_batch=total_batch_train, - debug_steps=config.REPORT_FREQ, - accum_iter=config.TRAIN.ACCUM_ITER, - amp=config.AMP) + local_logger.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + if local_rank == 0: + master_logger.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + train_loss, train_acc, avg_loss, avg_acc, train_time = train( + dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + mixup_fn=mixup_fn, + amp=config.AMP, + local_logger=local_logger, + master_logger=master_logger) + scheduler.step() - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Train Loss: {train_loss:.4f}, " + - f"Train Acc: {train_acc:.4f}, " + - f"time: {train_time:.2f}") + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {avg_loss:.4f}, " + + f"Train Acc: {avg_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: - logger.info(f'----- Validation after Epoch: {epoch}') - val_loss, val_acc1, val_acc5, val_time = validate( + local_logger.info(f'----- Validation after Epoch: {epoch}') + if local_rank == 0: + master_logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, avg_loss, avg_acc1, avg_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=total_batch_val, - debug_steps=config.REPORT_FREQ) - logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + - f"Validation Loss: {val_loss:.4f}, " + - f"Validation Acc@1: {val_acc1:.4f}, " + - f"Validation Acc@5: {val_acc5:.4f}, " + - f"time: {val_time:.2f}") + debug_steps=config.REPORT_FREQ, + local_logger=local_logger, + master_logger=master_logger) + local_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + if local_rank == 0: + master_logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {avg_loss:.4f}, " + + f"Validation Acc@1: {avg_acc1:.4f}, " + + f"Validation Acc@5: {avg_acc5:.4f}, " + + f"time: {val_time:.2f}") # model save if local_rank == 0: if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: @@ -372,18 +548,33 @@ def main_worker(*args): config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") paddle.save(model.state_dict(), model_path + '.pdparams') paddle.save(optimizer.state_dict(), model_path + '.pdopt') - logger.info(f"----- Save model: {model_path}.pdparams") - logger.info(f"----- Save optim: {model_path}.pdopt") + master_logger.info(f"----- Save model: {model_path}.pdparams") + master_logger.info(f"----- Save optim: {model_path}.pdopt") def main(): + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + + # set output folder + if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + + if not os.path.exists(config.SAVE): + 
os.makedirs(config.SAVE, exist_ok=True) + + # get dataset and start DDP if not config.EVAL: dataset_train = get_dataset(config, mode='train') else: dataset_train = None dataset_val = get_dataset(config, mode='val') config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS - dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS) + dist.spawn(main_worker, args=(config, dataset_train, dataset_val, ), nprocs=config.NGPUS) if __name__ == "__main__": diff --git a/image_classification/gMLP/main_single_gpu.py b/image_classification/gMLP/main_single_gpu.py index 4ada54d3..83e2d8b6 100644 --- a/image_classification/gMLP/main_single_gpu.py +++ b/image_classification/gMLP/main_single_gpu.py @@ -1,5 +1,4 @@ - -# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,55 +26,54 @@ import paddle.nn.functional as F from datasets import get_dataloader from datasets import get_dataset -from gmlp import build_gated_mlp as build_model from utils import AverageMeter from utils import WarmupCosineScheduler +from utils import get_exclude_from_weight_decay_fn from config import get_config from config import update_config +from mixup import Mixup +from losses import LabelSmoothingCrossEntropyLoss +from losses import SoftTargetCrossEntropyLoss +from gmlp import build_gated_mlp as build_model -parser = argparse.ArgumentParser('gMLP') -parser.add_argument('-cfg', type=str, default=None) -parser.add_argument('-dataset', type=str, default=None) -parser.add_argument('-batch_size', type=int, default=None) -parser.add_argument('-image_size', type=int, default=None) -parser.add_argument('-data_path', type=str, default=None) -parser.add_argument('-output', type=str, default=None) -parser.add_argument('-ngpus', type=int, default=None) -parser.add_argument('-pretrained', type=str, default=None) -parser.add_argument('-resume', type=str, default=None) -parser.add_argument('-last_epoch', type=int, default=None) -parser.add_argument('-eval', action='store_true') -parser.add_argument('-amp', action='store_true') -args = parser.parse_args() - - -log_format = "%(asctime)s %(message)s" -logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format=log_format, datefmt="%m%d %I:%M:%S %p") - -# get default config -config = get_config() -# update config by arguments -config = update_config(config, args) - -# set output folder -if not config.EVAL: - config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) -else: - config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) - -#config.freeze() - -if not os.path.exists(config.SAVE): - os.makedirs(config.SAVE, exist_ok=True) - -# set logging format -logger = logging.getLogger() -fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) -fh.setFormatter(logging.Formatter(log_format)) -logger.addHandler(fh) -logger.info(f'config= {config}') +def get_arguments(): + """return argumeents, this will overwrite the config after loading yaml file""" + parser = argparse.ArgumentParser('gMLP') + parser.add_argument('-cfg', type=str, default=None) + parser.add_argument('-dataset', type=str, default=None) + parser.add_argument('-batch_size', type=int, default=None) + parser.add_argument('-image_size', type=int, default=None) + parser.add_argument('-data_path', type=str, default=None) + 
parser.add_argument('-output', type=str, default=None) + parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) + parser.add_argument('-pretrained', type=str, default=None) + parser.add_argument('-resume', type=str, default=None) + parser.add_argument('-last_epoch', type=int, default=None) + parser.add_argument('-eval', action='store_true') + parser.add_argument('-amp', action='store_true') + arguments = parser.parse_args() + return arguments + + +def get_logger(filename, logger_name=None): + """set logging file and format + Args: + filename: str, full path of the logger file to write + logger_name: str, the logger name, e.g., 'master_logger', 'local_logger' + Return: + logger: python logger + """ + log_format = "%(asctime)s %(message)s" + logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + # different name is needed when creating multiple logger in one process + logger = logging.getLogger(logger_name) + fh = logging.FileHandler(os.path.join(filename)) + fh.setFormatter(logging.Formatter(log_format)) + logger.addHandler(fh) + return logger def train(dataloader, @@ -83,49 +81,57 @@ def train(dataloader, criterion, optimizer, epoch, + total_epochs, total_batch, debug_steps=100, accum_iter=1, - amp=False): + mixup_fn=None, + amp=False, + logger=None): """Training for one epoch Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion epoch: int, current epoch - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info - accum_iter: int, num of iters for accumulating gradients - amp: bool, if True, use mix precision training + total_epochs: int, total num of epochs + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + accum_iter: int, num of iters for accumulating gradients, default: 1 + mixup_fn: Mixup, mixup instance, default: None + amp: bool, if True, use mix precision training, default: False + logger: logger for logging, default: None Returns: - train_loss_meter.avg - train_acc_meter.avg - train_time + train_loss_meter.avg: float, average loss on current process/gpu + train_acc_meter.avg: float, average top1 accuracy on current process/gpu + train_time: float, training time """ model.train() train_loss_meter = AverageMeter() train_acc_meter = AverageMeter() + if amp is True: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) time_st = time.time() - for batch_id, data in enumerate(dataloader): image = data[0] label = data[1] + label_orig = label.clone() - if amp is True: + if mixup_fn is not None: + image, label = mixup_fn(image, label_orig) + + if amp is True: # mixed precision training with paddle.amp.auto_cast(): output = model(image) loss = criterion(output, label) scaled = scaler.scale(loss) scaled.backward() - if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): scaler.minimize(optimizer, scaled) optimizer.clear_grad() - - else: + else: # full precision training output = model(image) loss = criterion(output, label) #NOTE: division may be needed depending on the loss function @@ -139,15 +145,18 @@ def train(dataloader, optimizer.clear_grad() pred = F.softmax(output) - acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + if mixup_fn: + acc = paddle.metric.accuracy(pred, label_orig) + else: + acc = paddle.metric.accuracy(pred, label_orig.unsqueeze(1)) batch_size = image.shape[0] 
train_loss_meter.update(loss.numpy()[0], batch_size) train_acc_meter.update(acc.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( - f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Epoch[{epoch:03d}/{total_epochs:03d}], " + f"Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {train_loss_meter.avg:.4f}, " + f"Avg Acc: {train_acc_meter.avg:.4f}") @@ -156,19 +165,20 @@ def train(dataloader, return train_loss_meter.avg, train_acc_meter.avg, train_time -def validate(dataloader, model, criterion, total_batch, debug_steps=100): +def validate(dataloader, model, criterion, total_batch, debug_steps=100, logger=None): """Validation for whole dataset Args: dataloader: paddle.io.DataLoader, dataloader instance model: nn.Layer, a ViT model criterion: nn.criterion - total_epoch: int, total num of epoch, for logging - debug_steps: int, num of iters to log info + total_batch: int, total num of batches for one epoch + debug_steps: int, num of iters to log info, default: 100 + logger: logger for logging, default: None Returns: - val_loss_meter.avg - val_acc1_meter.avg - val_acc5_meter.avg - val_time + val_loss_meter.avg: float, average loss on current process/gpu + val_acc1_meter.avg: float, average top1 accuracy on current process/gpu + val_acc5_meter.avg: float, average top5 accuracy on current process/gpu + val_time: float, valitaion time """ model.eval() val_loss_meter = AverageMeter() @@ -193,7 +203,7 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): val_acc1_meter.update(acc1.numpy()[0], batch_size) val_acc5_meter.update(acc5.numpy()[0], batch_size) - if batch_id % debug_steps == 0: + if logger and batch_id % debug_steps == 0: logger.info( f"Val Step[{batch_id:04d}/{total_batch:04d}], " + f"Avg Loss: {val_loss_meter.avg:.4f}, " + @@ -205,25 +215,77 @@ def validate(dataloader, model, criterion, total_batch, debug_steps=100): def main(): - # 0. Preparation + # STEP 0: Preparation + # config is updated by: (1) config.py, (2) yaml file, (3) arguments + arguments = get_arguments() + config = get_config() + config = update_config(config, arguments) + # set output folder + if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) last_epoch = config.TRAIN.LAST_EPOCH seed = config.SEED paddle.seed(seed) np.random.seed(seed) random.seed(seed) - #paddle.set_device('gpu:0') - # 1. Create model + logger = get_logger(filename=os.path.join(config.SAVE, 'log.txt')) + logger.info(f'\n{config}') + + # STEP 1: Create model model = build_model(config) - # 2. Create train and val dataloader + + # STEP 2: Create train and val dataloader if not config.EVAL: dataset_train = get_dataset(config, mode='train') dataloader_train = get_dataloader(config, dataset_train, 'train', False) dataset_val = get_dataset(config, mode='val') dataloader_val = get_dataloader(config, dataset_val, 'val', False) - # 3. Define criterion - criterion = nn.CrossEntropyLoss() - # 4. 
Define lr_scheduler + # STEP 3: Define Mixup function + mixup_fn = None + if config.TRAIN.MIXUP_PROB > 0 or config.TRAIN.CUTMIX_ALPHA > 0 or config.TRAIN.CUTMIX_MINMAX is not None: + mixup_fn = Mixup(mixup_alpha=config.TRAIN.MIXUP_ALPHA, + cutmix_alpha=config.TRAIN.CUTMIX_ALPHA, + cutmix_minmax=config.TRAIN.CUTMIX_MINMAX, + prob=config.TRAIN.MIXUP_PROB, + switch_prob=config.TRAIN.MIXUP_SWITCH_PROB, + mode=config.TRAIN.MIXUP_MODE, + label_smoothing=config.TRAIN.SMOOTHING, + num_classes=config.MODEL.NUM_CLASSES) + + # STEP 4: Define criterion + if config.TRAIN.MIXUP_PROB > 0.: + criterion = SoftTargetCrossEntropyLoss() + elif config.TRAIN.SMOOTHING: + criterion = LabelSmoothingCrossEntropyLoss() + else: + criterion = nn.CrossEntropyLoss() + # only use cross entropy for val + criterion_val = nn.CrossEntropyLoss() + + # STEP 5: Define optimizer and lr_scheduler + # set lr according to batch size and world size (hacked from Swin official code and modified for CSwin) + if config.TRAIN.LINEAR_SCALED_LR is not None: + linear_scaled_lr = ( + config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_warmup_start_lr = ( + config.TRAIN.WARMUP_START_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + linear_scaled_end_lr = ( + config.TRAIN.END_LR * config.DATA.BATCH_SIZE) / config.TRAIN.LINEAR_SCALED_LR + + if config.TRAIN.ACCUM_ITER > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUM_ITER + linear_scaled_warmup_start_lr = linear_scaled_warmup_start_lr * config.TRAIN.ACCUM_ITER + linear_scaled_end_lr = linear_scaled_end_lr * config.TRAIN.ACCUM_ITER + + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_START_LR = linear_scaled_warmup_start_lr + config.TRAIN.END_LR = linear_scaled_end_lr + scheduler = None if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, @@ -232,8 +294,7 @@ def main(): end_lr=config.TRAIN.END_LR, warmup_epochs=config.TRAIN.WARMUP_EPOCHS, total_epochs=config.TRAIN.NUM_EPOCHS, - last_epoch=config.TRAIN.LAST_EPOCH, - ) + last_epoch=config.TRAIN.LAST_EPOCH) elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, T_max=config.TRAIN.NUM_EPOCHS, @@ -245,9 +306,9 @@ def main(): gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, last_epoch=last_epoch) else: - logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + logger.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") - # 5. Define optimizer + if config.TRAIN.OPTIMIZER.NAME == "SGD": if config.TRAIN.GRAD_CLIP: clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) @@ -267,18 +328,21 @@ def main(): optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, - weight_decay=config.TRAIN.WEIGHT_DECAY, beta1=config.TRAIN.OPTIMIZER.BETAS[0], beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, epsilon=config.TRAIN.OPTIMIZER.EPS, grad_clip=clip) else: - logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + logger.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") - # 6. 
Load pretrained model or load resume model and optimizer states + + # STEP 6: Load pretrained model or load resume model and optimizer states if config.MODEL.PRETRAINED: - assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') - model_state = paddle.load(config.MODEL.PRETRAINED + '.pdparams') + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') model.set_dict(model_state) logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") @@ -291,35 +355,40 @@ def main(): optimizer.set_state_dict(opt_state) logger.info( f"----- Resume: Load model and optmizer from {config.MODEL.RESUME}") - # 7. Validation + + # STEP 7: Validation (eval mode) if config.EVAL: logger.info('----- Start Validating') val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + f"Validation Acc@5: {val_acc5:.4f}, " + f"time: {val_time:.2f}") return - # 8. Start training and validation - logging.info(f"Start training from epoch {last_epoch + 1}.") - for epoch in range(last_epoch + 1, config.TRAIN.NUM_EPOCHS + 1): + + # STEP 8: Start training and validation (train mode) + logger.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): # train - logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") + logger.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}") train_loss, train_acc, train_time = train(dataloader=dataloader_train, model=model, criterion=criterion, optimizer=optimizer, epoch=epoch, + total_epochs=config.TRAIN.NUM_EPOCHS, total_batch=len(dataloader_train), debug_steps=config.REPORT_FREQ, accum_iter=config.TRAIN.ACCUM_ITER, + mixup_fn=mixup_fn, amp=config.AMP, - ) + logger=logger) scheduler.step() logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Train Loss: {train_loss:.4f}, " + @@ -331,9 +400,10 @@ def main(): val_loss, val_acc1, val_acc5, val_time = validate( dataloader=dataloader_val, model=model, - criterion=criterion, + criterion=criterion_val, total_batch=len(dataloader_val), - debug_steps=config.REPORT_FREQ) + debug_steps=config.REPORT_FREQ, + logger=logger) logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + f"Validation Loss: {val_loss:.4f}, " + f"Validation Acc@1: {val_acc1:.4f}, " + diff --git a/image_classification/gMLP/mixup.py b/image_classification/gMLP/mixup.py new file mode 100644 index 00000000..1d2db493 --- /dev/null +++ b/image_classification/gMLP/mixup.py @@ -0,0 +1,225 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""mixup and cutmix for batch data""" +import numpy as np +import paddle + + +def rand_bbox(image_shape, lam, count=None): + """ CutMix bbox by lam value + Generate 1 random bbox by value lam. lam is the cut size rate. + The cut_size is computed by sqrt(1-lam) * image_size. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + count: int, number of bbox to generate + """ + image_h, image_w = image_shape[-2:] + cut_rate = np.sqrt(1. - lam) + cut_h = int(cut_rate * image_h) + cut_w = int(cut_rate * image_w) + + # get random bbox center + cy = np.random.randint(0, image_h, size=count) + cx = np.random.randint(0, image_w, size=count) + + # get bbox coords + bbox_x1 = np.clip(cx - cut_w // 2, 0, image_w) + bbox_y1 = np.clip(cy - cut_h // 2, 0, image_h) + bbox_x2 = np.clip(cx + cut_w // 2, 0, image_w) + bbox_y2 = np.clip(cy + cut_h // 2, 0, image_h) + + # NOTE: in paddle, tensor indexing e.g., a[x1:x2], + # if x1 == x2, paddle will raise ValueErros, + # while in pytorch, it will return [] tensor + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def rand_bbox_minmax(image_shape, minmax, count=None): + """ CutMix bbox by min and max value + Generate 1 random bbox by min and max percentage values. + Minmax is a tuple/list of min and max percentage vlaues + applied to the image width and height. + + Args: + image_shape: tuple/list, image height and width + minmax: tuple/list, min and max percentage values of image size + count: int, number of bbox to generate + """ + assert len(minmax) == 2 + image_h, image_w = image_shape[-2:] + min_ratio = minmax[0] + max_ratio = minmax[1] + cut_h = np.random.randint(int(image_h * min_ratio), int(image_h * max_ratio), size=count) + cut_w = np.random.randint(int(image_w * min_ratio), int(image_w * max_ratio), size=count) + + bbox_x1 = np.random.randint(0, image_w - cut_w, size=count) + bbox_y1 = np.random.randint(0, image_h - cut_h, size=count) + bbox_x2 = bbox_x1 + cut_w + bbox_y2 = bbox_y1 + cut_h + + return bbox_x1, bbox_y1, bbox_x2, bbox_y2 + + +def cutmix_generate_bbox_adjust_lam(image_shape, lam, minmax=None, correct_lam=True, count=None): + """Generate bbox and apply correction for lambda + If the mimmax is None, apply the standard cutmix by lam value, + If the minmax is set, apply the cutmix by min and max percentage values. + + Args: + image_shape: tuple/list, image height and width + lam: float, cutmix lambda value + minmax: tuple/list, min and max percentage values of image size + correct_lam: bool, if True, correct the lam value by the generated bbox + count: int, number of bbox to generate + """ + if minmax is not None: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox_minmax(image_shape, minmax, count) + else: + bbox_x1, bbox_y1, bbox_x2, bbox_y2 = rand_bbox(image_shape, lam, count) + + if correct_lam or minmax is not None: + image_h, image_w = image_shape[-2:] + bbox_area = (bbox_y2 - bbox_y1) * (bbox_x2 - bbox_x1) + lam = 1. - bbox_area / float(image_h * image_w) + return (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam + + +def one_hot(x, num_classes, on_value=1., off_value=0.): + """ Generate one-hot vector for label smoothing + Args: + x: tensor, contains label/class indices + num_classes: int, num of classes (len of the one-hot vector) + on_value: float, the vector value at label index, default=1. + off_value: float, the vector value at non-label indices, default=0. 
+ Returns: + one_hot: tensor, tensor with on value at label index and off value + at non-label indices. + """ + x = x.reshape_([-1, 1]) + x_smoothed = paddle.full((x.shape[0], num_classes), fill_value=off_value) + for i in range(x.shape[0]): + x_smoothed[i, x[i]] = on_value + return x_smoothed + + +def mixup_one_hot(label, num_classes, lam=1., smoothing=0.): + """ mixup and label smoothing in batch + label smoothing is firstly applied, then + mixup is applied by mixing the bacth and its flip, + with a mixup rate. + + Args: + label: tensor, label tensor with shape [N], contains the class indices + num_classes: int, num of all classes + lam: float, mixup rate, default=1.0 + smoothing: float, label smoothing rate + """ + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + y1 = one_hot(label, num_classes, on_value, off_value) + y2 = one_hot(label.flip(axis=[0]), num_classes, on_value, off_value) + return y2 * (1 - lam) + y1 * lam + + +class Mixup: + """Mixup class + Args: + mixup_alpha: float, mixup alpha for beta distribution, default=1.0, + cutmix_alpha: float, cutmix alpha for beta distribution, default=0.0, + cutmix_minmax: list/tuple, min and max value for cutmix ratio, default=None, + prob: float, if random prob < prob, do not use mixup, default=1.0, + switch_prob: float, prob of switching mixup and cutmix, default=0.5, + mode: string, mixup up, now only 'batch' is supported, default='batch', + correct_lam: bool, if True, apply correction of lam, default=True, + label_smoothing: float, label smoothing rate, default=0.1, + num_classes: int, num of classes, default=1000 + """ + def __init__(self, + mixup_alpha=1.0, + cutmix_alpha=0.0, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + correct_lam=True, + label_smoothing=0.1, + num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if cutmix_minmax is not None: + assert len(cutmix_minmax) == 2 + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam + assert mode == 'batch', 'Now only batch mode is supported!' + + def __call__(self, x, target): + assert x.shape[0] % 2 == 0, "Batch size should be even" + lam = self._mix_batch(x) + target = mixup_one_hot(target, self.num_classes, lam, self.label_smoothing) + return x, target + + def get_params(self): + """Decide to use cutmix or regular mixup by sampling and + sample lambda for mixup + """ + lam = 1. + use_cutmix = False + use_mixup = np.random.rand() < self.mix_prob + if use_mixup: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + alpha = self.cutmix_alpha if use_cutmix else self.mixup_alpha + lam_mix = np.random.beta(alpha, alpha) + elif self.mixup_alpha == 0. and self.cutmix_alpha > 0.: + use_cutmix=True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + elif self.mixup_alpha > 0. 
and self.cutmix_alpha == 0.:
+                lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha)
+            else:
+                raise ValueError('mixup_alpha and cutmix_alpha cannot be all 0')
+            lam = float(lam_mix)
+        return lam, use_cutmix
+
+    def _mix_batch(self, x):
+        """mixup/cutmix by adding batch data and its flipped version"""
+        lam, use_cutmix = self.get_params()
+        if lam == 1.:
+            return lam
+        if use_cutmix:
+            (bbox_x1, bbox_y1, bbox_x2, bbox_y2), lam = cutmix_generate_bbox_adjust_lam(
+                x.shape,
+                lam,
+                minmax=self.cutmix_minmax,
+                correct_lam=self.correct_lam)
+
+            # NOTE: in paddle, tensor indexing e.g., a[x1:x2],
+            # if x1 == x2, paddle will raise ValueError,
+            # but in pytorch, it will return [] tensor without errors
+            if int(bbox_x1) != int(bbox_x2) and int(bbox_y1) != int(bbox_y2):
+                x[:, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)] = x.flip(axis=[0])[
+                    :, :, int(bbox_x1): int(bbox_x2), int(bbox_y1): int(bbox_y2)]
+        else:
+            x_flipped = x.flip(axis=[0])
+            x_flipped = x_flipped * (1 - lam)
+            x.set_value(x * (lam) + x_flipped)
+        return lam
diff --git a/image_classification/gMLP/random_erasing.py b/image_classification/gMLP/random_erasing.py
new file mode 100644
index 00000000..31eea465
--- /dev/null
+++ b/image_classification/gMLP/random_erasing.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2021 PPViT Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Random Erasing for image tensor"""
+
+import random
+import math
+import paddle
+
+
+def _get_pixels(per_pixel, rand_color, patch_size, dtype="float32"):
+    if per_pixel:
+        return paddle.normal(shape=patch_size).astype(dtype)
+    if rand_color:
+        return paddle.normal(shape=(patch_size[0], 1, 1)).astype(dtype)
+    return paddle.zeros((patch_size[0], 1, 1)).astype(dtype)
+
+
+class RandomErasing(object):
+    """
+    Args:
+        prob: probability of performing random erasing
+        min_area: Minimum percentage of erased area wrt input image area
+        max_area: Maximum percentage of erased area wrt input image area
+        min_aspect: Minimum aspect ratio of erased area
+        max_aspect: Maximum aspect ratio of erased area
+        mode: pixel color mode, in ['const', 'rand', 'pixel']
+            'const' - erase block is constant valued 0 for all channels
+            'rand' - erase block is valued random color (same per-channel)
+            'pixel' - erase block is valued random color per pixel
+        min_count: Minimum # of erasing blocks per image.
+        max_count: Maximum # of erasing blocks per image.
Area per box is scaled by count + per-image count is randomly chosen between min_count to max_count + """ + def __init__(self, prob=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0): + self.prob = prob + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == "rand": + self.rand_color = True + elif mode == "pixel": + self.per_pixel = True + else: + assert not mode or mode == "const" + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.prob: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top+h, left:left+w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + + +#def main(): +# re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='rand') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='const') +# #re = RandomErasing(prob=1.0, min_area=0.2, max_area=0.6, mode='pixel') +# import PIL.Image as Image +# import numpy as np +# paddle.set_device('cpu') +# img = paddle.to_tensor(np.asarray(Image.open('./lenna.png'))).astype('float32') +# img = img / 255.0 +# img = paddle.transpose(img, [2, 0, 1]) +# new_img = re(img) +# new_img = new_img * 255.0 +# new_img = paddle.transpose(new_img, [1, 2, 0]) +# new_img = new_img.cpu().numpy() +# new_img = Image.fromarray(new_img.astype('uint8')) +# new_img.save('./res.png') +# +# +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/gMLP/transforms.py b/image_classification/gMLP/transforms.py new file mode 100644 index 00000000..5a046912 --- /dev/null +++ b/image_classification/gMLP/transforms.py @@ -0,0 +1,14 @@ +import random +import paddle +import paddle.nn +import paddle.vision.transforms as T + + +class RandomHorizontalFlip(): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, image): + if random.random() < self.p: + return T.hflip(image) + return image From cc0b342b5321fd6eaa3001e0b59338f3347405f7 Mon Sep 17 00:00:00 2001 From: xperzy Date: Tue, 11 Jan 2022 10:45:51 +0800 Subject: [PATCH 2/2] add num_classes in args --- image_classification/BoTNet/config.py | 2 ++ image_classification/BoTNet/main_multi_gpu.py | 1 + image_classification/BoTNet/main_single_gpu.py | 1 + image_classification/CSwin/config.py | 4 +++- image_classification/CSwin/main_multi_gpu.py | 1 + 
image_classification/CSwin/main_single_gpu.py | 1 + image_classification/CaiT/cait.py | 10 +++++++++- image_classification/CaiT/config.py | 4 +++- image_classification/CaiT/main_multi_gpu.py | 1 + image_classification/CaiT/main_single_gpu.py | 1 + image_classification/CrossViT/config.py | 2 ++ image_classification/CrossViT/main_multi_gpu.py | 1 + image_classification/CrossViT/main_single_gpu.py | 1 + image_classification/CvT/config.py | 2 ++ image_classification/CvT/main_multi_gpu.py | 1 + image_classification/CvT/main_single_gpu.py | 1 + image_classification/CycleMLP/config.py | 1 + image_classification/DeiT/config.py | 3 +++ image_classification/DeiT/main_multi_gpu.py | 1 + image_classification/DeiT/main_single_gpu.py | 1 + image_classification/HaloNet/config.py | 2 ++ image_classification/HaloNet/halonet.py | 3 +-- image_classification/HaloNet/main_multi_gpu.py | 1 + image_classification/HaloNet/main_single_gpu.py | 1 + image_classification/MobileViT/config.py | 2 ++ image_classification/MobileViT/main_multi_gpu.py | 1 + image_classification/MobileViT/main_single_gpu.py | 1 + image_classification/PVTv2/config.py | 4 +++- image_classification/PVTv2/main_multi_gpu.py | 1 + image_classification/PVTv2/main_single_gpu.py | 1 + image_classification/PiT/config.py | 2 ++ image_classification/PiT/main_multi_gpu.py | 1 + image_classification/PiT/main_single_gpu.py | 1 + image_classification/PoolFormer/config.py | 2 ++ image_classification/PoolFormer/main_multi_gpu.py | 1 + image_classification/PoolFormer/main_single_gpu.py | 1 + image_classification/Shuffle_Transformer/config.py | 2 ++ .../Shuffle_Transformer/main_multi_gpu.py | 1 + .../Shuffle_Transformer/main_single_gpu.py | 1 + image_classification/T2T_ViT/config.py | 2 ++ image_classification/T2T_ViT/main_multi_gpu.py | 1 + image_classification/T2T_ViT/main_single_gpu.py | 1 + image_classification/ViP/config.py | 2 ++ image_classification/ViP/main_multi_gpu.py | 1 + image_classification/ViP/main_single_gpu.py | 1 + image_classification/ViT/config.py | 9 ++++++--- image_classification/ViT/main_multi_gpu.py | 1 + image_classification/ViT/main_single_gpu.py | 1 + image_classification/XCiT/config.py | 4 +++- image_classification/XCiT/main_multi_gpu.py | 1 + image_classification/XCiT/main_single_gpu.py | 1 + 51 files changed, 84 insertions(+), 10 deletions(-) diff --git a/image_classification/BoTNet/config.py b/image_classification/BoTNet/config.py index 6ac2f51a..0c604ea3 100644 --- a/image_classification/BoTNet/config.py +++ b/image_classification/BoTNet/config.py @@ -131,6 +131,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/BoTNet/main_multi_gpu.py b/image_classification/BoTNet/main_multi_gpu.py index 33a239a0..d54afe8e 100644 --- a/image_classification/BoTNet/main_multi_gpu.py +++ b/image_classification/BoTNet/main_multi_gpu.py @@ -47,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff 
--git a/image_classification/BoTNet/main_single_gpu.py b/image_classification/BoTNet/main_single_gpu.py index b5ec964d..4c2d7fd5 100644 --- a/image_classification/BoTNet/main_single_gpu.py +++ b/image_classification/BoTNet/main_single_gpu.py @@ -46,6 +46,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/CSwin/config.py b/image_classification/CSwin/config.py index e959c57e..a11bd6e2 100644 --- a/image_classification/CSwin/config.py +++ b/image_classification/CSwin/config.py @@ -71,7 +71,7 @@ _C.TRAIN.WARMUP_START_LR = 1e-6 _C.TRAIN.END_LR = 1e-5 _C.TRAIN.GRAD_CLIP = None -_C.TRAIN.ACCUM_ITER = 2 +_C.TRAIN.ACCUM_ITER = 1 _C.TRAIN.MODEL_EMA = True _C.TRAIN.MODEL_EMA_DECAY = 0.99992 _C.TRAIN.LINEAR_SCALED_LR = None @@ -150,6 +150,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/CSwin/main_multi_gpu.py b/image_classification/CSwin/main_multi_gpu.py index 149c72c0..5c6bbec5 100644 --- a/image_classification/CSwin/main_multi_gpu.py +++ b/image_classification/CSwin/main_multi_gpu.py @@ -49,6 +49,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/CSwin/main_single_gpu.py b/image_classification/CSwin/main_single_gpu.py index 772a9ffb..731bfc3d 100644 --- a/image_classification/CSwin/main_single_gpu.py +++ b/image_classification/CSwin/main_single_gpu.py @@ -48,6 +48,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/CaiT/cait.py b/image_classification/CaiT/cait.py index a92f91ab..3fd982ab 100644 --- a/image_classification/CaiT/cait.py +++ b/image_classification/CaiT/cait.py @@ -545,10 +545,18 @@ def forward(self, x): def build_cait(config): """build cait model using config""" model = Cait(image_size=config.DATA.IMAGE_SIZE, + num_classes=config.MODEL.NUM_CLASSES, + in_channels=config.MODEL.TRANS.IN_CHANNELS, patch_size=config.MODEL.TRANS.PATCH_SIZE, embed_dim=config.MODEL.TRANS.EMBED_DIM, depth=config.MODEL.TRANS.DEPTH, num_heads=config.MODEL.TRANS.NUM_HEADS, mlp_ratio=config.MODEL.TRANS.MLP_RATIO, - qkv_bias=config.MODEL.TRANS.QKV_BIAS) + qkv_bias=config.MODEL.TRANS.QKV_BIAS, + dropout=config.MODEL.DROPOUT, + 
attention_dropout=config.MODEL.ATTENTION_DROPOUT, + droppath=config.MODEL.DROPPATH, + init_values=config.MODEL.TRANS.INIT_VALUES, + mlp_ratio_class_token=config.MODEL.TRANS.MLP_RATIO, + depth_token_only=config.MODEL.TRANS.DEPTH_TOKEN_ONLY) return model diff --git a/image_classification/CaiT/config.py b/image_classification/CaiT/config.py index 0e298229..99f4e221 100644 --- a/image_classification/CaiT/config.py +++ b/image_classification/CaiT/config.py @@ -59,7 +59,7 @@ _C.MODEL.TRANS.MLP_RATIO = 4.0 _C.MODEL.TRANS.NUM_HEADS = 4 _C.MODEL.TRANS.QKV_BIAS = True -_C.MODEL.TRANS.INIT_VALUES = 1e-5 +_C.MODEL.TRANS.INIT_VALUES = 1e-4 # training settings @@ -154,6 +154,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/CaiT/main_multi_gpu.py b/image_classification/CaiT/main_multi_gpu.py index b0a3b1af..f274d0df 100644 --- a/image_classification/CaiT/main_multi_gpu.py +++ b/image_classification/CaiT/main_multi_gpu.py @@ -49,6 +49,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/CaiT/main_single_gpu.py b/image_classification/CaiT/main_single_gpu.py index a1cecfd9..6d1cb7a0 100644 --- a/image_classification/CaiT/main_single_gpu.py +++ b/image_classification/CaiT/main_single_gpu.py @@ -47,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/CrossViT/config.py b/image_classification/CrossViT/config.py index 133eb6f4..b1c51aa4 100644 --- a/image_classification/CrossViT/config.py +++ b/image_classification/CrossViT/config.py @@ -158,6 +158,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/CrossViT/main_multi_gpu.py b/image_classification/CrossViT/main_multi_gpu.py index c46acc2c..73932db0 100644 --- a/image_classification/CrossViT/main_multi_gpu.py +++ b/image_classification/CrossViT/main_multi_gpu.py @@ -49,6 +49,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git
a/image_classification/CrossViT/main_single_gpu.py b/image_classification/CrossViT/main_single_gpu.py index 6638172a..c8ad8bc4 100644 --- a/image_classification/CrossViT/main_single_gpu.py +++ b/image_classification/CrossViT/main_single_gpu.py @@ -50,6 +50,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/CvT/config.py b/image_classification/CvT/config.py index ae43eb57..a1f57199 100644 --- a/image_classification/CvT/config.py +++ b/image_classification/CvT/config.py @@ -146,6 +146,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/CvT/main_multi_gpu.py b/image_classification/CvT/main_multi_gpu.py index d1eef8cf..dc1ed0d9 100644 --- a/image_classification/CvT/main_multi_gpu.py +++ b/image_classification/CvT/main_multi_gpu.py @@ -48,6 +48,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/CvT/main_single_gpu.py b/image_classification/CvT/main_single_gpu.py index a1ef185d..2a858a29 100644 --- a/image_classification/CvT/main_single_gpu.py +++ b/image_classification/CvT/main_single_gpu.py @@ -47,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/CycleMLP/config.py b/image_classification/CycleMLP/config.py index 99754c68..a0b94cb2 100644 --- a/image_classification/CycleMLP/config.py +++ b/image_classification/CycleMLP/config.py @@ -126,6 +126,7 @@ def _update_config_from_file(config, cfg_file): config.merge_from_file(cfg_file) config.freeze() + def update_config(config, args): """Update config by ArgumentParser Args: diff --git a/image_classification/DeiT/config.py b/image_classification/DeiT/config.py index 4b023f60..799a614b 100644 --- a/image_classification/DeiT/config.py +++ b/image_classification/DeiT/config.py @@ -31,6 +31,7 @@ _C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset _C.DATA.DATASET = 'imagenet2012' # dataset name _C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.IMAGE_CHANNELS = 3 # input image channels _C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode _C.DATA.NUM_WORKERS = 1 # number of data 
loading threads _C.DATA.IMAGENET_MEAN = [0.485, 0.456, 0.406] # [0.5, 0.5, 0.5] @@ -151,6 +152,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/DeiT/main_multi_gpu.py b/image_classification/DeiT/main_multi_gpu.py index d3502a2f..1dab0690 100644 --- a/image_classification/DeiT/main_multi_gpu.py +++ b/image_classification/DeiT/main_multi_gpu.py @@ -51,6 +51,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/DeiT/main_single_gpu.py b/image_classification/DeiT/main_single_gpu.py index 6db48969..5ea51051 100644 --- a/image_classification/DeiT/main_single_gpu.py +++ b/image_classification/DeiT/main_single_gpu.py @@ -51,6 +51,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/HaloNet/config.py b/image_classification/HaloNet/config.py index f01d9058..9b33c1f4 100755 --- a/image_classification/HaloNet/config.py +++ b/image_classification/HaloNet/config.py @@ -150,6 +150,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/HaloNet/halonet.py b/image_classification/HaloNet/halonet.py index 0f736b99..78b75c52 100755 --- a/image_classification/HaloNet/halonet.py +++ b/image_classification/HaloNet/halonet.py @@ -663,8 +663,7 @@ def build_halonet(config): :param config: config instance contains setting options :return: HaloNet model """ - model = HaloNet( - depth_list=config.MODEL.DEPTH, + model = HaloNet(depth_list=config.MODEL.DEPTH, stage1_block=config.MODEL.STAGE1_BLOCK, stage2_block=config.MODEL.STAGE2_BLOCK, stage3_block=config.MODEL.STAGE3_BLOCK, diff --git a/image_classification/HaloNet/main_multi_gpu.py b/image_classification/HaloNet/main_multi_gpu.py index 391a428c..6090bc78 100755 --- a/image_classification/HaloNet/main_multi_gpu.py +++ b/image_classification/HaloNet/main_multi_gpu.py @@ -48,6 +48,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git 
a/image_classification/HaloNet/main_single_gpu.py b/image_classification/HaloNet/main_single_gpu.py index c8f48004..0edb7149 100755 --- a/image_classification/HaloNet/main_single_gpu.py +++ b/image_classification/HaloNet/main_single_gpu.py @@ -47,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/MobileViT/config.py b/image_classification/MobileViT/config.py index d699e160..4bc7c431 100644 --- a/image_classification/MobileViT/config.py +++ b/image_classification/MobileViT/config.py @@ -161,6 +161,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/MobileViT/main_multi_gpu.py b/image_classification/MobileViT/main_multi_gpu.py index d51022fb..472e0392 100644 --- a/image_classification/MobileViT/main_multi_gpu.py +++ b/image_classification/MobileViT/main_multi_gpu.py @@ -50,6 +50,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/MobileViT/main_single_gpu.py b/image_classification/MobileViT/main_single_gpu.py index 5b5bd2b1..9d4e95e9 100644 --- a/image_classification/MobileViT/main_single_gpu.py +++ b/image_classification/MobileViT/main_single_gpu.py @@ -49,6 +49,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/PVTv2/config.py b/image_classification/PVTv2/config.py index 18b609b8..f816565c 100644 --- a/image_classification/PVTv2/config.py +++ b/image_classification/PVTv2/config.py @@ -61,7 +61,7 @@ _C.MODEL.TRANS.SR_RATIO = [8, 4, 2, 1] _C.MODEL.TRANS.QKV_BIAS = True _C.MODEL.TRANS.QK_SCALE = None -_C.MODEL.TRANS.LINEAR = False +_C.MODEL.TRANS.LINEAR = None # training settings _C.TRAIN = CN() @@ -150,6 +150,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/PVTv2/main_multi_gpu.py b/image_classification/PVTv2/main_multi_gpu.py index 4cecef6b..ee32f276 100644 --- a/image_classification/PVTv2/main_multi_gpu.py +++ 
b/image_classification/PVTv2/main_multi_gpu.py @@ -50,6 +50,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/PVTv2/main_single_gpu.py b/image_classification/PVTv2/main_single_gpu.py index 0f6967ba..0b282077 100644 --- a/image_classification/PVTv2/main_single_gpu.py +++ b/image_classification/PVTv2/main_single_gpu.py @@ -49,6 +49,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/PiT/config.py b/image_classification/PiT/config.py index 051fe2ad..4aa9674e 100644 --- a/image_classification/PiT/config.py +++ b/image_classification/PiT/config.py @@ -152,6 +152,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/PiT/main_multi_gpu.py b/image_classification/PiT/main_multi_gpu.py index ea327f22..1da841b2 100644 --- a/image_classification/PiT/main_multi_gpu.py +++ b/image_classification/PiT/main_multi_gpu.py @@ -51,6 +51,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/PiT/main_single_gpu.py b/image_classification/PiT/main_single_gpu.py index 67ba96d6..3126bac7 100644 --- a/image_classification/PiT/main_single_gpu.py +++ b/image_classification/PiT/main_single_gpu.py @@ -51,6 +51,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/PoolFormer/config.py b/image_classification/PoolFormer/config.py index 551c2114..954629c5 100644 --- a/image_classification/PoolFormer/config.py +++ b/image_classification/PoolFormer/config.py @@ -147,6 +147,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output 
is not None: diff --git a/image_classification/PoolFormer/main_multi_gpu.py b/image_classification/PoolFormer/main_multi_gpu.py index cb4e2de3..3e81aa25 100644 --- a/image_classification/PoolFormer/main_multi_gpu.py +++ b/image_classification/PoolFormer/main_multi_gpu.py @@ -46,6 +46,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/PoolFormer/main_single_gpu.py b/image_classification/PoolFormer/main_single_gpu.py index 71a4fcf8..69022755 100644 --- a/image_classification/PoolFormer/main_single_gpu.py +++ b/image_classification/PoolFormer/main_single_gpu.py @@ -46,6 +46,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/Shuffle_Transformer/config.py b/image_classification/Shuffle_Transformer/config.py index 8910dcdf..55931dcd 100644 --- a/image_classification/Shuffle_Transformer/config.py +++ b/image_classification/Shuffle_Transformer/config.py @@ -150,6 +150,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/Shuffle_Transformer/main_multi_gpu.py b/image_classification/Shuffle_Transformer/main_multi_gpu.py index 28b17520..890d2ada 100644 --- a/image_classification/Shuffle_Transformer/main_multi_gpu.py +++ b/image_classification/Shuffle_Transformer/main_multi_gpu.py @@ -48,6 +48,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/Shuffle_Transformer/main_single_gpu.py b/image_classification/Shuffle_Transformer/main_single_gpu.py index 87fb4feb..c21f55e2 100644 --- a/image_classification/Shuffle_Transformer/main_single_gpu.py +++ b/image_classification/Shuffle_Transformer/main_single_gpu.py @@ -48,6 +48,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/T2T_ViT/config.py 
b/image_classification/T2T_ViT/config.py index c506e56c..c4eba120 100644 --- a/image_classification/T2T_ViT/config.py +++ b/image_classification/T2T_ViT/config.py @@ -147,6 +147,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/T2T_ViT/main_multi_gpu.py b/image_classification/T2T_ViT/main_multi_gpu.py index 4e53882c..59719268 100644 --- a/image_classification/T2T_ViT/main_multi_gpu.py +++ b/image_classification/T2T_ViT/main_multi_gpu.py @@ -50,6 +50,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/T2T_ViT/main_single_gpu.py b/image_classification/T2T_ViT/main_single_gpu.py index 4ba9753b..4c68fcef 100644 --- a/image_classification/T2T_ViT/main_single_gpu.py +++ b/image_classification/T2T_ViT/main_single_gpu.py @@ -50,6 +50,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/ViP/config.py b/image_classification/ViP/config.py index e4b47b1c..fd6f36a7 100644 --- a/image_classification/ViP/config.py +++ b/image_classification/ViP/config.py @@ -143,6 +143,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/ViP/main_multi_gpu.py b/image_classification/ViP/main_multi_gpu.py index 30384fc0..6bd4fdf3 100644 --- a/image_classification/ViP/main_multi_gpu.py +++ b/image_classification/ViP/main_multi_gpu.py @@ -49,6 +49,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/ViP/main_single_gpu.py b/image_classification/ViP/main_single_gpu.py index dc88e244..3c0e7de0 100644 --- a/image_classification/ViP/main_single_gpu.py +++ b/image_classification/ViP/main_single_gpu.py @@ -49,6 +49,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) 
parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/ViT/config.py b/image_classification/ViT/config.py index 67eba161..8ab05bce 100644 --- a/image_classification/ViT/config.py +++ b/image_classification/ViT/config.py @@ -85,9 +85,9 @@ # misc _C.SAVE = "./output" _C.TAG = "default" -_C.SAVE_FREQ = 10 # freq to save chpt -_C.REPORT_FREQ = 100 # freq to logging info -_C.VALIDATE_FREQ = 100 # freq to do validation +_C.SAVE_FREQ = 1 # freq to save chpt +_C.REPORT_FREQ = 20 # freq to logging info +_C.VALIDATE_FREQ = 20 # freq to do validation _C.SEED = 0 # random seed for paddle, numpy and python _C.EVAL = False # run evaluation only _C.AMP = False # mix precision training @@ -125,6 +125,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: @@ -145,6 +147,7 @@ def update_config(config, args): config.AMP = False else: config.AMP = True + #config.freeze() return config diff --git a/image_classification/ViT/main_multi_gpu.py b/image_classification/ViT/main_multi_gpu.py index 5377e0fc..fc61db3c 100644 --- a/image_classification/ViT/main_multi_gpu.py +++ b/image_classification/ViT/main_multi_gpu.py @@ -44,6 +44,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/ViT/main_single_gpu.py b/image_classification/ViT/main_single_gpu.py index 692e703f..a2f26781 100644 --- a/image_classification/ViT/main_single_gpu.py +++ b/image_classification/ViT/main_single_gpu.py @@ -43,6 +43,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/XCiT/config.py b/image_classification/XCiT/config.py index dddb4f1a..2be81d34 100644 --- a/image_classification/XCiT/config.py +++ b/image_classification/XCiT/config.py @@ -69,7 +69,7 @@ _C.TRAIN.WARMUP_START_LR = 0.0 _C.TRAIN.END_LR = 0.0 _C.TRAIN.GRAD_CLIP = 1.0 -_C.TRAIN.ACCUM_ITER = 2 +_C.TRAIN.ACCUM_ITER = 1 _C.TRAIN.LINEAR_SCALED_LR = None _C.TRAIN.LR_SCHEDULER = CN() @@ -145,6 +145,8 @@ def update_config(config, args): config.DATA.BATCH_SIZE = args.batch_size if args.image_size: config.DATA.IMAGE_SIZE = args.image_size + if args.num_classes: + config.MODEL.NUM_CLASSES = args.num_classes if args.data_path: config.DATA.DATA_PATH = args.data_path if args.output is not None: diff --git a/image_classification/XCiT/main_multi_gpu.py b/image_classification/XCiT/main_multi_gpu.py index b817e9cf..7e27131f 100644 --- a/image_classification/XCiT/main_multi_gpu.py +++ 
b/image_classification/XCiT/main_multi_gpu.py @@ -48,6 +48,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None) diff --git a/image_classification/XCiT/main_single_gpu.py b/image_classification/XCiT/main_single_gpu.py index 00aa8821..625ed202 100644 --- a/image_classification/XCiT/main_single_gpu.py +++ b/image_classification/XCiT/main_single_gpu.py @@ -47,6 +47,7 @@ def get_arguments(): parser.add_argument('-data_path', type=str, default=None) parser.add_argument('-output', type=str, default=None) parser.add_argument('-ngpus', type=int, default=None) + parser.add_argument('-num_classes', type=int, default=None) parser.add_argument('-pretrained', type=str, default=None) parser.add_argument('-resume', type=str, default=None) parser.add_argument('-last_epoch', type=int, default=None)
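The change repeated across nearly every model above is the same two-part wiring: a `-num_classes` flag in get_arguments() and an `if args.num_classes:` guard in update_config() that copies the value into config.MODEL.NUM_CLASSES. A minimal sketch of that pattern, assuming the yacs-style CfgNode (the `CN()` these config.py files are built on); the tiny config below is illustrative, not the repo's full default config:

# Minimal sketch of the num_classes wiring added across the models in this patch.
# The argparse flag and the update_config guard mirror the hunks above.
import argparse
from yacs.config import CfgNode as CN

config = CN()
config.MODEL = CN()
config.MODEL.NUM_CLASSES = 1000  # ImageNet default

def get_arguments():
    parser = argparse.ArgumentParser('example')
    parser.add_argument('-num_classes', type=int, default=None)
    return parser.parse_args()

def update_config(config, args):
    # Only override the default when the flag is actually given,
    # matching the `if args.num_classes:` guard in the diffs above.
    if args.num_classes:
        config.MODEL.NUM_CLASSES = args.num_classes
    return config

if __name__ == '__main__':
    args = get_arguments()
    config = update_config(config, args)
    print(config.MODEL.NUM_CLASSES)  # e.g. `-num_classes 10` prints 10

Because the guard tests truthiness rather than `is not None`, `-num_classes 0` would be silently ignored; any positive class count behaves as expected.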
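For CaiT specifically, build_cait now forwards num_classes, in_channels, dropout, attention_dropout, droppath and init_values from the config into the Cait constructor. A hedged usage sketch, assuming the usual get_config() helper in image_classification/CaiT/config.py and that it is run from inside that folder:

# Hedged usage sketch: get_config() is assumed to be the standard helper in
# image_classification/CaiT/config.py; run from inside the CaiT folder.
from config import get_config
from cait import build_cait

config = get_config()
config.defrost()                      # the CN() configs are yacs CfgNodes
config.MODEL.NUM_CLASSES = 100        # what `-num_classes 100` would set
model = build_cait(config)            # now also receives dropout/droppath/init_values
print(type(model).__name__)           # Cait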
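CSwin and XCiT also drop TRAIN.ACCUM_ITER from 2 to 1, i.e. the optimizer steps on every batch instead of accumulating gradients over two. The loops that actually consume this value live in main_single_gpu.py / main_multi_gpu.py; the sketch below shows the usual accumulation pattern under that assumption, with a stand-in model rather than the repo's trainer:

# Hedged sketch of how a TRAIN.ACCUM_ITER value is typically consumed;
# the repo's real train loops may differ in detail.
import paddle

model = paddle.nn.Linear(8, 4)                      # stand-in model
criterion = paddle.nn.CrossEntropyLoss()
optimizer = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())
accum_iter = 1                                      # config.TRAIN.ACCUM_ITER

for batch_id in range(8):
    images = paddle.randn([2, 8])
    labels = paddle.randint(0, 4, [2])
    loss = criterion(model(images), labels)
    # Scale so the accumulated gradient matches one batch accum_iter times larger.
    loss = loss / accum_iter
    loss.backward()
    if (batch_id + 1) % accum_iter == 0:
        optimizer.step()
        optimizer.clear_grad()

With accum_iter = 1 this reduces to a plain step-per-batch loop, which is exactly what the new default expresses.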
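Finally, the ViT config keeps config.freeze() commented out at the end of update_config, and the yacs CfgNode behaviour explains why that matters: a frozen node rejects any later assignment, while an unfrozen one lets the training script keep adjusting values at runtime (for example the SAVE_FREQ / REPORT_FREQ / VALIDATE_FREQ defaults lowered above). A minimal sketch, assuming yacs is the CN() in use:

# Minimal sketch of yacs freeze/defrost semantics, assuming the CN() in these
# config.py files is yacs.config.CfgNode.
from yacs.config import CfgNode as CN

cfg = CN()
cfg.SAVE_FREQ = 1

cfg.freeze()
try:
    cfg.SAVE_FREQ = 10          # mutating a frozen node raises
except AttributeError as err:
    print('frozen:', err)

cfg.defrost()                   # leaving freeze() out has the same practical effect:
cfg.SAVE_FREQ = 10              # later code can still adjust values at runtime
print(cfg.SAVE_FREQ)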