From 3be34bad1134fe24ae897298793dfea1ce7b3c76 Mon Sep 17 00:00:00 2001 From: LutingWang <2457348692@qq.com> Date: Thu, 14 Mar 2024 12:20:11 +0800 Subject: [PATCH] feat: define holder mixin --- todd/runners/callbacks/checkpoint.py | 18 +- todd/runners/callbacks/composed.py | 2 +- todd/runners/callbacks/git.py | 6 +- todd/runners/callbacks/interval.py | 2 +- todd/runners/callbacks/log.py | 16 +- todd/runners/callbacks/lr.py | 8 +- todd/runners/callbacks/monitor.py | 4 +- todd/runners/callbacks/tensorboard.py | 2 +- todd/runners/epoch_based_trainer.py | 2 +- todd/runners/iter_based_trainer.py | 2 +- todd/runners/strategies/base.py | 10 +- todd/runners/strategies/ddp.py | 2 +- todd/runners/strategies/fsdp.py | 8 +- todd/runners/utils.py | 47 +- todd/utils/__init__.py | 1 + todd/utils/constants.py | 11 + todd/utils/mixins.py | 16 +- tutorials/runners.ipynb | 1244 +++++++++++++++---------- 18 files changed, 833 insertions(+), 568 deletions(-) create mode 100644 todd/utils/constants.py diff --git a/todd/runners/callbacks/checkpoint.py b/todd/runners/callbacks/checkpoint.py index 09548a21..caba779a 100644 --- a/todd/runners/callbacks/checkpoint.py +++ b/todd/runners/callbacks/checkpoint.py @@ -36,27 +36,27 @@ def __init__( def init(self, *args, **kwargs) -> None: super().init(*args, **kwargs) - self._checkpoint_dir = self._runner.work_dir / 'checkpoints' + self._checkpoint_dir = self.runner.work_dir / 'checkpoints' self._latest_checkpoint_dir = self._checkpoint_dir / 'latest' self._checkpoint_dir.mkdir(parents=True, exist_ok=True) - if self._runner._auto_resume and self._latest_checkpoint_dir.exists(): + if self.runner._auto_resume and self._latest_checkpoint_dir.exists(): load_from = self._latest_checkpoint_dir - elif self._runner.load_from is not None: - load_from = pathlib.Path(self._runner.load_from) + elif self.runner.load_from is not None: + load_from = pathlib.Path(self.runner.load_from) assert load_from.exists() else: load_from = None if load_from is not None: if get_rank() == 0: - self._runner.logger.info("Loading from %s", load_from) + self.runner.logger.info("Loading from %s", load_from) state_dict = { f.stem: torch.load(f, 'cpu') for f in load_from.glob('*.pth') } - self._runner.load_state_dict(state_dict, **self._load_state_dict) + self.runner.load_state_dict(state_dict, **self._load_state_dict) @property def checkpoint_dir(self) -> pathlib.Path: @@ -71,13 +71,13 @@ def _work_dir(self, name: str) -> pathlib.Path: def _save(self, name: str) -> None: # for FSDP, all ranks should call state dict - state_dict = self._runner.state_dict(**self._state_dict) + state_dict = self.runner.state_dict(**self._state_dict) if get_rank() != 0: return work_dir = self._work_dir(name) work_dir.mkdir(parents=True, exist_ok=True) - self._runner.logger.info("Saving state dict to %s", work_dir) + self.runner.logger.info("Saving state dict to %s", work_dir) for k, v in state_dict.items(): torch.save(v, work_dir / f'{k}.pth') @@ -88,7 +88,7 @@ def _save(self, name: str) -> None: def after_run_iter(self, batch, memo: Memo) -> None: super().after_run_iter(batch, memo) if self._should_run_iter(): - self._save(f'iter_{self._runner.iter_}') + self._save(f'iter_{self.runner.iter_}') def after_run_epoch(self, epoch_memo: Memo, memo: Memo) -> None: super().after_run_epoch(epoch_memo, memo) diff --git a/todd/runners/callbacks/composed.py b/todd/runners/callbacks/composed.py index a1c1638d..83bec29b 100644 --- a/todd/runners/callbacks/composed.py +++ b/todd/runners/callbacks/composed.py @@ -21,7 +21,7 @@ def __init__(self, *args, callbacks: Iterable[Config], **kwargs) -> None: super().__init__(*args, **kwargs) priorities = [c.pop('priority', dict()) for c in callbacks] queue = [ - CallbackRegistry.build(c, runner=self._runner) for c in callbacks + CallbackRegistry.build(c, runner=self.runner) for c in callbacks ] self._priority_queue: PriorityQueue[KT, BaseCallback] = \ PriorityQueue(priorities, queue) diff --git a/todd/runners/callbacks/git.py b/todd/runners/callbacks/git.py index ce642acd..ab90b0fe 100644 --- a/todd/runners/callbacks/git.py +++ b/todd/runners/callbacks/git.py @@ -33,10 +33,10 @@ def init(self, *args, **kwargs) -> None: diff = subprocess_run(args_) except subprocess.CalledProcessError as e: diff = str(e) - self._runner.logger.error(e) + self.runner.logger.error(e) else: file = ( - self._runner.work_dir / f'git_diff_{get_timestamp()}.log' + self.runner.work_dir / f'git_diff_{get_timestamp()}.log' ) - self._runner.logger.info('Saving git diff to %s', file) + self.runner.logger.info('Saving git diff to %s', file) file.write_text(diff) diff --git a/todd/runners/callbacks/interval.py b/todd/runners/callbacks/interval.py index 8eb9625d..489dd21f 100644 --- a/todd/runners/callbacks/interval.py +++ b/todd/runners/callbacks/interval.py @@ -22,7 +22,7 @@ def __should_run(self, step: int) -> bool: return self._interval > 0 and step % self._interval == 0 def _should_run_iter(self) -> bool: - return not self._by_epoch and self.__should_run(self._runner.iter_) + return not self._by_epoch and self.__should_run(self.runner.iter_) def _should_run_epoch(self) -> bool: return ( diff --git a/todd/runners/callbacks/log.py b/todd/runners/callbacks/log.py index 8f884f54..263e6b45 100644 --- a/todd/runners/callbacks/log.py +++ b/todd/runners/callbacks/log.py @@ -43,24 +43,24 @@ def init(self, *args, **kwargs) -> None: if get_rank() > 0: return if self._with_file_handler: - file = self._runner.work_dir / f'{get_timestamp()}.log' + file = self.runner.work_dir / f'{get_timestamp()}.log' handler = logging.FileHandler(file) handler.setFormatter(Formatter()) - self._runner.logger.addHandler(handler) + self.runner.logger.addHandler(handler) if self._collect_env is not None: from ...base import ( # noqa: E501 pylint: disable=import-outside-toplevel collect_env, ) env = collect_env(**self._collect_env) - self._runner.logger.info(env) + self.runner.logger.info(env) def before_run(self, memo: Memo) -> None: super().before_run(memo) self._eta: BaseETA | None = ( None if self._eta_config is None else ETARegistry.build( self._eta_config, - start=self._runner.iter_ - 1, - end=self._runner.iters, + start=self.runner.iter_ - 1, + end=self.runner.iters, ) ) @@ -73,10 +73,10 @@ def after_run_iter(self, batch, memo: Memo) -> None: super().after_run_iter(batch, memo) if 'log' not in memo: return - prefix = f"Iter [{self._runner.iter_}/{self._runner.iters}] " + prefix = f"Iter [{self.runner.iter_}/{self.runner.iters}] " if self._eta is not None: - eta = self._eta(self._runner.iter_) + eta = self._eta(self.runner.iter_) eta = round(eta) prefix += f"ETA {str(datetime.timedelta(seconds=eta))} " @@ -90,7 +90,7 @@ def after_run_iter(self, batch, memo: Memo) -> None: log: dict[str, Any] = memo.pop('log') message = ' '.join(f'{k}={v}' for k, v in log.items() if v is not None) - self._runner.logger.info(prefix + message) + self.runner.logger.info(prefix + message) def before_run_epoch(self, epoch_memo: Memo, memo: Memo) -> None: super().before_run_epoch(epoch_memo, memo) diff --git a/todd/runners/callbacks/lr.py b/todd/runners/callbacks/lr.py index 977558b1..6a597c9a 100644 --- a/todd/runners/callbacks/lr.py +++ b/todd/runners/callbacks/lr.py @@ -26,7 +26,7 @@ def __init__( **kwargs, ) -> None: super().__init__(*args, interval=interval, **kwargs) - assert isinstance(self._runner, Trainer) + assert isinstance(self.runner, Trainer) self._lr_scheduler_config = lr_scheduler def init(self, *args, **kwargs) -> None: @@ -34,7 +34,7 @@ def init(self, *args, **kwargs) -> None: self._build_lr_scheduler() def _build_lr_scheduler(self) -> None: - runner = cast(Trainer, self._runner) + runner = cast(Trainer, self.runner) self._lr_scheduler: torch.optim.lr_scheduler.LRScheduler = \ LRSchedulerRegistry.build( self._lr_scheduler_config, @@ -75,11 +75,11 @@ class LRScaleCallback(BaseCallback): def __init__(self, *args, lr_scaler: Config, **kwargs) -> None: super().__init__(*args, **kwargs) - assert isinstance(self._runner, Trainer) + assert isinstance(self.runner, Trainer) self._lr_scaler_config = lr_scaler def _scale_lr(self, config: Config) -> None: - runner = cast(Trainer, self._runner) + runner = cast(Trainer, self.runner) assert runner.dataloader.batch_size is not None base_batch_size = config.base_batch_size batch_size = get_world_size() * runner.dataloader.batch_size diff --git a/todd/runners/callbacks/monitor.py b/todd/runners/callbacks/monitor.py index 98aabf2b..08954774 100644 --- a/todd/runners/callbacks/monitor.py +++ b/todd/runners/callbacks/monitor.py @@ -37,8 +37,8 @@ def run_iter_context( ) -> None: super().run_iter_context(exit_stack, batch, memo) context = Context( - self._runner.logger, - iter_=self._runner.iter_, + self.runner.logger, + iter_=self.runner.iter_, batch=batch, memo=memo, ) diff --git a/todd/runners/callbacks/tensorboard.py b/todd/runners/callbacks/tensorboard.py index 65eb9bc5..804f907e 100644 --- a/todd/runners/callbacks/tensorboard.py +++ b/todd/runners/callbacks/tensorboard.py @@ -31,7 +31,7 @@ def init(self, *args, **kwargs) -> None: super().init(*args, **kwargs) if get_rank() > 0: return - log_dir = self._runner.work_dir / 'tensorboard' + log_dir = self.runner.work_dir / 'tensorboard' self._summary_writer = SummaryWriter( log_dir, **self._summary_writer_config, diff --git a/todd/runners/epoch_based_trainer.py b/todd/runners/epoch_based_trainer.py index bc463465..8358d003 100644 --- a/todd/runners/epoch_based_trainer.py +++ b/todd/runners/epoch_based_trainer.py @@ -18,7 +18,7 @@ @RunnerRegistry.register_() -class EpochBasedTrainer(Trainer): +class EpochBasedTrainer(Trainer[T]): def __init__(self, *args, epochs: int, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/todd/runners/iter_based_trainer.py b/todd/runners/iter_based_trainer.py index 107c894c..7a212fa6 100644 --- a/todd/runners/iter_based_trainer.py +++ b/todd/runners/iter_based_trainer.py @@ -16,7 +16,7 @@ @RunnerRegistry.register_() -class IterBasedTrainer(Trainer): +class IterBasedTrainer(Trainer[T]): def __init__(self, *args, iters: int, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/todd/runners/strategies/base.py b/todd/runners/strategies/base.py index a97d30fb..d6c2a748 100644 --- a/todd/runners/strategies/base.py +++ b/todd/runners/strategies/base.py @@ -2,7 +2,7 @@ 'BaseStrategy', ] -from typing import Any, Generic, Mapping, TypeVar, cast +from typing import Any, Mapping, TypeVar, cast import torch from torch import nn @@ -15,7 +15,7 @@ @StrategyRegistry.register_() -class BaseStrategy(RunnerHolderMixin, StateDictMixin, Generic[T]): +class BaseStrategy(RunnerHolderMixin[T], StateDictMixin): def __init__( self, @@ -45,7 +45,7 @@ def build_optimizer(self, config: Config) -> torch.optim.Optimizer: @property def module(self) -> nn.Module: - return self._runner.model + return self.runner.model def model_state_dict(self, *args, **kwargs) -> dict[str, Any]: return self.module.state_dict(*args, **kwargs) @@ -62,7 +62,7 @@ def load_model_state_dict( **kwargs, ) if get_rank() == 0: - self._runner.logger.info(incompatible_keys) + self.runner.logger.info(incompatible_keys) def load_model_from( self, @@ -77,7 +77,7 @@ def load_model_from( model_state_dict = dict() for f_ in f_list: if get_rank() == 0: - self._runner.logger.info("Loading model from %s", f_) + self.runner.logger.info("Loading model from %s", f_) model_state_dict.update(torch.load(f_, 'cpu')) self.load_model_state_dict(model_state_dict, *args, **kwargs) diff --git a/todd/runners/strategies/ddp.py b/todd/runners/strategies/ddp.py index 3367c2ab..e2e9fd67 100644 --- a/todd/runners/strategies/ddp.py +++ b/todd/runners/strategies/ddp.py @@ -23,4 +23,4 @@ def wrap_model(self, model: nn.Module, config: Config) -> T: @property def module(self) -> nn.Module: - return self._runner.model.module + return self.runner.model.module diff --git a/todd/runners/strategies/fsdp.py b/todd/runners/strategies/fsdp.py index fa71105a..42eb809d 100644 --- a/todd/runners/strategies/fsdp.py +++ b/todd/runners/strategies/fsdp.py @@ -26,13 +26,13 @@ def wrap_model(self, model: nn.Module, config: Config) -> T: @property def module(self) -> nn.Module: - return self._runner.model.module + return self.runner.model.module def build_optimizer(self, config: Config) -> torch.optim.Optimizer: - return OptimizerRegistry.build(config, model=self._runner.model) + return OptimizerRegistry.build(config, model=self.runner.model) def model_state_dict(self, *args, **kwargs) -> dict[str, Any]: - return self._runner.model.state_dict(*args, **kwargs) + return self.runner.model.state_dict(*args, **kwargs) def load_model_state_dict( self, @@ -40,7 +40,7 @@ def load_model_state_dict( *args, **kwargs, ) -> None: - self._runner.model.load_state_dict(state_dict, *args, **kwargs) + self.runner.model.load_state_dict(state_dict, *args, **kwargs) def optim_state_dict( self, diff --git a/todd/runners/utils.py b/todd/runners/utils.py index 0f257830..9c726054 100644 --- a/todd/runners/utils.py +++ b/todd/runners/utils.py @@ -2,42 +2,45 @@ 'RunnerHolderMixin', ] -import weakref -from typing import cast +from typing import TypeVar +from torch import nn + +from ..utils import HolderMixin from .base import BaseRunner from .epoch_based_trainer import EpochBasedTrainer from .iter_based_trainer import IterBasedTrainer from .trainer import Trainer from .validator import Validator +T = TypeVar('T', bound=nn.Module) + -class RunnerHolderMixin: +class RunnerHolderMixin(HolderMixin[BaseRunner[T]]): - def __init__(self, *args, runner: BaseRunner, **kwargs) -> None: - super().__init__(*args, **kwargs) - runner_proxy = ( - runner if isinstance(runner, weakref.ProxyTypes) else - weakref.proxy(runner) - ) - self._runner = cast(BaseRunner, runner_proxy) + def __init__(self, *args, runner: BaseRunner[T], **kwargs) -> None: + super().__init__(*args, instance=runner, **kwargs) + + @property + def runner(self) -> BaseRunner[T]: + return self._instance @property - def trainer(self) -> Trainer: - assert isinstance(self._runner, Trainer) - return self._runner + def trainer(self) -> Trainer[T]: + assert isinstance(self._instance, Trainer) + return self._instance @property - def validator(self) -> Validator: - assert isinstance(self._runner, Validator) - return self._runner + def validator(self) -> Validator[T]: + assert isinstance(self._instance, Validator) + return self._instance @property - def iter_based_trainer(self) -> IterBasedTrainer: - assert isinstance(self._runner, IterBasedTrainer) - return self._runner + def iter_based_trainer(self) -> IterBasedTrainer[T]: + assert isinstance(self._instance, IterBasedTrainer) + return self._instance @property - def epoch_based_trainer(self) -> EpochBasedTrainer: - assert isinstance(self._runner, EpochBasedTrainer) - return self._runner + def epoch_based_trainer(self) -> EpochBasedTrainer[T]: + assert isinstance(self._instance, EpochBasedTrainer) + return self._instance diff --git a/todd/utils/__init__.py b/todd/utils/__init__.py index 85d1d65f..199b7dfb 100644 --- a/todd/utils/__init__.py +++ b/todd/utils/__init__.py @@ -1,3 +1,4 @@ +from .constants import * from .enums import * from .generic_tensors import * from .metas import * diff --git a/todd/utils/constants.py b/todd/utils/constants.py new file mode 100644 index 00000000..e225d6e2 --- /dev/null +++ b/todd/utils/constants.py @@ -0,0 +1,11 @@ +__all__ = [ + 'IMAGENET_MEAN', + 'IMAGENET_STD', + 'IMAGENET_MEAN_255', + 'IMAGENET_STD_255', +] + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) +IMAGENET_MEAN_255 = tuple(x * 255 for x in IMAGENET_MEAN) +IMAGENET_STD_255 = tuple(x * 255 for x in IMAGENET_STD) diff --git a/todd/utils/mixins.py b/todd/utils/mixins.py index 9574ea16..3e9a3ccb 100644 --- a/todd/utils/mixins.py +++ b/todd/utils/mixins.py @@ -1,8 +1,12 @@ __all__ = [ 'StateDictMixin', + 'HolderMixin', ] -from typing import Any, Mapping +import weakref +from typing import Any, Generic, Mapping, TypeVar, cast + +T = TypeVar('T') class StateDictMixin: @@ -19,4 +23,12 @@ def load_state_dict( pass -# TODO: define holder mixin +class HolderMixin(Generic[T]): + + def __init__(self, *args, instance: T, **kwargs) -> None: + super().__init__(*args, **kwargs) + instance_proxy = ( + instance if isinstance(instance, weakref.ProxyTypes) else + weakref.proxy(instance) + ) + self._instance = cast(T, instance_proxy) diff --git a/tutorials/runners.ipynb b/tutorials/runners.ipynb index 4975962e..c64e02d6 100644 --- a/tutorials/runners.ipynb +++ b/tutorials/runners.ipynb @@ -42,7 +42,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:26:49,338 35174:140704541179520][patches.py:9 todd ] INFO: `ipdb` is installed. Using it for debugging.\n", + "[2024-03-14 12:17:47,682 62058:140704275689088][patches.py:9 todd ] INFO: `ipdb` is installed. Using it for debugging.\n", "/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/mmcv/__init__.py:20: UserWarning: On January 1, 2023, MMCV will release v2.0.0, in which it will remove components related to the training process and add a data transformation module. In addition, it will rename the package names mmcv to mmcv-lite and mmcv-full to mmcv. See https://github.com/open-mmlab/mmcv/blob/master/docs/en/compatibility.md for more details.\n", " warnings.warn(\n" ] @@ -184,7 +184,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:26:50,988 35174:140704541179520][base.py:57 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n" + "\u001b[2m[2024-03-14 12:17:49,375 62058:140704275689088][base.py:56 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n" ] }, { @@ -192,7 +192,7 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpl9zy4w3o\u001b[0m\n", + "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpjms97dk6\u001b[0m\n", "└── \u001b[1;36mvalidator\u001b[0m\n", "\n", "2 directories, 0 files\n" @@ -226,11 +226,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:26:51,346 35174:140704541179520][base.py:57 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:26:51,388 35174:140704541179520][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:26:51,392 35174:140704541179520][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", - "[2024-02-23 18:26:51,395 35174:140704541179520][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", - "[2024-02-23 18:26:51,398 35174:140704541179520][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" + "\u001b[2m[2024-03-14 12:17:49,723 62058:140704275689088][base.py:56 todd.Validator.validator __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:17:49,728 62058:140704275689088][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", + "[2024-03-14 12:17:49,731 62058:140704275689088][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", + "[2024-03-14 12:17:49,734 62058:140704275689088][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", + "[2024-03-14 12:17:49,736 62058:140704275689088][log.py:93 todd.Validator.validator after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" ] }, { @@ -238,7 +238,7 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpy_xrnx2v\u001b[0m\n", + "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpv4q6cgrv\u001b[0m\n", "└── \u001b[1;36mvalidator\u001b[0m\n", "\n", "2 directories, 0 files\n" @@ -288,15 +288,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:26:51,739 35174:140704541179520][base.py:57 todd.IterBasedTrainer.iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:26:51,743 35174:140704541179520][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [1/8] batch={'x': tensor([1, 3]), 'y': tensor([2, 6])} weight=0.000 loss=4.000\n", - "[2024-02-23 18:26:51,744 35174:140704541179520][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [2/8] batch={'x': tensor([5, 8]), 'y': tensor([10, 16])} weight=0.000 loss=13.000\n", - "[2024-02-23 18:26:51,745 35174:140704541179520][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [3/8] batch={'x': tensor([4, 2]), 'y': tensor([8, 4])} weight=0.000 loss=6.000\n", - "[2024-02-23 18:26:51,748 35174:140704541179520][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [4/8] batch={'x': tensor([ 6, 10]), 'y': tensor([12, 20])} weight=0.000 loss=16.000\n", - "[2024-02-23 18:26:51,749 35174:140704541179520][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [5/8] batch={'x': tensor([7, 9]), 'y': tensor([14, 18])} weight=0.000 loss=16.000\n", - "[2024-02-23 18:26:51,751 35174:140704541179520][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [6/8] batch={'x': tensor([5, 7]), 'y': tensor([10, 14])} weight=0.000 loss=12.000\n", - "[2024-02-23 18:26:51,753 35174:140704541179520][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [7/8] batch={'x': tensor([3, 4]), 'y': tensor([6, 8])} weight=0.000 loss=7.000\n", - "[2024-02-23 18:26:51,755 35174:140704541179520][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [8/8] batch={'x': tensor([2, 6]), 'y': tensor([ 4, 12])} weight=0.000 loss=8.000\n" + "\u001b[2m[2024-03-14 12:17:50,032 62058:140704275689088][base.py:56 todd.IterBasedTrainer.iter_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:17:50,035 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [1/8] batch={'x': tensor([ 7, 10]), 'y': tensor([14, 20])} weight=0.000 loss=17.000\n", + "[2024-03-14 12:17:50,037 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [2/8] batch={'x': tensor([4, 6]), 'y': tensor([ 8, 12])} weight=0.000 loss=10.000\n", + "[2024-03-14 12:17:50,038 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [3/8] batch={'x': tensor([3, 8]), 'y': tensor([ 6, 16])} weight=0.000 loss=11.000\n", + "[2024-03-14 12:17:50,039 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [4/8] batch={'x': tensor([9, 2]), 'y': tensor([18, 4])} weight=0.000 loss=11.000\n", + "[2024-03-14 12:17:50,041 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [5/8] batch={'x': tensor([5, 1]), 'y': tensor([10, 2])} weight=0.000 loss=6.000\n", + "[2024-03-14 12:17:50,043 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [6/8] batch={'x': tensor([8, 1]), 'y': tensor([16, 2])} weight=0.000 loss=9.000\n", + "[2024-03-14 12:17:50,044 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [7/8] batch={'x': tensor([ 5, 10]), 'y': tensor([10, 20])} weight=0.000 loss=15.000\n", + "[2024-03-14 12:17:50,046 62058:140704275689088][log.py:93 todd.IterBasedTrainer.iter_based_trainer after_run_iter] INFO: Iter [8/8] batch={'x': tensor([2, 4]), 'y': tensor([4, 8])} weight=0.000 loss=6.000\n" ] } ], @@ -337,25 +337,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:26:51,767 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:26:51,768 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]\n", - "[2024-02-23 18:26:51,771 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [1/15] batch={'x': tensor([4, 9]), 'y': tensor([ 8, 18])} weight=0.000 loss=13.000\n", - "[2024-02-23 18:26:51,772 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [2/15] batch={'x': tensor([7, 3]), 'y': tensor([14, 6])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:26:51,774 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [3/15] batch={'x': tensor([10, 2]), 'y': tensor([20, 4])} weight=0.000 loss=12.000\n", - "[2024-02-23 18:26:51,775 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [4/15] batch={'x': tensor([8, 6]), 'y': tensor([16, 12])} weight=0.000 loss=14.000\n", - "[2024-02-23 18:26:51,777 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [5/15] batch={'x': tensor([1, 5]), 'y': tensor([ 2, 10])} weight=0.000 loss=6.000\n", - "[2024-02-23 18:26:51,777 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [2/3]\n", - "[2024-02-23 18:26:51,779 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [6/15] batch={'x': tensor([5, 7]), 'y': tensor([10, 14])} weight=0.000 loss=12.000\n", - "[2024-02-23 18:26:51,780 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [7/15] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.000 loss=5.000\n", - "[2024-02-23 18:26:51,782 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [8/15] batch={'x': tensor([4, 8]), 'y': tensor([ 8, 16])} weight=0.000 loss=12.000\n", - "[2024-02-23 18:26:51,783 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [9/15] batch={'x': tensor([9, 1]), 'y': tensor([18, 2])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:26:51,785 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [10/15] batch={'x': tensor([ 6, 10]), 'y': tensor([12, 20])} weight=0.000 loss=16.000\n", - "[2024-02-23 18:26:51,786 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [3/3]\n", - "[2024-02-23 18:26:51,788 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [11/15] batch={'x': tensor([10, 9]), 'y': tensor([20, 18])} weight=0.000 loss=19.000\n", - "[2024-02-23 18:26:51,790 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [12/15] batch={'x': tensor([7, 5]), 'y': tensor([14, 10])} weight=0.000 loss=12.000\n", - "[2024-02-23 18:26:51,793 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [13/15] batch={'x': tensor([8, 4]), 'y': tensor([16, 8])} weight=0.000 loss=12.000\n", - "[2024-02-23 18:26:51,794 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [14/15] batch={'x': tensor([3, 6]), 'y': tensor([ 6, 12])} weight=0.000 loss=9.000\n", - "[2024-02-23 18:26:51,796 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [15/15] batch={'x': tensor([1, 2]), 'y': tensor([2, 4])} weight=0.000 loss=3.000\n" + "\u001b[2m[2024-03-14 12:17:50,057 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.epoch_based_trainer __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:17:50,058 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [1/3]\n", + "[2024-03-14 12:17:50,061 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [1/15] batch={'x': tensor([4, 1]), 'y': tensor([8, 2])} weight=0.000 loss=5.000\n", + "[2024-03-14 12:17:50,064 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [2/15] batch={'x': tensor([8, 6]), 'y': tensor([16, 12])} weight=0.000 loss=14.000\n", + "[2024-03-14 12:17:50,066 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [3/15] batch={'x': tensor([5, 9]), 'y': tensor([10, 18])} weight=0.000 loss=14.000\n", + "[2024-03-14 12:17:50,068 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [4/15] batch={'x': tensor([ 7, 10]), 'y': tensor([14, 20])} weight=0.000 loss=17.000\n", + "[2024-03-14 12:17:50,071 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [5/15] batch={'x': tensor([2, 3]), 'y': tensor([4, 6])} weight=0.000 loss=5.000\n", + "[2024-03-14 12:17:50,073 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [2/3]\n", + "[2024-03-14 12:17:50,075 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [6/15] batch={'x': tensor([7, 8]), 'y': tensor([14, 16])} weight=0.000 loss=15.000\n", + "[2024-03-14 12:17:50,077 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [7/15] batch={'x': tensor([1, 6]), 'y': tensor([ 2, 12])} weight=0.000 loss=7.000\n", + "[2024-03-14 12:17:50,079 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [8/15] batch={'x': tensor([2, 4]), 'y': tensor([4, 8])} weight=0.000 loss=6.000\n", + "[2024-03-14 12:17:50,081 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [9/15] batch={'x': tensor([10, 9]), 'y': tensor([20, 18])} weight=0.000 loss=19.000\n", + "[2024-03-14 12:17:50,082 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [10/15] batch={'x': tensor([3, 5]), 'y': tensor([ 6, 10])} weight=0.000 loss=8.000\n", + "[2024-03-14 12:17:50,084 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.epoch_based_trainer before_run_epoch] INFO: Epoch [3/3]\n", + "[2024-03-14 12:17:50,086 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [11/15] batch={'x': tensor([3, 6]), 'y': tensor([ 6, 12])} weight=0.000 loss=9.000\n", + "[2024-03-14 12:17:50,088 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [12/15] batch={'x': tensor([2, 4]), 'y': tensor([4, 8])} weight=0.000 loss=6.000\n", + "[2024-03-14 12:17:50,090 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [13/15] batch={'x': tensor([5, 8]), 'y': tensor([10, 16])} weight=0.000 loss=13.000\n", + "[2024-03-14 12:17:50,091 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [14/15] batch={'x': tensor([10, 7]), 'y': tensor([20, 14])} weight=0.000 loss=17.000\n", + "[2024-03-14 12:17:50,093 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.epoch_based_trainer after_run_iter] INFO: Iter [15/15] batch={'x': tensor([9, 1]), 'y': tensor([18, 2])} weight=0.000 loss=10.000\n" ] } ], @@ -402,7 +402,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:26:51,926 35174:140704541179520][log.py:55 todd.Validator.log_callback init] INFO: \n", + "[2024-03-14 12:17:50,216 62058:140704275689088][log.py:55 todd.Validator.log_callback init] INFO: \n", "platform: macOS-14.0\n", "nvidia_smi: None\n", "python_version: 3.11.7 (main, Dec 4 2023, 18:10:11) [Clang 15.0.0 (clang-1500.1.0.2.5)]\n", @@ -411,20 +411,30 @@ "opencv_version: 4.7.0\n", "todd_version: 0.4.0\n", "cuda_home: None\n", - "git_commit_id: 8dccf62\n", + "git_commit_id: 0a7955a\n", "git_status: \n", - "M pyproject.toml\n", - " M todd/runners/base.py\n", + "M todd/runners/callbacks/checkpoint.py\n", + " M todd/runners/callbacks/composed.py\n", + " M todd/runners/callbacks/git.py\n", + " M todd/runners/callbacks/interval.py\n", + " M todd/runners/callbacks/log.py\n", + " M todd/runners/callbacks/lr.py\n", + " M todd/runners/callbacks/monitor.py\n", + " M todd/runners/callbacks/tensorboard.py\n", " M todd/runners/epoch_based_trainer.py\n", " M todd/runners/iter_based_trainer.py\n", - " M todd/runners/trainer.py\n", - " M todd/runners/validator.py\n", - " M todd/utils/torch.py\n", - "\u001b[2m[2024-02-23 18:26:51,927 35174:140704541179520][base.py:57 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:26:51,931 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:26:51,934 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", - "[2024-02-23 18:26:51,937 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", - "[2024-02-23 18:26:51,939 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" + " M todd/runners/strategies/base.py\n", + " M todd/runners/strategies/ddp.py\n", + " M todd/runners/strategies/fsdp.py\n", + " M todd/runners/utils.py\n", + " M todd/utils/__init__.py\n", + " M todd/utils/mixins.py\n", + "?? todd/utils/constants.py\n", + "\u001b[2m[2024-03-14 12:17:50,217 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:17:50,221 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", + "[2024-03-14 12:17:50,240 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", + "[2024-03-14 12:17:50,247 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", + "[2024-03-14 12:17:50,262 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" ] } ], @@ -461,11 +471,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:26:51,979 35174:140704541179520][base.py:57 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:26:51,987 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:26:51,991 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", - "[2024-02-23 18:26:52,005 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", - "[2024-02-23 18:26:52,032 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" + "\u001b[2m[2024-03-14 12:17:50,291 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:17:50,294 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", + "[2024-03-14 12:17:50,296 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", + "[2024-03-14 12:17:50,299 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", + "[2024-03-14 12:17:50,301 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" ] }, { @@ -473,17 +483,17 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpe_f9kq5u\u001b[0m\n", + "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpypx13e0i\u001b[0m\n", "└── \u001b[1;36mlog_callback\u001b[0m\n", - " └── 2024-02-23T18-26-51_978897-08-00.log\n", + " └── 2024-03-14T12-17-50_290843-08-00.log\n", "\n", "2 directories, 1 file\n", "\n", - "[2024-02-23 18:26:51,979 35174:140704541179520][base.py:57 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\n", - "[2024-02-23 18:26:51,987 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:26:51,991 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", - "[2024-02-23 18:26:52,005 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", - "[2024-02-23 18:26:52,032 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" + "[2024-03-14 12:17:50,291 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\n", + "[2024-03-14 12:17:50,294 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", + "[2024-03-14 12:17:50,296 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", + "[2024-03-14 12:17:50,299 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", + "[2024-03-14 12:17:50,301 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" ] } ], @@ -523,11 +533,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:26:52,618 35174:140704541179520][base.py:57 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:26:53,137 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] ETA 0:00:01 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:26:53,652 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] ETA 0:00:01 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", - "[2024-02-23 18:26:54,162 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] ETA 0:00:00 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", - "[2024-02-23 18:26:54,678 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" + "\u001b[2m[2024-03-14 12:17:50,870 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:17:51,391 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] ETA 0:00:01 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", + "[2024-03-14 12:17:51,912 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] ETA 0:00:01 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", + "[2024-03-14 12:17:52,431 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] ETA 0:00:00 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", + "[2024-03-14 12:17:52,945 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" ] } ], @@ -567,11 +577,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:26:54,690 35174:140704541179520][base.py:57 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:26:56,210 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] ETA 0:00:04 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:27:00,228 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] ETA 0:00:05 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", - "[2024-02-23 18:27:05,240 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] ETA 0:00:03 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", - "[2024-02-23 18:27:10,261 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" + "\u001b[2m[2024-03-14 12:17:52,955 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:17:54,468 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] ETA 0:00:04 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", + "[2024-03-14 12:17:58,481 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] ETA 0:00:05 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", + "[2024-03-14 12:18:03,499 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] ETA 0:00:03 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", + "[2024-03-14 12:18:08,518 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" ] } ], @@ -611,7 +621,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:27:10,384 35174:140704541179520][log.py:55 todd.Validator.log_callback init] INFO: \n", + "[2024-03-14 12:18:08,647 62058:140704275689088][log.py:55 todd.Validator.log_callback init] INFO: \n", "platform: macOS-14.0\n", "nvidia_smi: None\n", "python_version: 3.11.7 (main, Dec 4 2023, 18:10:11) [Clang 15.0.0 (clang-1500.1.0.2.5)]\n", @@ -620,20 +630,30 @@ "opencv_version: 4.7.0\n", "todd_version: 0.4.0\n", "cuda_home: None\n", - "git_commit_id: 8dccf62\n", + "git_commit_id: 0a7955a\n", "git_status: \n", - "M pyproject.toml\n", - " M todd/runners/base.py\n", + "M todd/runners/callbacks/checkpoint.py\n", + " M todd/runners/callbacks/composed.py\n", + " M todd/runners/callbacks/git.py\n", + " M todd/runners/callbacks/interval.py\n", + " M todd/runners/callbacks/log.py\n", + " M todd/runners/callbacks/lr.py\n", + " M todd/runners/callbacks/monitor.py\n", + " M todd/runners/callbacks/tensorboard.py\n", " M todd/runners/epoch_based_trainer.py\n", " M todd/runners/iter_based_trainer.py\n", - " M todd/runners/trainer.py\n", - " M todd/runners/validator.py\n", - " M todd/utils/torch.py\n", - "\u001b[2m[2024-02-23 18:27:10,386 35174:140704541179520][base.py:57 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:10,390 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] ETA 0:00:00 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:27:10,393 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] ETA 0:00:00 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", - "[2024-02-23 18:27:10,395 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] ETA 0:00:00 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", - "[2024-02-23 18:27:10,398 35174:140704541179520][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" + " M todd/runners/strategies/base.py\n", + " M todd/runners/strategies/ddp.py\n", + " M todd/runners/strategies/fsdp.py\n", + " M todd/runners/utils.py\n", + " M todd/utils/__init__.py\n", + " M todd/utils/mixins.py\n", + "?? todd/utils/constants.py\n", + "\u001b[2m[2024-03-14 12:18:08,648 62058:140704275689088][base.py:56 todd.Validator.log_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:08,653 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [5/20] ETA 0:00:00 batch={'x': tensor([5]), 'y': tensor([10])} weight=0.000 loss=10.000\n", + "[2024-03-14 12:18:08,655 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [10/20] ETA 0:00:00 batch={'x': tensor([10]), 'y': tensor([20])} weight=0.000 loss=20.000\n", + "[2024-03-14 12:18:08,658 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [15/20] ETA 0:00:00 batch={'x': tensor([15]), 'y': tensor([30])} weight=0.000 loss=30.000\n", + "[2024-03-14 12:18:08,660 62058:140704275689088][log.py:93 todd.Validator.log_callback after_run_iter] INFO: Iter [20/20] ETA 0:00:00 batch={'x': tensor([20]), 'y': tensor([40])} weight=0.000 loss=40.000\n" ] } ], @@ -679,8 +699,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:27:10,464 35174:140704541179520][git.py:41 todd.Validator.git_callback init] INFO: Saving git diff to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpk3uvcez8/git_callback/git_diff_2024-02-23T18-27-10_464496-08-00.log\n", - "\u001b[2m[2024-02-23 18:27:10,467 35174:140704541179520][base.py:57 todd.Validator.git_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n" + "[2024-03-14 12:18:08,732 62058:140704275689088][git.py:41 todd.Validator.git_callback init] INFO: Saving git diff to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpotx2rbek/git_callback/git_diff_2024-03-14T12-18-08_732484-08-00.log\n", + "\u001b[2m[2024-03-14 12:18:08,735 62058:140704275689088][base.py:56 todd.Validator.git_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n" ] }, { @@ -688,249 +708,461 @@ "output_type": "stream", "text": [ "\n", - "diff --git a/pyproject.toml b/pyproject.toml\n", - "index aa38442..487cb54 100644\n", - "--- a/pyproject.toml\n", - "+++ b/pyproject.toml\n", - "@@ -133,6 +133,7 @@ module = [\n", - " 'ipdb.*',\n", - " 'mmcv.*',\n", - " 'pptx.*',\n", - "+ 'setuptools.*',\n", - " 'torchvision.*',\n", - " 'yapf.*',\n", - " ]\n", - "diff --git a/todd/runners/base.py b/todd/runners/base.py\n", - "index fda811f..e16bbfd 100644\n", - "--- a/todd/runners/base.py\n", - "+++ b/todd/runners/base.py\n", - "@@ -2,6 +2,7 @@ __all__ = [\n", - " 'BaseRunner',\n", - " ]\n", + "diff --git a/todd/runners/callbacks/checkpoint.py b/todd/runners/callbacks/checkpoint.py\n", + "index 09548a2..caba779 100644\n", + "--- a/todd/runners/callbacks/checkpoint.py\n", + "+++ b/todd/runners/callbacks/checkpoint.py\n", + "@@ -36,27 +36,27 @@ class CheckpointCallback(IntervalMixin, BaseCallback):\n", " \n", - "+from abc import abstractmethod\n", - " import contextlib\n", - " import getpass\n", - " import logging\n", - "@@ -105,8 +106,9 @@ class BaseRunner(StateDictMixin, Generic[T]):\n", - " return self._logger\n", + " def init(self, *args, **kwargs) -> None:\n", + " super().init(*args, **kwargs)\n", + "- self._checkpoint_dir = self._runner.work_dir / 'checkpoints'\n", + "+ self._checkpoint_dir = self.runner.work_dir / 'checkpoints'\n", + " self._latest_checkpoint_dir = self._checkpoint_dir / 'latest'\n", " \n", - " @property\n", - "+ @abstractmethod\n", - " def iters(self) -> int:\n", - "- return len(self._dataloader)\n", - "+ pass\n", + " self._checkpoint_dir.mkdir(parents=True, exist_ok=True)\n", " \n", - " def _build_strategy(\n", - " self,\n", - "@@ -255,7 +257,7 @@ class BaseRunner(StateDictMixin, Generic[T]):\n", - " return memo\n", + "- if self._runner._auto_resume and self._latest_checkpoint_dir.exists():\n", + "+ if self.runner._auto_resume and self._latest_checkpoint_dir.exists():\n", + " load_from = self._latest_checkpoint_dir\n", + "- elif self._runner.load_from is not None:\n", + "- load_from = pathlib.Path(self._runner.load_from)\n", + "+ elif self.runner.load_from is not None:\n", + "+ load_from = pathlib.Path(self.runner.load_from)\n", + " assert load_from.exists()\n", + " else:\n", + " load_from = None\n", " \n", - " def _setup(self) -> Memo:\n", - "- return dict(dataloader=iter(self._dataloader))\n", - "+ return dict()\n", + " if load_from is not None:\n", + " if get_rank() == 0:\n", + "- self._runner.logger.info(\"Loading from %s\", load_from)\n", + "+ self.runner.logger.info(\"Loading from %s\", load_from)\n", + " state_dict = {\n", + " f.stem: torch.load(f, 'cpu')\n", + " for f in load_from.glob('*.pth')\n", + " }\n", + "- self._runner.load_state_dict(state_dict, **self._load_state_dict)\n", + "+ self.runner.load_state_dict(state_dict, **self._load_state_dict)\n", " \n", - " def _teardown(self, memo: Memo) -> None:\n", - " pass\n", - "diff --git a/todd/runners/epoch_based_trainer.py b/todd/runners/epoch_based_trainer.py\n", - "index 73ff6b7..8c2a517 100644\n", - "--- a/todd/runners/epoch_based_trainer.py\n", - "+++ b/todd/runners/epoch_based_trainer.py\n", - "@@ -9,6 +9,8 @@ from typing import TypeVar\n", + " @property\n", + " def checkpoint_dir(self) -> pathlib.Path:\n", + "@@ -71,13 +71,13 @@ class CheckpointCallback(IntervalMixin, BaseCallback):\n", " \n", - " from torch import nn\n", + " def _save(self, name: str) -> None:\n", + " # for FSDP, all ranks should call state dict\n", + "- state_dict = self._runner.state_dict(**self._state_dict)\n", + "+ state_dict = self.runner.state_dict(**self._state_dict)\n", " \n", - "+from ..utils import set_epoch\n", - "+\n", - " from ..base import RunnerRegistry\n", - " from .trainer import Trainer\n", - " from .types import Memo\n", - "@@ -23,17 +25,9 @@ class EpochBasedTrainer(Trainer):\n", + " if get_rank() != 0:\n", + " return\n", + " work_dir = self._work_dir(name)\n", + " work_dir.mkdir(parents=True, exist_ok=True)\n", + "- self._runner.logger.info(\"Saving state dict to %s\", work_dir)\n", + "+ self.runner.logger.info(\"Saving state dict to %s\", work_dir)\n", + " for k, v in state_dict.items():\n", + " torch.save(v, work_dir / f'{k}.pth')\n", + " \n", + "@@ -88,7 +88,7 @@ class CheckpointCallback(IntervalMixin, BaseCallback):\n", + " def after_run_iter(self, batch, memo: Memo) -> None:\n", + " super().after_run_iter(batch, memo)\n", + " if self._should_run_iter():\n", + "- self._save(f'iter_{self._runner.iter_}')\n", + "+ self._save(f'iter_{self.runner.iter_}')\n", + " \n", + " def after_run_epoch(self, epoch_memo: Memo, memo: Memo) -> None:\n", + " super().after_run_epoch(epoch_memo, memo)\n", + "diff --git a/todd/runners/callbacks/composed.py b/todd/runners/callbacks/composed.py\n", + "index a1c1638..83bec29 100644\n", + "--- a/todd/runners/callbacks/composed.py\n", + "+++ b/todd/runners/callbacks/composed.py\n", + "@@ -21,7 +21,7 @@ class ComposedCallback(BaseCallback):\n", " super().__init__(*args, **kwargs)\n", - " self._epochs = epochs\n", + " priorities = [c.pop('priority', dict()) for c in callbacks]\n", + " queue = [\n", + "- CallbackRegistry.build(c, runner=self._runner) for c in callbacks\n", + "+ CallbackRegistry.build(c, runner=self.runner) for c in callbacks\n", + " ]\n", + " self._priority_queue: PriorityQueue[KT, BaseCallback] = \\\n", + " PriorityQueue(priorities, queue)\n", + "diff --git a/todd/runners/callbacks/git.py b/todd/runners/callbacks/git.py\n", + "index ce642ac..ab90b0f 100644\n", + "--- a/todd/runners/callbacks/git.py\n", + "+++ b/todd/runners/callbacks/git.py\n", + "@@ -33,10 +33,10 @@ class GitCallback(BaseCallback):\n", + " diff = subprocess_run(args_)\n", + " except subprocess.CalledProcessError as e:\n", + " diff = str(e)\n", + "- self._runner.logger.error(e)\n", + "+ self.runner.logger.error(e)\n", + " else:\n", + " file = (\n", + "- self._runner.work_dir / f'git_diff_{get_timestamp()}.log'\n", + "+ self.runner.work_dir / f'git_diff_{get_timestamp()}.log'\n", + " )\n", + "- self._runner.logger.info('Saving git diff to %s', file)\n", + "+ self.runner.logger.info('Saving git diff to %s', file)\n", + " file.write_text(diff)\n", + "diff --git a/todd/runners/callbacks/interval.py b/todd/runners/callbacks/interval.py\n", + "index 8eb9625..489dd21 100644\n", + "--- a/todd/runners/callbacks/interval.py\n", + "+++ b/todd/runners/callbacks/interval.py\n", + "@@ -22,7 +22,7 @@ class IntervalMixin(BaseCallback):\n", + " return self._interval > 0 and step % self._interval == 0\n", " \n", - "- @property\n", - "- def epoch(self) -> int:\n", - "- return self._iter // super().iters\n", - "-\n", - "- @property\n", - "- def inner_iter(self) -> int:\n", - "- return self._iter % super().iters\n", - "-\n", - " @property\n", - " def iters(self) -> int:\n", - "- return super().iters * self._epochs\n", - "+ return self.iters_per_epoch * self._epochs\n", + " def _should_run_iter(self) -> bool:\n", + "- return not self._by_epoch and self.__should_run(self._runner.iter_)\n", + "+ return not self._by_epoch and self.__should_run(self.runner.iter_)\n", " \n", - " @property\n", - " def epochs(self) -> int:\n", - "@@ -43,22 +37,13 @@ class EpochBasedTrainer(Trainer):\n", - " return super()._run(epoch_memo)\n", + " def _should_run_epoch(self) -> bool:\n", + " return (\n", + "diff --git a/todd/runners/callbacks/log.py b/todd/runners/callbacks/log.py\n", + "index 8f884f5..263e6b4 100644\n", + "--- a/todd/runners/callbacks/log.py\n", + "+++ b/todd/runners/callbacks/log.py\n", + "@@ -43,24 +43,24 @@ class LogCallback(IntervalMixin, BaseCallback):\n", + " if get_rank() > 0:\n", + " return\n", + " if self._with_file_handler:\n", + "- file = self._runner.work_dir / f'{get_timestamp()}.log'\n", + "+ file = self.runner.work_dir / f'{get_timestamp()}.log'\n", + " handler = logging.FileHandler(file)\n", + " handler.setFormatter(Formatter())\n", + "- self._runner.logger.addHandler(handler)\n", + "+ self.runner.logger.addHandler(handler)\n", + " if self._collect_env is not None:\n", + " from ...base import ( # noqa: E501 pylint: disable=import-outside-toplevel\n", + " collect_env,\n", + " )\n", + " env = collect_env(**self._collect_env)\n", + "- self._runner.logger.info(env)\n", + "+ self.runner.logger.info(env)\n", " \n", - " def _setup_epoch(self, memo: Memo) -> Memo:\n", - "- samplers = [\n", - "- self._dataloader.sampler,\n", - "- self._dataloader.batch_sampler,\n", - "- getattr(self._dataloader.batch_sampler, 'sampler', None),\n", - "- ]\n", - "- for sampler in samplers:\n", - "- if (set_epoch := getattr(sampler, 'set_epoch', None)) is not None:\n", - "- set_epoch(self.epoch)\n", - " epoch_memo = super()._setup()\n", - "- dataloader = epoch_memo['dataloader']\n", - "- dataloader = itertools.islice(\n", - "- dataloader,\n", - "- super().iters - self.inner_iter,\n", - "- )\n", - "+ set_epoch(self._dataloader, self.epoch)\n", - " epoch_memo.update(\n", - "- dataloader=dataloader,\n", - "+ dataloader=(\n", - "+ itertools.islice(self._dataloader, self.inner_iter, None)\n", - "+ if self.inner_iter > 0 else self._dataloader\n", - "+ ),\n", - " epoch=defaultdict(list),\n", + " def before_run(self, memo: Memo) -> None:\n", + " super().before_run(memo)\n", + " self._eta: BaseETA | None = (\n", + " None if self._eta_config is None else ETARegistry.build(\n", + " self._eta_config,\n", + "- start=self._runner.iter_ - 1,\n", + "- end=self._runner.iters,\n", + "+ start=self.runner.iter_ - 1,\n", + "+ end=self.runner.iters,\n", + " )\n", " )\n", - " return epoch_memo\n", + " \n", + "@@ -73,10 +73,10 @@ class LogCallback(IntervalMixin, BaseCallback):\n", + " super().after_run_iter(batch, memo)\n", + " if 'log' not in memo:\n", + " return\n", + "- prefix = f\"Iter [{self._runner.iter_}/{self._runner.iters}] \"\n", + "+ prefix = f\"Iter [{self.runner.iter_}/{self.runner.iters}] \"\n", + " \n", + " if self._eta is not None:\n", + "- eta = self._eta(self._runner.iter_)\n", + "+ eta = self._eta(self.runner.iter_)\n", + " eta = round(eta)\n", + " prefix += f\"ETA {str(datetime.timedelta(seconds=eta))} \"\n", + " \n", + "@@ -90,7 +90,7 @@ class LogCallback(IntervalMixin, BaseCallback):\n", + " \n", + " log: dict[str, Any] = memo.pop('log')\n", + " message = ' '.join(f'{k}={v}' for k, v in log.items() if v is not None)\n", + "- self._runner.logger.info(prefix + message)\n", + "+ self.runner.logger.info(prefix + message)\n", + " \n", + " def before_run_epoch(self, epoch_memo: Memo, memo: Memo) -> None:\n", + " super().before_run_epoch(epoch_memo, memo)\n", + "diff --git a/todd/runners/callbacks/lr.py b/todd/runners/callbacks/lr.py\n", + "index 977558b..6a597c9 100644\n", + "--- a/todd/runners/callbacks/lr.py\n", + "+++ b/todd/runners/callbacks/lr.py\n", + "@@ -26,7 +26,7 @@ class LRScheduleCallback(IntervalMixin, BaseCallback):\n", + " **kwargs,\n", + " ) -> None:\n", + " super().__init__(*args, interval=interval, **kwargs)\n", + "- assert isinstance(self._runner, Trainer)\n", + "+ assert isinstance(self.runner, Trainer)\n", + " self._lr_scheduler_config = lr_scheduler\n", + " \n", + " def init(self, *args, **kwargs) -> None:\n", + "@@ -34,7 +34,7 @@ class LRScheduleCallback(IntervalMixin, BaseCallback):\n", + " self._build_lr_scheduler()\n", + " \n", + " def _build_lr_scheduler(self) -> None:\n", + "- runner = cast(Trainer, self._runner)\n", + "+ runner = cast(Trainer, self.runner)\n", + " self._lr_scheduler: torch.optim.lr_scheduler.LRScheduler = \\\n", + " LRSchedulerRegistry.build(\n", + " self._lr_scheduler_config,\n", + "@@ -75,11 +75,11 @@ class LRScaleCallback(BaseCallback):\n", + " \n", + " def __init__(self, *args, lr_scaler: Config, **kwargs) -> None:\n", + " super().__init__(*args, **kwargs)\n", + "- assert isinstance(self._runner, Trainer)\n", + "+ assert isinstance(self.runner, Trainer)\n", + " self._lr_scaler_config = lr_scaler\n", + " \n", + " def _scale_lr(self, config: Config) -> None:\n", + "- runner = cast(Trainer, self._runner)\n", + "+ runner = cast(Trainer, self.runner)\n", + " assert runner.dataloader.batch_size is not None\n", + " base_batch_size = config.base_batch_size\n", + " batch_size = get_world_size() * runner.dataloader.batch_size\n", + "diff --git a/todd/runners/callbacks/monitor.py b/todd/runners/callbacks/monitor.py\n", + "index 98aabf2..0895477 100644\n", + "--- a/todd/runners/callbacks/monitor.py\n", + "+++ b/todd/runners/callbacks/monitor.py\n", + "@@ -37,8 +37,8 @@ class MonitorCallback(BaseCallback):\n", + " ) -> None:\n", + " super().run_iter_context(exit_stack, batch, memo)\n", + " context = Context(\n", + "- self._runner.logger,\n", + "- iter_=self._runner.iter_,\n", + "+ self.runner.logger,\n", + "+ iter_=self.runner.iter_,\n", + " batch=batch,\n", + " memo=memo,\n", + " )\n", + "diff --git a/todd/runners/callbacks/tensorboard.py b/todd/runners/callbacks/tensorboard.py\n", + "index 65eb9bc..804f907 100644\n", + "--- a/todd/runners/callbacks/tensorboard.py\n", + "+++ b/todd/runners/callbacks/tensorboard.py\n", + "@@ -31,7 +31,7 @@ class TensorBoardCallback(IntervalMixin, BaseCallback):\n", + " super().init(*args, **kwargs)\n", + " if get_rank() > 0:\n", + " return\n", + "- log_dir = self._runner.work_dir / 'tensorboard'\n", + "+ log_dir = self.runner.work_dir / 'tensorboard'\n", + " self._summary_writer = SummaryWriter(\n", + " log_dir,\n", + " **self._summary_writer_config,\n", + "diff --git a/todd/runners/epoch_based_trainer.py b/todd/runners/epoch_based_trainer.py\n", + "index bc46346..8358d00 100644\n", + "--- a/todd/runners/epoch_based_trainer.py\n", + "+++ b/todd/runners/epoch_based_trainer.py\n", + "@@ -18,7 +18,7 @@ T = TypeVar('T', bound=nn.Module)\n", + " \n", + " \n", + " @RunnerRegistry.register_()\n", + "-class EpochBasedTrainer(Trainer):\n", + "+class EpochBasedTrainer(Trainer[T]):\n", + " \n", + " def __init__(self, *args, epochs: int, **kwargs) -> None:\n", + " super().__init__(*args, **kwargs)\n", "diff --git a/todd/runners/iter_based_trainer.py b/todd/runners/iter_based_trainer.py\n", - "index bc507f1..2f69a9d 100644\n", + "index 107c894..7a212fa 100644\n", "--- a/todd/runners/iter_based_trainer.py\n", "+++ b/todd/runners/iter_based_trainer.py\n", - "@@ -3,10 +3,12 @@ __all__ = [\n", + "@@ -16,7 +16,7 @@ T = TypeVar('T', bound=nn.Module)\n", + " \n", + " \n", + " @RunnerRegistry.register_()\n", + "-class IterBasedTrainer(Trainer):\n", + "+class IterBasedTrainer(Trainer[T]):\n", + " \n", + " def __init__(self, *args, iters: int, **kwargs) -> None:\n", + " super().__init__(*args, **kwargs)\n", + "diff --git a/todd/runners/strategies/base.py b/todd/runners/strategies/base.py\n", + "index a97d30f..d6c2a74 100644\n", + "--- a/todd/runners/strategies/base.py\n", + "+++ b/todd/runners/strategies/base.py\n", + "@@ -2,7 +2,7 @@ __all__ = [\n", + " 'BaseStrategy',\n", " ]\n", " \n", - " import itertools\n", - "-from typing import TypeVar\n", - "+from typing import Any, Generator, TypeVar\n", + "-from typing import Any, Generic, Mapping, TypeVar, cast\n", + "+from typing import Any, Mapping, TypeVar, cast\n", " \n", + " import torch\n", " from torch import nn\n", + "@@ -15,7 +15,7 @@ T = TypeVar('T', bound=nn.Module)\n", " \n", - "+from ..utils.torch import set_epoch\n", - "+\n", - " from ..base import RunnerRegistry\n", - " from .trainer import Trainer\n", - " from .types import Memo\n", - "@@ -26,10 +28,23 @@ class IterBasedTrainer(Trainer):\n", - " def iters(self) -> int:\n", - " return self._iters\n", " \n", - "+ def _iterate_dataloader(self) -> Generator[Any, None, None]:\n", - "+ if self.inner_iter > 0:\n", - "+ set_epoch(self._dataloader, self.epoch)\n", - "+ yield from itertools.islice(\n", - "+ self._dataloader,\n", - "+ self.inner_iter,\n", - "+ self.iters - self.iters_per_epoch * self.epoch,\n", - "+ )\n", - "+ while self._iter < self.iters:\n", - "+ assert self.inner_iter == 0\n", - "+ set_epoch(self._dataloader, self.epoch)\n", - "+ yield from itertools.islice(\n", - "+ self._dataloader,\n", - "+ self.iters - self._iter,\n", - "+ )\n", - "+\n", - " def _setup(self) -> Memo:\n", - " memo = super()._setup()\n", - "- dataloader = memo['dataloader']\n", - "- dataloader = itertools.cycle(dataloader)\n", - "- dataloader = itertools.islice(dataloader, self._iters - self._iter)\n", - "- memo['dataloader'] = dataloader\n", - "+ memo['dataloader'] = self._iterate_dataloader()\n", - " return memo\n", - "diff --git a/todd/runners/trainer.py b/todd/runners/trainer.py\n", - "index 40ecbe6..449c8b7 100644\n", - "--- a/todd/runners/trainer.py\n", - "+++ b/todd/runners/trainer.py\n", - "@@ -2,6 +2,7 @@ __all__ = [\n", - " 'Trainer',\n", - " ]\n", + " @StrategyRegistry.register_()\n", + "-class BaseStrategy(RunnerHolderMixin, StateDictMixin, Generic[T]):\n", + "+class BaseStrategy(RunnerHolderMixin[T], StateDictMixin):\n", + " \n", + " def __init__(\n", + " self,\n", + "@@ -45,7 +45,7 @@ class BaseStrategy(RunnerHolderMixin, StateDictMixin, Generic[T]):\n", " \n", - "+from abc import ABC\n", - " from typing import Any, Mapping, TypeVar\n", + " @property\n", + " def module(self) -> nn.Module:\n", + "- return self._runner.model\n", + "+ return self.runner.model\n", " \n", - " import torch\n", - "@@ -15,7 +16,19 @@ T = TypeVar('T', bound=nn.Module)\n", + " def model_state_dict(self, *args, **kwargs) -> dict[str, Any]:\n", + " return self.module.state_dict(*args, **kwargs)\n", + "@@ -62,7 +62,7 @@ class BaseStrategy(RunnerHolderMixin, StateDictMixin, Generic[T]):\n", + " **kwargs,\n", + " )\n", + " if get_rank() == 0:\n", + "- self._runner.logger.info(incompatible_keys)\n", + "+ self.runner.logger.info(incompatible_keys)\n", " \n", + " def load_model_from(\n", + " self,\n", + "@@ -77,7 +77,7 @@ class BaseStrategy(RunnerHolderMixin, StateDictMixin, Generic[T]):\n", + " model_state_dict = dict()\n", + " for f_ in f_list:\n", + " if get_rank() == 0:\n", + "- self._runner.logger.info(\"Loading model from %s\", f_)\n", + "+ self.runner.logger.info(\"Loading model from %s\", f_)\n", + " model_state_dict.update(torch.load(f_, 'cpu'))\n", + " self.load_model_state_dict(model_state_dict, *args, **kwargs)\n", " \n", - " @RunnerRegistry.register_()\n", - "-class Trainer(BaseRunner[T]):\n", - "+class Trainer(BaseRunner[T], ABC):\n", + "diff --git a/todd/runners/strategies/ddp.py b/todd/runners/strategies/ddp.py\n", + "index 3367c2a..e2e9fd6 100644\n", + "--- a/todd/runners/strategies/ddp.py\n", + "+++ b/todd/runners/strategies/ddp.py\n", + "@@ -23,4 +23,4 @@ class DDPStrategy(CUDAStrategy[T]):\n", + " \n", + " @property\n", + " def module(self) -> nn.Module:\n", + "- return self._runner.model.module\n", + "+ return self.runner.model.module\n", + "diff --git a/todd/runners/strategies/fsdp.py b/todd/runners/strategies/fsdp.py\n", + "index fa71105..42eb809 100644\n", + "--- a/todd/runners/strategies/fsdp.py\n", + "+++ b/todd/runners/strategies/fsdp.py\n", + "@@ -26,13 +26,13 @@ class FSDPStrategy(CUDAStrategy[T]):\n", + " \n", + " @property\n", + " def module(self) -> nn.Module:\n", + "- return self._runner.model.module\n", + "+ return self.runner.model.module\n", + " \n", + " def build_optimizer(self, config: Config) -> torch.optim.Optimizer:\n", + "- return OptimizerRegistry.build(config, model=self._runner.model)\n", + "+ return OptimizerRegistry.build(config, model=self.runner.model)\n", + " \n", + " def model_state_dict(self, *args, **kwargs) -> dict[str, Any]:\n", + "- return self._runner.model.state_dict(*args, **kwargs)\n", + "+ return self.runner.model.state_dict(*args, **kwargs)\n", + " \n", + " def load_model_state_dict(\n", + " self,\n", + "@@ -40,7 +40,7 @@ class FSDPStrategy(CUDAStrategy[T]):\n", + " *args,\n", + " **kwargs,\n", + " ) -> None:\n", + "- self._runner.model.load_state_dict(state_dict, *args, **kwargs)\n", + "+ self.runner.model.load_state_dict(state_dict, *args, **kwargs)\n", + " \n", + " def optim_state_dict(\n", + " self,\n", + "diff --git a/todd/runners/utils.py b/todd/runners/utils.py\n", + "index 0f25783..9c72605 100644\n", + "--- a/todd/runners/utils.py\n", + "+++ b/todd/runners/utils.py\n", + "@@ -2,42 +2,45 @@ __all__ = [\n", + " 'RunnerHolderMixin',\n", + " ]\n", + " \n", + "-import weakref\n", + "-from typing import cast\n", + "+from typing import TypeVar\n", + " \n", + "+from torch import nn\n", "+\n", - "+ @property\n", - "+ def iters_per_epoch(self) -> int:\n", - "+ return len(self._dataloader)\n", + "+from ..utils import HolderMixin\n", + " from .base import BaseRunner\n", + " from .epoch_based_trainer import EpochBasedTrainer\n", + " from .iter_based_trainer import IterBasedTrainer\n", + " from .trainer import Trainer\n", + " from .validator import Validator\n", + " \n", + "+T = TypeVar('T', bound=nn.Module)\n", "+\n", - "+ @property\n", - "+ def inner_iter(self) -> int:\n", - "+ return self._iter % self.iters_per_epoch\n", + " \n", + "-class RunnerHolderMixin:\n", + "+class RunnerHolderMixin(HolderMixin[BaseRunner[T]]):\n", + " \n", + "- def __init__(self, *args, runner: BaseRunner, **kwargs) -> None:\n", + "- super().__init__(*args, **kwargs)\n", + "- runner_proxy = (\n", + "- runner if isinstance(runner, weakref.ProxyTypes) else\n", + "- weakref.proxy(runner)\n", + "- )\n", + "- self._runner = cast(BaseRunner, runner_proxy)\n", + "+ def __init__(self, *args, runner: BaseRunner[T], **kwargs) -> None:\n", + "+ super().__init__(*args, instance=runner, **kwargs)\n", "+\n", "+ @property\n", - "+ def epoch(self) -> int:\n", - "+ return self._iter // self.iters_per_epoch\n", + "+ def runner(self) -> BaseRunner[T]:\n", + "+ return self._instance\n", " \n", " @property\n", - " def optimizer(self) -> torch.optim.Optimizer:\n", - "diff --git a/todd/runners/validator.py b/todd/runners/validator.py\n", - "index a70132a..c55f18b 100644\n", - "--- a/todd/runners/validator.py\n", - "+++ b/todd/runners/validator.py\n", - "@@ -16,9 +16,15 @@ T = TypeVar('T', bound=nn.Module)\n", - " @RunnerRegistry.register_()\n", - " class Validator(BaseRunner[T]):\n", + "- def trainer(self) -> Trainer:\n", + "- assert isinstance(self._runner, Trainer)\n", + "- return self._runner\n", + "+ def trainer(self) -> Trainer[T]:\n", + "+ assert isinstance(self._instance, Trainer)\n", + "+ return self._instance\n", " \n", - "+ @property\n", - "+ def iters(self) -> int:\n", - "+ return len(self._dataloader)\n", - "+\n", - " def _setup(self) -> Memo:\n", - " self._model.eval()\n", - "- return super()._setup()\n", - "+ memo = super()._setup()\n", - "+ memo['dataloader'] = self._dataloader\n", - "+ return memo\n", + " @property\n", + "- def validator(self) -> Validator:\n", + "- assert isinstance(self._runner, Validator)\n", + "- return self._runner\n", + "+ def validator(self) -> Validator[T]:\n", + "+ assert isinstance(self._instance, Validator)\n", + "+ return self._instance\n", " \n", - " @torch.no_grad()\n", - " def run(self) -> Memo:\n", - "diff --git a/todd/utils/torch.py b/todd/utils/torch.py\n", - "index 05ec37b..f574616 100644\n", - "--- a/todd/utils/torch.py\n", - "+++ b/todd/utils/torch.py\n", - "@@ -5,6 +5,7 @@ __all__ = [\n", - " 'all_gather',\n", - " 'all_gather_',\n", - " 'all_sync',\n", - "+ 'set_epoch',\n", - " 'Shape',\n", - " 'ModuleList',\n", - " 'ModuleDict',\n", - "@@ -20,6 +21,7 @@ from typing import TYPE_CHECKING\n", - " import torch\n", - " import torch.distributed as dist\n", - " from torch import nn\n", - "+from torch.utils.data import DataLoader\n", + " @property\n", + "- def iter_based_trainer(self) -> IterBasedTrainer:\n", + "- assert isinstance(self._runner, IterBasedTrainer)\n", + "- return self._runner\n", + "+ def iter_based_trainer(self) -> IterBasedTrainer[T]:\n", + "+ assert isinstance(self._instance, IterBasedTrainer)\n", + "+ return self._instance\n", " \n", + " @property\n", + "- def epoch_based_trainer(self) -> EpochBasedTrainer:\n", + "- assert isinstance(self._runner, EpochBasedTrainer)\n", + "- return self._runner\n", + "+ def epoch_based_trainer(self) -> EpochBasedTrainer[T]:\n", + "+ assert isinstance(self._instance, EpochBasedTrainer)\n", + "+ return self._instance\n", + "diff --git a/todd/utils/__init__.py b/todd/utils/__init__.py\n", + "index 85d1d65..199b7df 100644\n", + "--- a/todd/utils/__init__.py\n", + "+++ b/todd/utils/__init__.py\n", + "@@ -1,3 +1,4 @@\n", + "+from .constants import *\n", + " from .enums import *\n", + " from .generic_tensors import *\n", + " from .metas import *\n", + "diff --git a/todd/utils/mixins.py b/todd/utils/mixins.py\n", + "index 9574ea1..3e9a3cc 100644\n", + "--- a/todd/utils/mixins.py\n", + "+++ b/todd/utils/mixins.py\n", + "@@ -1,8 +1,12 @@\n", + " __all__ = [\n", + " 'StateDictMixin',\n", + "+ 'HolderMixin',\n", + " ]\n", " \n", - " def get_rank(*args, **kwargs) -> int:\n", - "@@ -101,6 +103,18 @@ def all_sync(x: torch.Tensor) -> bool:\n", - " return torch.allclose(x, x_prime)\n", + "-from typing import Any, Mapping\n", + "+import weakref\n", + "+from typing import Any, Generic, Mapping, TypeVar, cast\n", + "+\n", + "+T = TypeVar('T')\n", " \n", " \n", - "+def set_epoch(dataloader: DataLoader, epoch: int) -> None:\n", - "+ samplers = [\n", - "+ dataloader.sampler,\n", - "+ dataloader.batch_sampler,\n", - "+ getattr(dataloader.batch_sampler, 'sampler', None),\n", - "+ ]\n", - "+ for sampler in samplers:\n", - "+ set_epoch_ = getattr(sampler, 'set_epoch', None)\n", - "+ if set_epoch_ is not None:\n", - "+ set_epoch_(epoch)\n", - "+\n", - "+\n", - " class Shape:\n", + " class StateDictMixin:\n", + "@@ -19,4 +23,12 @@ class StateDictMixin:\n", + " pass\n", " \n", - " @classmethod\n" + " \n", + "-# TODO: define holder mixin\n", + "+class HolderMixin(Generic[T]):\n", + "+\n", + "+ def __init__(self, *args, instance: T, **kwargs) -> None:\n", + "+ super().__init__(*args, **kwargs)\n", + "+ instance_proxy = (\n", + "+ instance if isinstance(instance, weakref.ProxyTypes) else\n", + "+ weakref.proxy(instance)\n", + "+ )\n", + "+ self._instance = cast(T, instance_proxy)\n" ] } ], @@ -972,15 +1204,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:10,779 35174:140704541179520][base.py:57 todd.IterBasedTrainer.optimize_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:10,784 35174:140704541179520][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([5, 8]), 'y': tensor([10, 16])} weight=0.000 loss=13.000\n", - "[2024-02-23 18:27:10,786 35174:140704541179520][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([9, 1]), 'y': tensor([18, 2])} weight=0.032 loss=9.838\n", - "[2024-02-23 18:27:10,787 35174:140704541179520][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([6, 3]), 'y': tensor([12, 6])} weight=0.057 loss=8.741\n", - "[2024-02-23 18:27:10,789 35174:140704541179520][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([10, 2]), 'y': tensor([20, 4])} weight=0.080 loss=11.520\n", - "[2024-02-23 18:27:10,791 35174:140704541179520][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [5/8] batch={'x': tensor([7, 4]), 'y': tensor([14, 8])} weight=0.110 loss=10.395\n", - "[2024-02-23 18:27:10,794 35174:140704541179520][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([ 8, 10]), 'y': tensor([16, 20])} weight=0.138 loss=16.763\n", - "[2024-02-23 18:27:10,796 35174:140704541179520][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([3, 5]), 'y': tensor([ 6, 10])} weight=0.183 loss=7.270\n", - "[2024-02-23 18:27:10,798 35174:140704541179520][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([2, 1]), 'y': tensor([4, 2])} weight=0.203 loss=2.696\n" + "\u001b[2m[2024-03-14 12:18:09,033 62058:140704275689088][base.py:56 todd.IterBasedTrainer.optimize_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:09,037 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([4, 7]), 'y': tensor([ 8, 14])} weight=0.000 loss=11.000\n", + "[2024-03-14 12:18:09,040 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([6, 3]), 'y': tensor([12, 6])} weight=0.027 loss=8.876\n", + "[2024-03-14 12:18:09,042 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([1, 2]), 'y': tensor([2, 4])} weight=0.050 loss=2.925\n", + "[2024-03-14 12:18:09,043 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([ 5, 10]), 'y': tensor([10, 20])} weight=0.057 loss=14.569\n", + "[2024-03-14 12:18:09,045 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [5/8] batch={'x': tensor([8, 9]), 'y': tensor([16, 18])} weight=0.095 loss=16.193\n", + "[2024-03-14 12:18:09,047 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([7, 2]), 'y': tensor([14, 4])} weight=0.138 loss=8.381\n", + "[2024-03-14 12:18:09,049 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([ 8, 10]), 'y': tensor([16, 20])} weight=0.160 loss=16.560\n", + "[2024-03-14 12:18:09,050 62058:140704275689088][log.py:93 todd.IterBasedTrainer.optimize_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([9, 5]), 'y': tensor([18, 10])} weight=0.205 loss=12.565\n" ] } ], @@ -1021,15 +1253,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:10,810 35174:140704541179520][base.py:57 todd.IterBasedTrainer.lr_schedule_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:10,812 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([6, 3]), 'y': tensor([12, 6])} weight=0.000 loss=9.000 lr=['1.667e-03']\n", - "[2024-02-23 18:27:10,814 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([2, 9]), 'y': tensor([ 4, 18])} weight=0.008 loss=10.959 lr=['2.333e-03']\n", - "[2024-02-23 18:27:10,815 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([7, 5]), 'y': tensor([14, 10])} weight=0.020 loss=11.878 lr=['3.000e-03']\n", - "[2024-02-23 18:27:10,817 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([8, 4]), 'y': tensor([16, 8])} weight=0.038 loss=11.770 lr=['3.667e-03']\n", - "[2024-02-23 18:27:10,819 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [5/8] batch={'x': tensor([ 1, 10]), 'y': tensor([ 2, 20])} weight=0.060 loss=10.668 lr=['4.333e-03']\n", - "[2024-02-23 18:27:10,820 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([10, 6]), 'y': tensor([20, 12])} weight=0.084 loss=15.327 lr=['5.000e-03']\n", - "[2024-02-23 18:27:10,822 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([4, 7]), 'y': tensor([ 8, 14])} weight=0.124 loss=10.317 lr=['5.000e-03']\n", - "[2024-02-23 18:27:10,823 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([1, 9]), 'y': tensor([ 2, 18])} weight=0.152 loss=9.242 lr=['5.000e-03']\n" + "\u001b[2m[2024-03-14 12:18:09,063 62058:140704275689088][base.py:56 todd.IterBasedTrainer.lr_schedule_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:09,066 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([6, 2]), 'y': tensor([12, 4])} weight=0.000 loss=8.000 lr=['1.667e-03']\n", + "[2024-03-14 12:18:09,068 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([ 9, 10]), 'y': tensor([18, 20])} weight=0.007 loss=18.937 lr=['2.333e-03']\n", + "[2024-03-14 12:18:09,070 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([3, 8]), 'y': tensor([ 6, 16])} weight=0.029 loss=10.841 lr=['3.000e-03']\n", + "[2024-03-14 12:18:09,073 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([5, 7]), 'y': tensor([10, 14])} weight=0.045 loss=11.728 lr=['3.667e-03']\n", + "[2024-03-14 12:18:09,074 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [5/8] batch={'x': tensor([1, 4]), 'y': tensor([2, 8])} weight=0.067 loss=4.832 lr=['4.333e-03']\n", + "[2024-03-14 12:18:09,077 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([1, 9]), 'y': tensor([ 2, 18])} weight=0.078 loss=9.609 lr=['5.000e-03']\n", + "[2024-03-14 12:18:09,079 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.103 loss=4.742 lr=['5.000e-03']\n", + "[2024-03-14 12:18:09,080 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([7, 8]), 'y': tensor([14, 16])} weight=0.116 loss=14.132 lr=['5.000e-03']\n" ] } ], @@ -1069,22 +1301,22 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:10,836 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.lr_schedule_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:10,836 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [1/5]\n", - "[2024-02-23 18:27:10,839 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [1/10] batch={'x': tensor([2, 1]), 'y': tensor([4, 2])} weight=0.000 loss=3.000 lr=['1.667e-03']\n", - "[2024-02-23 18:27:10,841 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [2/10] batch={'x': tensor([4, 3]), 'y': tensor([8, 6])} weight=0.002 loss=6.991 lr=['1.667e-03']\n", - "[2024-02-23 18:27:10,842 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [2/5]\n", - "[2024-02-23 18:27:10,844 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [3/10] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.008 loss=4.979 lr=['2.778e-03']\n", - "[2024-02-23 18:27:10,846 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [4/10] batch={'x': tensor([1, 4]), 'y': tensor([2, 8])} weight=0.015 loss=4.962 lr=['2.778e-03']\n", - "[2024-02-23 18:27:10,847 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [3/5]\n", - "[2024-02-23 18:27:10,849 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [5/10] batch={'x': tensor([4, 1]), 'y': tensor([8, 2])} weight=0.022 loss=4.944 lr=['3.889e-03']\n", - "[2024-02-23 18:27:10,851 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [6/10] batch={'x': tensor([2, 3]), 'y': tensor([4, 6])} weight=0.032 loss=4.920 lr=['3.889e-03']\n", - "[2024-02-23 18:27:10,852 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [4/5]\n", - "[2024-02-23 18:27:10,854 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [7/10] batch={'x': tensor([2, 1]), 'y': tensor([4, 2])} weight=0.042 loss=2.938 lr=['5.000e-03']\n", - "[2024-02-23 18:27:10,856 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [8/10] batch={'x': tensor([4, 3]), 'y': tensor([8, 6])} weight=0.049 loss=6.828 lr=['5.000e-03']\n", - "[2024-02-23 18:27:10,857 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [5/5]\n", - "[2024-02-23 18:27:10,859 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [9/10] batch={'x': tensor([2, 1]), 'y': tensor([4, 2])} weight=0.067 loss=2.900 lr=['5.000e-03']\n", - "[2024-02-23 18:27:10,861 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [10/10] batch={'x': tensor([3, 4]), 'y': tensor([6, 8])} weight=0.074 loss=6.740 lr=['5.000e-03']\n" + "\u001b[2m[2024-03-14 12:18:09,092 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.lr_schedule_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:09,094 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [1/5]\n", + "[2024-03-14 12:18:09,096 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [1/10] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.000 loss=5.000 lr=['1.667e-03']\n", + "[2024-03-14 12:18:09,098 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [2/10] batch={'x': tensor([4, 1]), 'y': tensor([8, 2])} weight=0.004 loss=4.990 lr=['1.667e-03']\n", + "[2024-03-14 12:18:09,099 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [2/5]\n", + "[2024-03-14 12:18:09,101 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [3/10] batch={'x': tensor([4, 1]), 'y': tensor([8, 2])} weight=0.008 loss=4.979 lr=['2.778e-03']\n", + "[2024-03-14 12:18:09,102 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [4/10] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.015 loss=4.962 lr=['2.778e-03']\n", + "[2024-03-14 12:18:09,104 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [3/5]\n", + "[2024-03-14 12:18:09,106 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [5/10] batch={'x': tensor([1, 2]), 'y': tensor([2, 4])} weight=0.022 loss=2.967 lr=['3.889e-03']\n", + "[2024-03-14 12:18:09,108 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [6/10] batch={'x': tensor([3, 4]), 'y': tensor([6, 8])} weight=0.028 loss=6.902 lr=['3.889e-03']\n", + "[2024-03-14 12:18:09,109 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [4/5]\n", + "[2024-03-14 12:18:09,112 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [7/10] batch={'x': tensor([1, 2]), 'y': tensor([2, 4])} weight=0.042 loss=2.938 lr=['5.000e-03']\n", + "[2024-03-14 12:18:09,113 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [8/10] batch={'x': tensor([4, 3]), 'y': tensor([8, 6])} weight=0.049 loss=6.828 lr=['5.000e-03']\n", + "[2024-03-14 12:18:09,114 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.lr_schedule_callback before_run_epoch] INFO: Epoch [5/5]\n", + "[2024-03-14 12:18:09,117 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [9/10] batch={'x': tensor([2, 4]), 'y': tensor([4, 8])} weight=0.067 loss=5.800 lr=['5.000e-03']\n", + "[2024-03-14 12:18:09,119 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.lr_schedule_callback after_run_iter] INFO: Iter [10/10] batch={'x': tensor([1, 3]), 'y': tensor([2, 6])} weight=0.082 loss=3.837 lr=['5.000e-03']\n" ] } ], @@ -1132,16 +1364,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:27:10,874 35174:140704541179520][lr.py:93 todd.IterBasedTrainer.lr_scale_callback _scale_lr] INFO: base_batch_size=1 batch_size=2 lr_scaler=2.000\n", - "\u001b[2m[2024-02-23 18:27:10,875 35174:140704541179520][base.py:57 todd.IterBasedTrainer.lr_scale_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:10,877 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([3, 7]), 'y': tensor([ 6, 14])} weight=0.000 loss=10.000\n", - "[2024-02-23 18:27:10,879 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([4, 9]), 'y': tensor([ 8, 18])} weight=0.050 loss=12.675\n", - "[2024-02-23 18:27:10,881 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([1, 6]), 'y': tensor([ 2, 12])} weight=0.115 loss=6.598\n", - "[2024-02-23 18:27:10,882 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([ 2, 10]), 'y': tensor([ 4, 20])} weight=0.150 loss=11.100\n", - "[2024-02-23 18:27:10,884 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [5/8] batch={'x': tensor([8, 5]), 'y': tensor([16, 10])} weight=0.210 loss=11.635\n", - "[2024-02-23 18:27:10,885 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([7, 1]), 'y': tensor([14, 2])} weight=0.275 loss=6.900\n", - "[2024-02-23 18:27:10,887 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([8, 3]), 'y': tensor([16, 6])} weight=0.315 loss=9.267\n", - "[2024-02-23 18:27:10,888 35174:140704541179520][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([6, 9]), 'y': tensor([12, 18])} weight=0.370 loss=12.225\n" + "[2024-03-14 12:18:09,131 62058:140704275689088][lr.py:93 todd.IterBasedTrainer.lr_scale_callback _scale_lr] INFO: base_batch_size=1 batch_size=2 lr_scaler=2.000\n", + "\u001b[2m[2024-03-14 12:18:09,132 62058:140704275689088][base.py:56 todd.IterBasedTrainer.lr_scale_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:09,134 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([3, 4]), 'y': tensor([6, 8])} weight=0.000 loss=7.000\n", + "[2024-03-14 12:18:09,136 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([6, 7]), 'y': tensor([12, 14])} weight=0.035 loss=12.773\n", + "[2024-03-14 12:18:09,138 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([10, 2]), 'y': tensor([20, 4])} weight=0.100 loss=11.400\n", + "[2024-03-14 12:18:09,140 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([8, 9]), 'y': tensor([16, 18])} weight=0.160 loss=15.640\n", + "[2024-03-14 12:18:09,142 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [5/8] batch={'x': tensor([5, 1]), 'y': tensor([10, 2])} weight=0.245 loss=5.265\n", + "[2024-03-14 12:18:09,145 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([6, 8]), 'y': tensor([12, 16])} weight=0.275 loss=12.075\n", + "[2024-03-14 12:18:09,146 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([10, 3]), 'y': tensor([20, 6])} weight=0.345 loss=10.757\n", + "[2024-03-14 12:18:09,148 62058:140704275689088][log.py:93 todd.IterBasedTrainer.lr_scale_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([2, 4]), 'y': tensor([4, 8])} weight=0.410 loss=4.770\n" ] } ], @@ -1188,23 +1420,29 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:10,910 35174:140704541179520][base.py:57 todd.IterBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:10,912 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([4, 1]), 'y': tensor([8, 2])} weight=0.000 loss=5.000\n", - "[2024-02-23 18:27:10,913 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_1\n", - "[2024-02-23 18:27:10,917 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([6, 7]), 'y': tensor([12, 14])} weight=0.012 loss=12.919\n", - "[2024-02-23 18:27:10,918 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_2\n", - "[2024-02-23 18:27:10,922 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([ 9, 10]), 'y': tensor([18, 20])} weight=0.045 loss=18.572\n", - "[2024-02-23 18:27:10,923 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_3\n", - "[2024-02-23 18:27:10,928 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([8, 5]), 'y': tensor([16, 10])} weight=0.093 loss=12.399\n", - "[2024-02-23 18:27:10,930 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_4\n", - "[2024-02-23 18:27:10,936 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [5/8] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.125 loss=4.688\n", - "[2024-02-23 18:27:10,937 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_5\n", - "[2024-02-23 18:27:10,941 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([6, 3]), 'y': tensor([12, 6])} weight=0.138 loss=8.381\n", - "[2024-02-23 18:27:10,942 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_6\n", - "[2024-02-23 18:27:10,946 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([4, 7]), 'y': tensor([ 8, 14])} weight=0.160 loss=10.120\n", - "[2024-02-23 18:27:10,947 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_7\n", - "[2024-02-23 18:27:10,951 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([ 2, 10]), 'y': tensor([ 4, 20])} weight=0.188 loss=10.875\n", - "[2024-02-23 18:27:10,952 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_8\n" + "\u001b[2m[2024-03-14 12:18:09,174 62058:140704275689088][base.py:56 todd.IterBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:09,177 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/8] batch={'x': tensor([6, 5]), 'y': tensor([12, 10])} weight=0.000 loss=11.000\n", + "[2024-03-14 12:18:09,195 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_1\n", + "[2024-03-14 12:18:09,224 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/8] batch={'x': tensor([2, 8]), 'y': tensor([ 4, 16])} weight=0.027 loss=9.863\n", + "[2024-03-14 12:18:09,233 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_2\n", + "[2024-03-14 12:18:09,241 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [3/8] batch={'x': tensor([ 3, 10]), 'y': tensor([ 6, 20])} weight=0.052 loss=12.659\n", + "[2024-03-14 12:18:09,243 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_3\n", + "[2024-03-14 12:18:09,251 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [4/8] batch={'x': tensor([1, 4]), 'y': tensor([2, 8])} weight=0.085 loss=4.787\n", + "[2024-03-14 12:18:09,258 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_4\n", + "[2024-03-14 12:18:09,262 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [5/8] batch={'x': tensor([9, 7]), 'y': tensor([18, 14])} weight=0.097 loss=15.220\n", + "[2024-03-14 12:18:09,264 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_5\n", + "[2024-03-14 12:18:09,268 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([9, 7]), 'y': tensor([18, 14])} weight=0.137 loss=14.900\n", + "[2024-03-14 12:18:09,269 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_6\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-03-14 12:18:09,274 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([1, 3]), 'y': tensor([2, 6])} weight=0.177 loss=3.645\n", + "[2024-03-14 12:18:09,276 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_7\n", + "[2024-03-14 12:18:09,281 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([5, 8]), 'y': tensor([10, 16])} weight=0.187 loss=11.781\n", + "[2024-03-14 12:18:09,282 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_8\n" ] }, { @@ -1212,7 +1450,7 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp\u001b[0m\n", + "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460\u001b[0m\n", "└── \u001b[1;36mcheckpoint_callback\u001b[0m\n", " └── \u001b[1;36mcheckpoints\u001b[0m\n", " ├── \u001b[1;36miter_1\u001b[0m\n", @@ -1273,15 +1511,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:27:11,413 35174:140704541179520][checkpoint.py:54 todd.IterBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_5\n", - "[2024-02-23 18:27:11,420 35174:140704541179520][base.py:65 todd.IterBasedTrainer.checkpoint_callback load_model_state_dict] INFO: \n", - "\u001b[2m[2024-02-23 18:27:11,421 35174:140704541179520][base.py:57 todd.IterBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:11,424 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([3, 7]), 'y': tensor([ 6, 14])} weight=0.138 loss=9.312\n", - "[2024-02-23 18:27:11,425 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_6\n", - "[2024-02-23 18:27:11,429 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([2, 6]), 'y': tensor([ 4, 12])} weight=0.162 loss=7.350\n", - "[2024-02-23 18:27:11,430 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_7\n", - "[2024-02-23 18:27:11,434 35174:140704541179520][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([8, 4]), 'y': tensor([16, 8])} weight=0.182 loss=10.905\n", - "[2024-02-23 18:27:11,435 35174:140704541179520][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp5g2sg2lp/checkpoint_callback/checkpoints/iter_8\n" + "[2024-03-14 12:18:09,715 62058:140704275689088][checkpoint.py:54 todd.IterBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_5\n", + "[2024-03-14 12:18:09,718 62058:140704275689088][base.py:65 todd.IterBasedTrainer.checkpoint_callback load_model_state_dict] INFO: \n", + "\u001b[2m[2024-03-14 12:18:09,719 62058:140704275689088][base.py:56 todd.IterBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:09,722 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/8] batch={'x': tensor([10, 1]), 'y': tensor([20, 2])} weight=0.137 loss=10.244\n", + "[2024-03-14 12:18:09,723 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_6\n", + "[2024-03-14 12:18:09,727 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/8] batch={'x': tensor([4, 2]), 'y': tensor([8, 4])} weight=0.165 loss=5.505\n", + "[2024-03-14 12:18:09,728 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_7\n", + "[2024-03-14 12:18:09,731 62058:140704275689088][log.py:93 todd.IterBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/8] batch={'x': tensor([6, 9]), 'y': tensor([12, 18])} weight=0.180 loss=13.650\n", + "[2024-03-14 12:18:09,732 62058:140704275689088][checkpoint.py:80 todd.IterBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpkhq8r460/checkpoint_callback/checkpoints/iter_8\n" ] }, { @@ -1364,32 +1602,32 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:11,475 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:11,476 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [1/3]\n", - "[2024-02-23 18:27:11,478 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/15] batch={'x': tensor([10, 1]), 'y': tensor([20, 2])} weight=0.000 loss=11.000\n", - "[2024-02-23 18:27:11,480 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/15] batch={'x': tensor([5, 3]), 'y': tensor([10, 6])} weight=0.027 loss=7.890\n", - "[2024-02-23 18:27:11,481 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_2\n", - "[2024-02-23 18:27:11,484 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [3/15] batch={'x': tensor([7, 4]), 'y': tensor([14, 8])} weight=0.047 loss=10.739\n", - "[2024-02-23 18:27:11,486 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [4/15] batch={'x': tensor([9, 2]), 'y': tensor([18, 4])} weight=0.075 loss=10.588\n", - "[2024-02-23 18:27:11,487 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_4\n", - "[2024-02-23 18:27:11,491 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [5/15] batch={'x': tensor([6, 8]), 'y': tensor([12, 16])} weight=0.103 loss=13.283\n", - "[2024-02-23 18:27:11,492 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [2/3]\n", - "[2024-02-23 18:27:11,494 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/15] batch={'x': tensor([7, 2]), 'y': tensor([14, 4])} weight=0.138 loss=8.381\n", - "[2024-02-23 18:27:11,495 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_6\n", - "[2024-02-23 18:27:11,499 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/15] batch={'x': tensor([ 3, 10]), 'y': tensor([ 6, 20])} weight=0.160 loss=11.960\n", - "[2024-02-23 18:27:11,500 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/15] batch={'x': tensor([4, 6]), 'y': tensor([ 8, 12])} weight=0.192 loss=9.038\n", - "[2024-02-23 18:27:11,502 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_8\n", - "[2024-02-23 18:27:11,506 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [9/15] batch={'x': tensor([8, 5]), 'y': tensor([16, 10])} weight=0.218 loss=11.586\n", - "[2024-02-23 18:27:11,508 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [10/15] batch={'x': tensor([9, 1]), 'y': tensor([18, 2])} weight=0.250 loss=8.750\n", - "[2024-02-23 18:27:11,509 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_10\n", - "[2024-02-23 18:27:11,512 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", - "[2024-02-23 18:27:11,514 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([1, 8]), 'y': tensor([ 2, 16])} weight=0.275 loss=7.763\n", - "[2024-02-23 18:27:11,516 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([2, 6]), 'y': tensor([ 4, 12])} weight=0.298 loss=6.810\n", - "[2024-02-23 18:27:11,516 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_12\n", - "[2024-02-23 18:27:11,521 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([9, 3]), 'y': tensor([18, 6])} weight=0.318 loss=10.095\n", - "[2024-02-23 18:27:11,522 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([ 4, 10]), 'y': tensor([ 8, 20])} weight=0.348 loss=11.567\n", - "[2024-02-23 18:27:11,523 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_14\n", - "[2024-02-23 18:27:11,527 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([5, 7]), 'y': tensor([10, 14])} weight=0.383 loss=9.705\n" + "\u001b[2m[2024-03-14 12:18:09,769 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:09,769 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [1/3]\n", + "[2024-03-14 12:18:09,772 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/15] batch={'x': tensor([5, 3]), 'y': tensor([10, 6])} weight=0.000 loss=8.000\n", + "[2024-03-14 12:18:09,774 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/15] batch={'x': tensor([2, 6]), 'y': tensor([ 4, 12])} weight=0.020 loss=7.920\n", + "[2024-03-14 12:18:09,775 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_2\n", + "[2024-03-14 12:18:09,779 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [3/15] batch={'x': tensor([4, 7]), 'y': tensor([ 8, 14])} weight=0.040 loss=10.780\n", + "[2024-03-14 12:18:09,781 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [4/15] batch={'x': tensor([10, 1]), 'y': tensor([20, 2])} weight=0.067 loss=10.629\n", + "[2024-03-14 12:18:09,782 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_4\n", + "[2024-03-14 12:18:09,786 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [5/15] batch={'x': tensor([9, 8]), 'y': tensor([18, 16])} weight=0.095 loss=16.193\n", + "[2024-03-14 12:18:09,787 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [2/3]\n", + "[2024-03-14 12:18:09,789 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/15] batch={'x': tensor([5, 7]), 'y': tensor([10, 14])} weight=0.138 loss=11.175\n", + "[2024-03-14 12:18:09,790 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_6\n", + "[2024-03-14 12:18:09,794 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/15] batch={'x': tensor([1, 6]), 'y': tensor([ 2, 12])} weight=0.168 loss=6.414\n", + "[2024-03-14 12:18:09,796 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/15] batch={'x': tensor([9, 3]), 'y': tensor([18, 6])} weight=0.185 loss=10.890\n", + "[2024-03-14 12:18:09,797 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_8\n", + "[2024-03-14 12:18:09,823 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [9/15] batch={'x': tensor([8, 2]), 'y': tensor([16, 4])} weight=0.215 loss=8.925\n", + "[2024-03-14 12:18:09,825 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [10/15] batch={'x': tensor([10, 4]), 'y': tensor([20, 8])} weight=0.240 loss=12.320\n", + "[2024-03-14 12:18:09,833 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_10\n", + "[2024-03-14 12:18:09,842 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", + "[2024-03-14 12:18:09,844 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([5, 9]), 'y': tensor([10, 18])} weight=0.275 loss=12.075\n", + "[2024-03-14 12:18:09,846 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([7, 2]), 'y': tensor([14, 4])} weight=0.310 loss=7.605\n", + "[2024-03-14 12:18:09,857 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_12\n", + "[2024-03-14 12:18:09,861 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([6, 3]), 'y': tensor([12, 6])} weight=0.333 loss=7.504\n", + "[2024-03-14 12:18:09,863 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([ 1, 10]), 'y': tensor([ 2, 20])} weight=0.355 loss=9.048\n", + "[2024-03-14 12:18:09,864 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_14\n", + "[2024-03-14 12:18:09,868 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([8, 4]), 'y': tensor([16, 8])} weight=0.383 loss=9.705\n" ] }, { @@ -1397,7 +1635,7 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj\u001b[0m\n", + "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9\u001b[0m\n", "└── \u001b[1;36mcheckpoint_callback\u001b[0m\n", " └── \u001b[1;36mcheckpoints\u001b[0m\n", " ├── \u001b[1;36miter_10\u001b[0m\n", @@ -1452,21 +1690,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:27:11,956 35174:140704541179520][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_8\n", - "[2024-02-23 18:27:11,959 35174:140704541179520][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: \n", - "\u001b[2m[2024-02-23 18:27:11,960 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:11,961 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [2/3]\n", - "[2024-02-23 18:27:11,964 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [9/15] batch={'x': tensor([6, 4]), 'y': tensor([12, 8])} weight=0.218 loss=8.913\n", - "[2024-02-23 18:27:11,966 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [10/15] batch={'x': tensor([ 1, 10]), 'y': tensor([ 2, 20])} weight=0.243 loss=9.666\n", - "[2024-02-23 18:27:11,967 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_10\n", - "[2024-02-23 18:27:11,970 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", - "[2024-02-23 18:27:11,972 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([10, 7]), 'y': tensor([20, 14])} weight=0.270 loss=14.705\n", - "[2024-02-23 18:27:11,973 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([6, 8]), 'y': tensor([12, 16])} weight=0.312 loss=11.812\n", - "[2024-02-23 18:27:11,974 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_12\n", - "[2024-02-23 18:27:11,978 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([1, 9]), 'y': tensor([ 2, 18])} weight=0.347 loss=8.262\n", - "[2024-02-23 18:27:11,980 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([4, 2]), 'y': tensor([8, 4])} weight=0.373 loss=4.883\n", - "[2024-02-23 18:27:11,982 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_14\n", - "[2024-02-23 18:27:11,987 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([5, 3]), 'y': tensor([10, 6])} weight=0.387 loss=6.450\n" + "[2024-03-14 12:18:10,289 62058:140704275689088][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_8\n", + "[2024-03-14 12:18:10,293 62058:140704275689088][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: \n", + "\u001b[2m[2024-03-14 12:18:10,294 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:10,295 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [2/3]\n", + "[2024-03-14 12:18:10,298 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [9/15] batch={'x': tensor([3, 8]), 'y': tensor([ 6, 16])} weight=0.215 loss=9.818\n", + "[2024-03-14 12:18:10,300 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [10/15] batch={'x': tensor([5, 4]), 'y': tensor([10, 8])} weight=0.243 loss=7.909\n", + "[2024-03-14 12:18:10,300 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_10\n", + "[2024-03-14 12:18:10,303 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", + "[2024-03-14 12:18:10,305 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([7, 4]), 'y': tensor([14, 8])} weight=0.265 loss=9.542\n", + "[2024-03-14 12:18:10,307 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([10, 2]), 'y': tensor([20, 4])} weight=0.293 loss=10.245\n", + "[2024-03-14 12:18:10,307 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_12\n", + "[2024-03-14 12:18:10,311 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([6, 3]), 'y': tensor([12, 6])} weight=0.323 loss=7.549\n", + "[2024-03-14 12:18:10,312 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([5, 9]), 'y': tensor([10, 18])} weight=0.345 loss=11.585\n", + "[2024-03-14 12:18:10,313 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_14\n", + "[2024-03-14 12:18:10,316 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([1, 8]), 'y': tensor([ 2, 16])} weight=0.380 loss=7.290\n" ] }, { @@ -1482,17 +1720,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:27:12,442 35174:140704541179520][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_10\n", - "[2024-02-23 18:27:12,445 35174:140704541179520][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: \n", - "\u001b[2m[2024-02-23 18:27:12,446 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:12,447 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", - "[2024-02-23 18:27:12,450 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([6, 2]), 'y': tensor([12, 4])} weight=0.270 loss=6.920\n", - "[2024-02-23 18:27:12,452 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([5, 7]), 'y': tensor([10, 14])} weight=0.290 loss=10.260\n", - "[2024-02-23 18:27:12,453 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_12\n", - "[2024-02-23 18:27:12,463 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([10, 4]), 'y': tensor([20, 8])} weight=0.320 loss=11.760\n", - "[2024-02-23 18:27:12,465 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([3, 8]), 'y': tensor([ 6, 16])} weight=0.355 loss=9.047\n", - "[2024-02-23 18:27:12,466 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp46q6bkgj/checkpoint_callback/checkpoints/iter_14\n", - "[2024-02-23 18:27:12,470 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([1, 9]), 'y': tensor([ 2, 18])} weight=0.383 loss=8.087\n" + "[2024-03-14 12:18:10,730 62058:140704275689088][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_10\n", + "[2024-03-14 12:18:10,734 62058:140704275689088][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: \n", + "\u001b[2m[2024-03-14 12:18:10,734 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:10,736 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", + "[2024-03-14 12:18:10,740 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([7, 6]), 'y': tensor([14, 12])} weight=0.265 loss=11.278\n", + "[2024-03-14 12:18:10,742 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([1, 4]), 'y': tensor([2, 8])} weight=0.298 loss=4.256\n", + "[2024-03-14 12:18:10,743 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_12\n", + "[2024-03-14 12:18:10,748 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([8, 5]), 'y': tensor([16, 10])} weight=0.310 loss=10.985\n", + "[2024-03-14 12:18:10,750 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([ 2, 10]), 'y': tensor([ 4, 20])} weight=0.343 loss=9.945\n", + "[2024-03-14 12:18:10,751 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpn0pz4ax9/checkpoint_callback/checkpoints/iter_14\n", + "[2024-03-14 12:18:10,756 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([9, 3]), 'y': tensor([18, 6])} weight=0.373 loss=9.765\n" ] } ], @@ -1553,28 +1791,28 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:12,498 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:12,499 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [1/3]\n", - "[2024-02-23 18:27:12,501 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/15] batch={'x': tensor([3, 1]), 'y': tensor([6, 2])} weight=0.000 loss=4.000\n", - "[2024-02-23 18:27:12,503 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/15] batch={'x': tensor([8, 5]), 'y': tensor([16, 10])} weight=0.010 loss=12.935\n", - "[2024-02-23 18:27:12,505 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [3/15] batch={'x': tensor([2, 7]), 'y': tensor([ 4, 14])} weight=0.042 loss=8.809\n", - "[2024-02-23 18:27:12,507 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [4/15] batch={'x': tensor([ 6, 10]), 'y': tensor([12, 20])} weight=0.065 loss=15.480\n", - "[2024-02-23 18:27:12,509 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [5/15] batch={'x': tensor([9, 4]), 'y': tensor([18, 8])} weight=0.105 loss=12.318\n", - "[2024-02-23 18:27:12,510 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp8jm3x0vu/checkpoint_callback/checkpoints/epoch_1\n", - "[2024-02-23 18:27:12,513 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [2/3]\n", - "[2024-02-23 18:27:12,515 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/15] batch={'x': tensor([1, 5]), 'y': tensor([ 2, 10])} weight=0.137 loss=5.588\n", - "[2024-02-23 18:27:12,517 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/15] batch={'x': tensor([10, 2]), 'y': tensor([20, 4])} weight=0.152 loss=11.085\n", - "[2024-02-23 18:27:12,519 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/15] batch={'x': tensor([4, 8]), 'y': tensor([ 8, 16])} weight=0.182 loss=10.905\n", - "[2024-02-23 18:27:12,521 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [9/15] batch={'x': tensor([3, 7]), 'y': tensor([ 6, 14])} weight=0.212 loss=8.938\n", - "[2024-02-23 18:27:12,523 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [10/15] batch={'x': tensor([6, 9]), 'y': tensor([12, 18])} weight=0.237 loss=13.219\n", - "[2024-02-23 18:27:12,525 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp8jm3x0vu/checkpoint_callback/checkpoints/epoch_2\n", - "[2024-02-23 18:27:12,528 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", - "[2024-02-23 18:27:12,530 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([ 1, 10]), 'y': tensor([ 2, 20])} weight=0.275 loss=9.488\n", - "[2024-02-23 18:27:12,532 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([4, 6]), 'y': tensor([ 8, 12])} weight=0.302 loss=8.488\n", - "[2024-02-23 18:27:12,534 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([8, 9]), 'y': tensor([16, 18])} weight=0.327 loss=14.216\n", - "[2024-02-23 18:27:12,535 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([7, 5]), 'y': tensor([14, 10])} weight=0.370 loss=9.780\n", - "[2024-02-23 18:27:12,537 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.400 loss=4.000\n", - "[2024-02-23 18:27:12,538 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp8jm3x0vu/checkpoint_callback/checkpoints/epoch_3\n" + "\u001b[2m[2024-03-14 12:18:10,784 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:10,785 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [1/3]\n", + "[2024-03-14 12:18:10,789 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [1/15] batch={'x': tensor([4, 5]), 'y': tensor([ 8, 10])} weight=0.000 loss=9.000\n", + "[2024-03-14 12:18:10,793 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [2/15] batch={'x': tensor([3, 6]), 'y': tensor([ 6, 12])} weight=0.022 loss=8.899\n", + "[2024-03-14 12:18:10,797 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [3/15] batch={'x': tensor([ 9, 10]), 'y': tensor([18, 20])} weight=0.045 loss=18.572\n", + "[2024-03-14 12:18:10,799 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [4/15] batch={'x': tensor([8, 2]), 'y': tensor([16, 4])} weight=0.093 loss=9.538\n", + "[2024-03-14 12:18:10,802 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [5/15] batch={'x': tensor([1, 7]), 'y': tensor([ 2, 14])} weight=0.117 loss=7.530\n", + "[2024-03-14 12:18:10,803 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmplwjo14oz/checkpoint_callback/checkpoints/epoch_1\n", + "[2024-03-14 12:18:10,808 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [2/3]\n", + "[2024-03-14 12:18:10,812 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [6/15] batch={'x': tensor([4, 1]), 'y': tensor([8, 2])} weight=0.138 loss=4.656\n", + "[2024-03-14 12:18:10,816 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [7/15] batch={'x': tensor([7, 5]), 'y': tensor([14, 10])} weight=0.150 loss=11.100\n", + "[2024-03-14 12:18:10,820 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [8/15] batch={'x': tensor([3, 9]), 'y': tensor([ 6, 18])} weight=0.180 loss=10.920\n", + "[2024-03-14 12:18:10,822 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [9/15] batch={'x': tensor([6, 2]), 'y': tensor([12, 4])} weight=0.210 loss=7.160\n", + "[2024-03-14 12:18:10,824 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [10/15] batch={'x': tensor([10, 8]), 'y': tensor([20, 16])} weight=0.230 loss=15.930\n", + "[2024-03-14 12:18:10,826 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmplwjo14oz/checkpoint_callback/checkpoints/epoch_2\n", + "[2024-03-14 12:18:10,829 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", + "[2024-03-14 12:18:10,833 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([2, 9]), 'y': tensor([ 4, 18])} weight=0.275 loss=9.488\n", + "[2024-03-14 12:18:10,835 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([6, 5]), 'y': tensor([12, 10])} weight=0.303 loss=9.336\n", + "[2024-03-14 12:18:10,837 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([3, 8]), 'y': tensor([ 6, 16])} weight=0.330 loss=9.185\n", + "[2024-03-14 12:18:10,839 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([ 4, 10]), 'y': tensor([ 8, 20])} weight=0.358 loss=11.497\n", + "[2024-03-14 12:18:10,841 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([7, 1]), 'y': tensor([14, 2])} weight=0.393 loss=6.430\n", + "[2024-03-14 12:18:10,843 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmplwjo14oz/checkpoint_callback/checkpoints/epoch_3\n" ] }, { @@ -1582,7 +1820,7 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp8jm3x0vu\u001b[0m\n", + "\u001b[1;36m/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmplwjo14oz\u001b[0m\n", "└── \u001b[1;36mcheckpoint_callback\u001b[0m\n", " └── \u001b[1;36mcheckpoints\u001b[0m\n", " ├── \u001b[1;36mepoch_1\u001b[0m\n", @@ -1613,16 +1851,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "[2024-02-23 18:27:12,968 35174:140704541179520][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp8jm3x0vu/checkpoint_callback/checkpoints/epoch_2\n", - "[2024-02-23 18:27:12,974 35174:140704541179520][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: \n", - "\u001b[2m[2024-02-23 18:27:12,976 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:12,978 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", - "[2024-02-23 18:27:12,982 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([1, 8]), 'y': tensor([ 2, 16])} weight=0.275 loss=7.763\n", - "[2024-02-23 18:27:12,984 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([2, 7]), 'y': tensor([ 4, 14])} weight=0.297 loss=7.661\n", - "[2024-02-23 18:27:12,986 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([9, 3]), 'y': tensor([18, 6])} weight=0.320 loss=10.080\n", - "[2024-02-23 18:27:12,989 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([5, 6]), 'y': tensor([10, 12])} weight=0.350 loss=9.075\n", - "[2024-02-23 18:27:12,991 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([10, 4]), 'y': tensor([20, 8])} weight=0.377 loss=11.358\n", - "[2024-02-23 18:27:12,992 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp8jm3x0vu/checkpoint_callback/checkpoints/epoch_3\n" + "[2024-03-14 12:18:11,264 62058:140704275689088][checkpoint.py:54 todd.EpochBasedTrainer.checkpoint_callback init] INFO: Loading from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmplwjo14oz/checkpoint_callback/checkpoints/epoch_2\n", + "[2024-03-14 12:18:11,267 62058:140704275689088][base.py:65 todd.EpochBasedTrainer.checkpoint_callback load_model_state_dict] INFO: \n", + "\u001b[2m[2024-03-14 12:18:11,268 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.checkpoint_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:11,269 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.checkpoint_callback before_run_epoch] INFO: Epoch [3/3]\n", + "[2024-03-14 12:18:11,279 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [11/15] batch={'x': tensor([6, 2]), 'y': tensor([12, 4])} weight=0.275 loss=6.900\n", + "[2024-03-14 12:18:11,291 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [12/15] batch={'x': tensor([3, 5]), 'y': tensor([ 6, 10])} weight=0.295 loss=6.820\n", + "[2024-03-14 12:18:11,305 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [13/15] batch={'x': tensor([1, 4]), 'y': tensor([2, 8])} weight=0.315 loss=4.212\n", + "[2024-03-14 12:18:11,312 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [14/15] batch={'x': tensor([7, 9]), 'y': tensor([14, 18])} weight=0.328 loss=13.380\n", + "[2024-03-14 12:18:11,315 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.checkpoint_callback after_run_iter] INFO: Iter [15/15] batch={'x': tensor([10, 8]), 'y': tensor([20, 16])} weight=0.368 loss=14.693\n", + "[2024-03-14 12:18:11,317 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.checkpoint_callback _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmplwjo14oz/checkpoint_callback/checkpoints/epoch_3\n" ] } ], @@ -1700,15 +1938,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:13,032 35174:140704541179520][base.py:57 todd.FaultyValidator.monitor_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "\u001b[1;31m[2024-02-23 18:27:13,035 35174:140704541179520][monitor.py:26 todd.FaultyValidator.monitor_callback __exit__] ERROR: Unable to run iter_=1\n", + "\u001b[2m[2024-03-14 12:18:11,358 62058:140704275689088][base.py:56 todd.FaultyValidator.monitor_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "\u001b[1;31m[2024-03-14 12:18:11,359 62058:140704275689088][monitor.py:26 todd.FaultyValidator.monitor_callback __exit__] ERROR: Unable to run iter_=1\n", "batch={'x': tensor([1]), 'y': tensor([2])}\n", - "memo={'dataloader': }\n", + "memo={'dataloader': }\n", "Traceback (most recent call last):\n", - " File \"/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/base.py\", line 255, in _run\n", + " File \"/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/base.py\", line 246, in _run\n", " memo = self._run_iter(batch, memo)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_35174/1715875531.py\", line 5, in _run_iter\n", + " File \"/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_62058/1715875531.py\", line 5, in _run_iter\n", " raise CustomError(\"faulty runner\")\n", "CustomError: faulty runner\u001b[m\n" ] @@ -1718,15 +1956,15 @@ "output_type": "stream", "text": [ "\n", - "[2024-02-23 18:27:13,032 35174:140704541179520][base.py:57 todd.FaultyValidator.monitor_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\n", - "[2024-02-23 18:27:13,035 35174:140704541179520][monitor.py:26 todd.FaultyValidator.monitor_callback __exit__] ERROR: Unable to run iter_=1\n", + "[2024-03-14 12:18:11,358 62058:140704275689088][base.py:56 todd.FaultyValidator.monitor_callback __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\n", + "[2024-03-14 12:18:11,359 62058:140704275689088][monitor.py:26 todd.FaultyValidator.monitor_callback __exit__] ERROR: Unable to run iter_=1\n", "batch={'x': tensor([1]), 'y': tensor([2])}\n", - "memo={'dataloader': }\n", + "memo={'dataloader': }\n", "Traceback (most recent call last):\n", - " File \"/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/base.py\", line 255, in _run\n", + " File \"/Users/bytedance/.local/share/virtualenvs/todd-ARrcnwyq/lib/python3.11/site-packages/todd/runners/base.py\", line 246, in _run\n", " memo = self._run_iter(batch, memo)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_35174/1715875531.py\", line 5, in _run_iter\n", + " File \"/var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/ipykernel_62058/1715875531.py\", line 5, in _run_iter\n", " raise CustomError(\"faulty runner\")\n", "CustomError: faulty runner\n" ] @@ -1780,28 +2018,28 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:13,341 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.strategy_load_model_from __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:13,342 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [1/3]\n", - "[2024-02-23 18:27:13,346 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [1/15] batch={'x': tensor([1, 3]), 'y': tensor([2, 6])} weight=0.000 loss=4.000\n", - "[2024-02-23 18:27:13,347 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [2/15] batch={'x': tensor([6, 2]), 'y': tensor([12, 4])} weight=0.010 loss=7.960\n", - "[2024-02-23 18:27:13,349 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [3/15] batch={'x': tensor([10, 8]), 'y': tensor([20, 16])} weight=0.030 loss=17.730\n", - "[2024-02-23 18:27:13,351 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [4/15] batch={'x': tensor([5, 7]), 'y': tensor([10, 14])} weight=0.075 loss=11.550\n", - "[2024-02-23 18:27:13,367 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [5/15] batch={'x': tensor([4, 9]), 'y': tensor([ 8, 18])} weight=0.105 loss=12.318\n", - "[2024-02-23 18:27:13,385 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp91d3imrz/strategy_load_model_from/checkpoints/epoch_1\n", - "[2024-02-23 18:27:13,414 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [2/3]\n", - "[2024-02-23 18:27:13,419 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [6/15] batch={'x': tensor([5, 2]), 'y': tensor([10, 4])} weight=0.137 loss=6.519\n", - "[2024-02-23 18:27:13,420 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [7/15] batch={'x': tensor([6, 8]), 'y': tensor([12, 16])} weight=0.155 loss=12.915\n", - "[2024-02-23 18:27:13,422 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [8/15] batch={'x': tensor([9, 3]), 'y': tensor([18, 6])} weight=0.190 loss=10.860\n", - "[2024-02-23 18:27:13,424 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [9/15] batch={'x': tensor([7, 1]), 'y': tensor([14, 2])} weight=0.220 loss=7.120\n", - "[2024-02-23 18:27:13,426 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [10/15] batch={'x': tensor([10, 4]), 'y': tensor([20, 8])} weight=0.240 loss=12.320\n", - "[2024-02-23 18:27:13,427 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp91d3imrz/strategy_load_model_from/checkpoints/epoch_2\n", - "[2024-02-23 18:27:13,430 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [3/3]\n", - "[2024-02-23 18:27:13,432 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [11/15] batch={'x': tensor([ 3, 10]), 'y': tensor([ 6, 20])} weight=0.275 loss=11.212\n", - "[2024-02-23 18:27:13,434 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [12/15] batch={'x': tensor([7, 4]), 'y': tensor([14, 8])} weight=0.307 loss=9.309\n", - "[2024-02-23 18:27:13,436 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [13/15] batch={'x': tensor([2, 5]), 'y': tensor([ 4, 10])} weight=0.335 loss=5.827\n", - "[2024-02-23 18:27:13,438 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [14/15] batch={'x': tensor([9, 6]), 'y': tensor([18, 12])} weight=0.352 loss=12.356\n", - "[2024-02-23 18:27:13,440 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [15/15] batch={'x': tensor([8, 1]), 'y': tensor([16, 2])} weight=0.390 loss=7.245\n", - "[2024-02-23 18:27:13,441 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp91d3imrz/strategy_load_model_from/checkpoints/epoch_3\n" + "\u001b[2m[2024-03-14 12:18:11,674 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.strategy_load_model_from __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:11,675 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [1/3]\n", + "[2024-03-14 12:18:11,679 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [1/15] batch={'x': tensor([2, 5]), 'y': tensor([ 4, 10])} weight=0.000 loss=7.000\n", + "[2024-03-14 12:18:11,682 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [2/15] batch={'x': tensor([ 7, 10]), 'y': tensor([14, 20])} weight=0.018 loss=16.851\n", + "[2024-03-14 12:18:11,684 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [3/15] batch={'x': tensor([1, 3]), 'y': tensor([2, 6])} weight=0.060 loss=3.880\n", + "[2024-03-14 12:18:11,687 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [4/15] batch={'x': tensor([8, 4]), 'y': tensor([16, 8])} weight=0.070 loss=11.580\n", + "[2024-03-14 12:18:11,689 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [5/15] batch={'x': tensor([6, 9]), 'y': tensor([12, 18])} weight=0.100 loss=14.250\n", + "[2024-03-14 12:18:11,691 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpj2udcidm/strategy_load_model_from/checkpoints/epoch_1\n", + "[2024-03-14 12:18:11,694 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [2/3]\n", + "[2024-03-14 12:18:11,697 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [6/15] batch={'x': tensor([9, 4]), 'y': tensor([18, 8])} weight=0.138 loss=12.106\n", + "[2024-03-14 12:18:11,700 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [7/15] batch={'x': tensor([ 5, 10]), 'y': tensor([10, 20])} weight=0.170 loss=13.725\n", + "[2024-03-14 12:18:11,702 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [8/15] batch={'x': tensor([3, 2]), 'y': tensor([6, 4])} weight=0.207 loss=4.481\n", + "[2024-03-14 12:18:11,704 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [9/15] batch={'x': tensor([8, 6]), 'y': tensor([16, 12])} weight=0.220 loss=12.460\n", + "[2024-03-14 12:18:11,706 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [10/15] batch={'x': tensor([7, 1]), 'y': tensor([14, 2])} weight=0.255 loss=6.980\n", + "[2024-03-14 12:18:11,708 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpj2udcidm/strategy_load_model_from/checkpoints/epoch_2\n", + "[2024-03-14 12:18:11,711 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [3/3]\n", + "[2024-03-14 12:18:11,713 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [11/15] batch={'x': tensor([6, 1]), 'y': tensor([12, 2])} weight=0.275 loss=6.038\n", + "[2024-03-14 12:18:11,714 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [12/15] batch={'x': tensor([7, 9]), 'y': tensor([14, 18])} weight=0.293 loss=13.660\n", + "[2024-03-14 12:18:11,716 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [13/15] batch={'x': tensor([8, 5]), 'y': tensor([16, 10])} weight=0.333 loss=10.839\n", + "[2024-03-14 12:18:11,718 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [14/15] batch={'x': tensor([ 4, 10]), 'y': tensor([ 8, 20])} weight=0.365 loss=11.445\n", + "[2024-03-14 12:18:11,720 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [15/15] batch={'x': tensor([2, 3]), 'y': tensor([4, 6])} weight=0.400 loss=4.000\n", + "[2024-03-14 12:18:11,721 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpj2udcidm/strategy_load_model_from/checkpoints/epoch_3\n" ] }, { @@ -1817,30 +2055,30 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m[2024-02-23 18:27:13,865 35174:140704541179520][base.py:57 todd.EpochBasedTrainer.strategy_load_model_from __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", - "[2024-02-23 18:27:13,866 35174:140704541179520][base.py:80 todd.EpochBasedTrainer.strategy_load_model_from load_model_from] INFO: Loading model from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp91d3imrz/strategy_load_model_from/checkpoints/epoch_2/model.pth\n", - "[2024-02-23 18:27:13,868 35174:140704541179520][base.py:65 todd.EpochBasedTrainer.strategy_load_model_from load_model_state_dict] INFO: \n", - "[2024-02-23 18:27:13,869 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [1/3]\n", - "[2024-02-23 18:27:13,873 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [1/15] batch={'x': tensor([7, 2]), 'y': tensor([14, 4])} weight=0.275 loss=7.762\n", - "[2024-02-23 18:27:13,875 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [2/15] batch={'x': tensor([ 6, 10]), 'y': tensor([12, 20])} weight=0.297 loss=13.620\n", - "[2024-02-23 18:27:13,877 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [3/15] batch={'x': tensor([1, 4]), 'y': tensor([2, 8])} weight=0.337 loss=4.156\n", - "[2024-02-23 18:27:13,881 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [4/15] batch={'x': tensor([8, 9]), 'y': tensor([16, 18])} weight=0.350 loss=14.025\n", - "[2024-02-23 18:27:13,883 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [5/15] batch={'x': tensor([5, 3]), 'y': tensor([10, 6])} weight=0.392 loss=6.430\n", - "[2024-02-23 18:27:13,884 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp91d3imrz/strategy_load_model_from/checkpoints/epoch_1\n", - "[2024-02-23 18:27:13,888 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [2/3]\n", - "[2024-02-23 18:27:13,890 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [6/15] batch={'x': tensor([10, 1]), 'y': tensor([20, 2])} weight=0.412 loss=8.731\n", - "[2024-02-23 18:27:13,891 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [7/15] batch={'x': tensor([5, 4]), 'y': tensor([10, 8])} weight=0.440 loss=7.020\n", - "[2024-02-23 18:27:13,894 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [8/15] batch={'x': tensor([3, 9]), 'y': tensor([ 6, 18])} weight=0.462 loss=9.225\n", - "[2024-02-23 18:27:13,896 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [9/15] batch={'x': tensor([6, 7]), 'y': tensor([12, 14])} weight=0.492 loss=9.799\n", - "[2024-02-23 18:27:13,898 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [10/15] batch={'x': tensor([2, 8]), 'y': tensor([ 4, 16])} weight=0.525 loss=7.375\n", - "[2024-02-23 18:27:13,899 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp91d3imrz/strategy_load_model_from/checkpoints/epoch_2\n", - "[2024-02-23 18:27:13,902 35174:140704541179520][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [3/3]\n", - "[2024-02-23 18:27:13,905 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [11/15] batch={'x': tensor([8, 5]), 'y': tensor([16, 10])} weight=0.550 loss=9.425\n", - "[2024-02-23 18:27:13,907 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [12/15] batch={'x': tensor([9, 3]), 'y': tensor([18, 6])} weight=0.582 loss=8.505\n", - "[2024-02-23 18:27:13,910 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [13/15] batch={'x': tensor([10, 4]), 'y': tensor([20, 8])} weight=0.612 loss=9.712\n", - "[2024-02-23 18:27:13,912 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [14/15] batch={'x': tensor([1, 6]), 'y': tensor([ 2, 12])} weight=0.647 loss=4.734\n", - "[2024-02-23 18:27:13,914 35174:140704541179520][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [15/15] batch={'x': tensor([2, 7]), 'y': tensor([ 4, 14])} weight=0.665 loss=6.008\n", - "[2024-02-23 18:27:13,915 35174:140704541179520][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmp91d3imrz/strategy_load_model_from/checkpoints/epoch_3\n" + "\u001b[2m[2024-03-14 12:18:12,135 62058:140704275689088][base.py:56 todd.EpochBasedTrainer.strategy_load_model_from __init__] DEBUG: Rank 0 initialized by bytedance@C02G870SMD6R\u001b[m\n", + "[2024-03-14 12:18:12,135 62058:140704275689088][base.py:80 todd.EpochBasedTrainer.strategy_load_model_from load_model_from] INFO: Loading model from /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpj2udcidm/strategy_load_model_from/checkpoints/epoch_2/model.pth\n", + "[2024-03-14 12:18:12,138 62058:140704275689088][base.py:65 todd.EpochBasedTrainer.strategy_load_model_from load_model_state_dict] INFO: \n", + "[2024-03-14 12:18:12,139 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [1/3]\n", + "[2024-03-14 12:18:12,142 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [1/15] batch={'x': tensor([10, 1]), 'y': tensor([20, 2])} weight=0.275 loss=9.488\n", + "[2024-03-14 12:18:12,143 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [2/15] batch={'x': tensor([5, 3]), 'y': tensor([10, 6])} weight=0.303 loss=6.790\n", + "[2024-03-14 12:18:12,145 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [3/15] batch={'x': tensor([9, 6]), 'y': tensor([18, 12])} weight=0.323 loss=12.581\n", + "[2024-03-14 12:18:12,146 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [4/15] batch={'x': tensor([2, 4]), 'y': tensor([4, 8])} weight=0.360 loss=4.920\n", + "[2024-03-14 12:18:12,148 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [5/15] batch={'x': tensor([8, 7]), 'y': tensor([16, 14])} weight=0.375 loss=12.188\n", + "[2024-03-14 12:18:12,149 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpj2udcidm/strategy_load_model_from/checkpoints/epoch_1\n", + "[2024-03-14 12:18:12,153 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [2/3]\n", + "[2024-03-14 12:18:12,154 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [6/15] batch={'x': tensor([8, 1]), 'y': tensor([16, 2])} weight=0.412 loss=7.144\n", + "[2024-03-14 12:18:12,156 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [7/15] batch={'x': tensor([9, 3]), 'y': tensor([18, 6])} weight=0.435 loss=9.390\n", + "[2024-03-14 12:18:12,157 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [8/15] batch={'x': tensor([10, 4]), 'y': tensor([20, 8])} weight=0.465 loss=10.745\n", + "[2024-03-14 12:18:12,159 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [9/15] batch={'x': tensor([2, 6]), 'y': tensor([ 4, 12])} weight=0.500 loss=6.000\n", + "[2024-03-14 12:18:12,161 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [10/15] batch={'x': tensor([5, 7]), 'y': tensor([10, 14])} weight=0.520 loss=8.880\n", + "[2024-03-14 12:18:12,162 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpj2udcidm/strategy_load_model_from/checkpoints/epoch_2\n", + "[2024-03-14 12:18:12,164 62058:140704275689088][log.py:99 todd.EpochBasedTrainer.strategy_load_model_from before_run_epoch] INFO: Epoch [3/3]\n", + "[2024-03-14 12:18:12,166 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [11/15] batch={'x': tensor([ 6, 10]), 'y': tensor([12, 20])} weight=0.550 loss=11.600\n", + "[2024-03-14 12:18:12,167 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [12/15] batch={'x': tensor([7, 1]), 'y': tensor([14, 2])} weight=0.590 loss=5.640\n", + "[2024-03-14 12:18:12,169 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [13/15] batch={'x': tensor([9, 3]), 'y': tensor([18, 6])} weight=0.610 loss=8.340\n", + "[2024-03-14 12:18:12,170 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [14/15] batch={'x': tensor([4, 2]), 'y': tensor([8, 4])} weight=0.640 loss=4.080\n", + "[2024-03-14 12:18:12,172 62058:140704275689088][log.py:93 todd.EpochBasedTrainer.strategy_load_model_from after_run_iter] INFO: Iter [15/15] batch={'x': tensor([5, 8]), 'y': tensor([10, 16])} weight=0.655 loss=8.743\n", + "[2024-03-14 12:18:12,173 62058:140704275689088][checkpoint.py:80 todd.EpochBasedTrainer.strategy_load_model_from _save] INFO: Saving state dict to /var/folders/v_/1kkfntxs5z74_rwvy1f3_mp80000gn/T/tmpj2udcidm/strategy_load_model_from/checkpoints/epoch_3\n" ] } ],