Skip to content

Commit

Permalink
Merge pull request #178 from macrocosm-os/dev
Browse files Browse the repository at this point in the history
Release 4.5.0
  • Loading branch information
cryptal-mc authored Oct 4, 2024
2 parents 721efa7 + 362f603 commit fde1e89
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 38 deletions.
81 changes: 75 additions & 6 deletions constants/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
# ---------------------------------

# Release
__version__ = "4.4.0"
__version__ = "4.5.0"

# Validator schema version
__validator_version__ = "3.1.0"
__validator_version__ = "3.2.0"
version_split = __validator_version__.split(".")
__spec_version__ = (
(1000 * int(version_split[0]))
Expand All @@ -64,6 +64,9 @@
# Starting block for 3B, 7B* (epsilon experiment) and sample unpacking
BLOCK_3B_7BSTAR_UNPACK = 3_601_190

# Starting block for activating sample unpacking
BLOCK_SAMPLE_PACK = 4_001_017

# Minimum percent of weight on a vali for a miner to be considered a top miner.
# Since there can be multiple competitions at different reward percentages we can't just check biggest.
WEIGHT_SYNC_MINER_MIN_PERCENT = 0.05
Expand Down Expand Up @@ -95,7 +98,7 @@
DATASET_BY_COMPETITION_ID: Dict[CompetitionId, str] = {
CompetitionId.M772_MODEL: pt.dataset.SubsetFalconLoader,
CompetitionId.B3_MODEL: pt.dataset.SubsetFalconLoader,
CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
}

Expand Down Expand Up @@ -167,6 +170,20 @@
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
max_model_parameter_size=3_400_000_000,
min_model_parameter_size=3_200_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B7_MODEL: ModelConstraints(
max_model_parameter_size=6_900_000_000,
min_model_parameter_size=6_700_000_000,
Expand All @@ -181,6 +198,34 @@
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
max_model_parameter_size=13_900_000_000,
min_model_parameter_size=13_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 100800),
max_bytes=29 * 1024 * 1024 * 1024,
),
}

# Defined model constraints by competition id with decaying epsilon
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2: Dict[CompetitionId, ModelConstraints] = {
CompetitionId.M772_MODEL: ModelConstraints(
max_model_parameter_size=772_000_000,
min_model_parameter_size=572_000_000,
sequence_length=1024,
allowed_architectures=ALLOWED_MODEL_TYPES_1,
tokenizer="distilgpt2",
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.0001, 50400),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
max_model_parameter_size=3_400_000_000,
min_model_parameter_size=3_200_000_000,
Expand All @@ -192,7 +237,7 @@
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
epsilon_func=LinearDecay(0.005, 0.0001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
Expand All @@ -206,7 +251,7 @@
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 100800),
epsilon_func=LinearDecay(0.005, 0.0001, 100800),
max_bytes=29 * 1024 * 1024 * 1024,
),
}
Expand Down Expand Up @@ -304,6 +349,27 @@
),
],
),
(
BLOCK_SAMPLE_PACK,
[
Competition(
CompetitionId.M772_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.M772_MODEL],
0.14,
),
Competition(
CompetitionId.B3_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.B3_MODEL],
0.29,
),
Competition(
CompetitionId.B14_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.B14_MODEL],
0.57,
),
],
),


]

Expand All @@ -329,6 +395,9 @@
# 0.01 gives ~96% to best model with only ~3 receiving any weights.
temperature = 0.01

# block to activate sample packing
sample_pack_block = BLOCK_SAMPLE_PACK

# validators number of pages to eval over miners on each step.
pages_per_eval_unpack = 5 # With sample unpacking
pages_per_eval_pack = 18
Expand All @@ -339,7 +408,7 @@
sample_min = 5
# Max number of uids that can be either pending eval or currently being evaluated.
# We allow the sample_min per competition + 10 additional models to be held at any one time.
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2) + 10
# time required between updates to the chain.
chain_update_cadence = dt.timedelta(minutes=20)
# Number of blocks required between retrying evaluation of a model.
Expand Down
41 changes: 20 additions & 21 deletions neurons/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ class PerUIDEvalState:
# The losses per batch.
losses: typing.List[float] = dataclasses.field(default=None)

    def avg_loss(self) -> float:
        """Safely computes the average loss from a list of losses.

        Returns:
            float: The arithmetic mean of ``self.losses``, or ``math.inf``
                when ``self.losses`` is empty or ``None`` (the dataclass
                field defaults to ``None``), avoiding a ZeroDivisionError.
        """
        return sum(self.losses) / len(self.losses) if self.losses else math.inf


class Validator:
MODEL_TRACKER_FILENAME = "model_tracker.pickle"
Expand Down Expand Up @@ -795,11 +799,14 @@ async def run_step(self):
tokenizer = pt.model.load_tokenizer(
competition.constraints, cache_dir=self.config.model_dir
)

pack_samples = False
pages_per_eval = constants.pages_per_eval_unpack



if cur_block >= constants.sample_pack_block:
pack_samples = True
pages_per_eval = constants.pages_per_eval_pack
else:
pack_samples = False
pages_per_eval = constants.pages_per_eval_unpack

# If the option is set in the config, override
pages_per_eval = (
self.config.pages_per_eval
Expand Down Expand Up @@ -905,12 +912,15 @@ async def run_step(self):
)

# Compute wins and win rates per uid.
losses_per_uid = {uid: state.losses for uid, state in uid_to_state.items()}
# Take the average loss across all batches for comparison of best model.
# Keep it as a list of 1 for later calculations.
losses_per_uid = {
uid: [state.avg_loss()] for uid, state in uid_to_state.items()
}
uid_to_block = {uid: state.block for uid, state in uid_to_state.items()}
wins, win_rate = pt.validation.compute_wins(
uids,
losses_per_uid,
batches,
uid_to_block,
competition.constraints.epsilon_func,
cur_block,
Expand Down Expand Up @@ -1038,29 +1048,18 @@ def _record_eval_results(
curr_block (int): The current block.
uid_to_state (typing.Dict[int, PerUIDEvalState]): A dictionary mapping uids to their eval state.
"""
top_model_loss = self._compute_avg_loss(uid_to_state[top_uid].losses)
top_model_loss = uid_to_state[top_uid].avg_loss()
for _, state in uid_to_state.items():
self.model_tracker.on_model_evaluated(
state.hotkey,
EvalResult(
block=curr_block,
score=self._compute_avg_loss(state.losses),
score=state.avg_loss(),
winning_model_block=uid_to_state[top_uid].block,
winning_model_score=top_model_loss,
),
)

def _compute_avg_loss(self, losses: typing.List[float]) -> float:
"""Safely computes the average loss from a list of losses.
Args:
losses (typing.List[float]): A list of losses.
Returns:
float: The average loss.
"""
return sum(losses) / len(losses) if losses else math.inf

def log_step(
self,
competition_id: CompetitionId,
Expand Down Expand Up @@ -1098,7 +1097,7 @@ def log_step(
"block": uid_to_state[uid].block,
"hf": uid_to_state[uid].repo_name,
"competition_id": int(competition_id),
"average_loss": self._compute_avg_loss(uid_to_state[uid].losses),
"average_loss": uid_to_state[uid].avg_loss(),
"epsilon_adv": competition_epsilon_func.compute_epsilon(
current_block, uid_to_state[uid].block
),
Expand Down
22 changes: 11 additions & 11 deletions pretrain/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def iswin(
def compute_wins(
uids: typing.List[int],
losses_per_uid: typing.Dict[int, typing.List[float]],
batches: typing.List[torch.FloatTensor],
uid_to_block: typing.Dict[int, int],
epsilon_func: EpsilonFunc,
current_block: int,
Expand All @@ -78,7 +77,6 @@ def compute_wins(
Parameters:
uids (list): A list of uids to compare.
losses_per_uid (dict): A dictionary of losses for each uid by batch.
batches (List): A list of data batches.
uid_to_block (dict): A dictionary of blocks for each uid.
epsilon_func (EpsilonFunc): Function that determines how much advantage to give to the earlier block.
current_block: The current block.
Expand All @@ -88,20 +86,22 @@ def compute_wins(
"""
wins = {uid: 0 for uid in uids}
win_rate = {uid: 0 for uid in uids}
for i, uid_i in enumerate(uids):
for uid_i in uids:
total_matches = 0
block_i = uid_to_block[uid_i]
for j, uid_j in enumerate(uids):
if i == j:
for uid_j in uids:
if uid_i == uid_j:
continue
block_j = uid_to_block[uid_j]
for batch_idx, _ in enumerate(batches):
loss_i = losses_per_uid[uid_i][batch_idx]
loss_j = losses_per_uid[uid_j][batch_idx]

for loss_i, loss_j in zip(losses_per_uid[uid_i], losses_per_uid[uid_j]):
wins[uid_i] += (
1
if iswin(
loss_i, loss_j, block_i, block_j, epsilon_func, current_block
loss_i,
loss_j,
uid_to_block[uid_i],
uid_to_block[uid_j],
epsilon_func,
current_block,
)
else 0
)
Expand Down

0 comments on commit fde1e89

Please sign in to comment.