Merge pull request #177 from macrocosm-os/SN9-109-110

Revert to Sample Packing and lower epsilon
macrocosm-os · Oct 4, 2024 · 9b11aee · 9b11aee
2 parents 9d4ab4a + 6639300
commit 9b11aee
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 9 deletions.
diff --git a/constants/__init__.py b/constants/__init__.py
@@ -37,10 +37,10 @@
 # ---------------------------------
 
 # Release
-__version__ = "4.4.0"
+__version__ = "4.5.0"
 
 # Validator schema version
-__validator_version__ = "3.1.0"
+__validator_version__ = "3.2.0"
 version_split = __validator_version__.split(".")
 __spec_version__ = (
     (1000 * int(version_split[0]))
@@ -64,6 +64,9 @@
 # Starting block for 3B, 7B* (epsilon experiment) and sample unpacking
 BLOCK_3B_7BSTAR_UNPACK = 3_601_190
 
+# Starting block for activating sample packing
+BLOCK_SAMPLE_PACK = 4_001_017
+
 # Minimum percent of weight on a vali for a miner to be considered a top miner.
 # Since there can be multiple competitions at different reward percentages we can't just check biggest.
 WEIGHT_SYNC_MINER_MIN_PERCENT = 0.05
@@ -95,7 +98,7 @@
 DATASET_BY_COMPETITION_ID: Dict[CompetitionId, str] = {
     CompetitionId.M772_MODEL: pt.dataset.SubsetFalconLoader,
     CompetitionId.B3_MODEL: pt.dataset.SubsetFalconLoader,
-    CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
+    CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader,    
     CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
 }
 
@@ -167,6 +170,20 @@
         epsilon_func=LinearDecay(0.005, 0.001, 50400),
         max_bytes=5 * 1024 * 1024 * 1024,
     ),
+    CompetitionId.B3_MODEL: ModelConstraints(
+        max_model_parameter_size=3_400_000_000,
+        min_model_parameter_size=3_200_000_000,
+        sequence_length=4096,
+        allowed_architectures=ALLOWED_MODEL_TYPES_2,
+        tokenizer="Xenova/gpt-4",
+        kwargs={
+            "torch_dtype": torch.bfloat16,
+            "attn_implementation": "flash_attention_2",
+        },
+        eval_block_delay=0,
+        epsilon_func=LinearDecay(0.005, 0.001, 50400),
+        max_bytes=15 * 1024 * 1024 * 1024,
+    ),
     CompetitionId.B7_MODEL: ModelConstraints(
         max_model_parameter_size=6_900_000_000,
         min_model_parameter_size=6_700_000_000,
@@ -181,6 +198,34 @@
         epsilon_func=LinearDecay(0.005, 0.001, 50400),
         max_bytes=15 * 1024 * 1024 * 1024,
     ),
+    CompetitionId.B14_MODEL: ModelConstraints(
+        max_model_parameter_size=13_900_000_000,
+        min_model_parameter_size=13_700_000_000,
+        sequence_length=4096,
+        allowed_architectures=ALLOWED_MODEL_TYPES_2,
+        tokenizer="Xenova/gpt-4",
+        kwargs={
+            "torch_dtype": torch.bfloat16,
+            "attn_implementation": "flash_attention_2",
+        },
+        eval_block_delay=0,
+        epsilon_func=LinearDecay(0.005, 0.001, 100800),
+        max_bytes=29 * 1024 * 1024 * 1024,
+    ),
+}
+
+# Model constraints by competition id with decaying epsilon; LinearDecay's second argument is lowered to 0.0001.
+MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2: Dict[CompetitionId, ModelConstraints] = {
+    CompetitionId.M772_MODEL: ModelConstraints(
+        max_model_parameter_size=772_000_000,
+        min_model_parameter_size=572_000_000,
+        sequence_length=1024,
+        allowed_architectures=ALLOWED_MODEL_TYPES_1,
+        tokenizer="distilgpt2",
+        eval_block_delay=0,
+        epsilon_func=LinearDecay(0.005, 0.0001, 50400),
+        max_bytes=5 * 1024 * 1024 * 1024,
+    ),
     CompetitionId.B3_MODEL: ModelConstraints(
         max_model_parameter_size=3_400_000_000,
         min_model_parameter_size=3_200_000_000,
@@ -192,7 +237,7 @@
             "attn_implementation": "flash_attention_2",
         },
         eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.001, 50400),
+        epsilon_func=LinearDecay(0.005, 0.0001, 50400),
         max_bytes=15 * 1024 * 1024 * 1024,
     ),
     CompetitionId.B14_MODEL: ModelConstraints(
@@ -206,7 +251,7 @@
             "attn_implementation": "flash_attention_2",
         },
         eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.001, 100800),
+        epsilon_func=LinearDecay(0.005, 0.0001, 100800),
         max_bytes=29 * 1024 * 1024 * 1024,
     ),
 }
@@ -304,6 +349,27 @@
             ),
         ],
     ),
+    (
+        BLOCK_SAMPLE_PACK,
+        [
+            Competition(
+                CompetitionId.M772_MODEL,
+                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.M772_MODEL],
+                0.14,
+            ),
+            Competition(
+                CompetitionId.B3_MODEL,
+                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.B3_MODEL],
+                0.29,
+            ),
+            Competition(
+                CompetitionId.B14_MODEL,
+                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.B14_MODEL],
+                0.57,
+            ),
+        ],
+    ),
+
 
 ]
 
@@ -329,6 +395,9 @@
 # 0.01 gives ~96% to best model with only ~3 receiving any weights.
 temperature = 0.01
 
+# Block at which sample packing is activated.
+sample_pack_block = BLOCK_SAMPLE_PACK
+
 # validators number of pages to eval over miners on each step.
 pages_per_eval_unpack = 5  # With sample unpacking
 pages_per_eval_pack = 18
@@ -339,7 +408,7 @@
 sample_min = 5
 # Max number of uids that can be either pending eval or currently being evaluated.
 # We allow the sample_min per competition + 10 additional models to be held at any one time.
-updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
+updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2) + 10
 # time required between updates to the chain.
 chain_update_cadence = dt.timedelta(minutes=20)
 # Number of blocks required between retrying evaluation of a model.

diff --git a/neurons/validator.py b/neurons/validator.py
@@ -796,10 +796,14 @@ async def run_step(self):
             competition.constraints, cache_dir=self.config.model_dir
         )
 
-        pack_samples = False
-        pages_per_eval = constants.pages_per_eval_unpack
-
+        if cur_block >= constants.sample_pack_block:
+            pack_samples = True
+            pages_per_eval = constants.pages_per_eval_pack
+        else:
+            pack_samples = False
+            pages_per_eval = constants.pages_per_eval_unpack
 
+
         # If the option is set in the config, override
         pages_per_eval = (
             self.config.pages_per_eval