Merge pull request #198 from macrocosm-os/dev

Release 4.6.0
macrocosm-os · Nov 8, 2024 · 69458c0 · 69458c0
2 parents dd068a2 + 6ddb779
commit 69458c0
Show file tree

Hide file tree

Showing 12 changed files with 976 additions and 717 deletions.
diff --git a/competitions/data.py b/competitions/data.py
@@ -14,6 +14,8 @@ class CompetitionId(IntEnum):
 
     B14_MODEL = 4
 
+    B14_MODEL_MULTI_DATASET = 5
+
     # Overwrite the default __repr__, which doesn't work with
     # bt.logging for some unknown reason.
     def __repr__(self) -> str:

diff --git a/constants/__init__.py b/constants/__init__.py
@@ -34,10 +34,10 @@
 # ---------------------------------
 
 # Release
-__version__ = "4.5.3"
+__version__ = "4.6.0"
 
 # Validator schema version
-__validator_version__ = "3.4.0"
+__validator_version__ = "4.6.0"
 version_split = __validator_version__.split(".")
 __spec_version__ = (
     (1000 * int(version_split[0]))
@@ -64,6 +64,9 @@
 # Starting block for activating sample unpacking
 BLOCK_SAMPLE_PACK = 4_001_017
 
+# Starting block for 14B* (multi dataset experiment).
+BLOCK_14B_STAR = 4_252_646
+
 # Minimum percent of weight on a vali for a miner to be considered a top miner.
 # Since there can be multiple competitions at different reward percentages we can't just check biggest.
 WEIGHT_SYNC_MINER_MIN_PERCENT = 0.05
@@ -97,76 +100,25 @@
     CompetitionId.B3_MODEL: pt.dataset.SubsetFalconLoader,
     CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
     CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
+    # The B14 multi-dataset competition adds the following dataset to the baseline B14 competition.
+    CompetitionId.B14_MODEL_MULTI_DATASET: pt.dataset.SubsetStackV1DedupLoader,
 }
 
-# Defined model constraints by competition id to ensure they are constant across blocks.
-MODEL_CONSTRAINTS_BY_COMPETITION_ID: Dict[CompetitionId, ModelConstraints] = {
-    CompetitionId.M772_MODEL: ModelConstraints(
-        max_model_parameter_size=772_000_000,
-        min_model_parameter_size=572_000_000,
-        sequence_length=1024,
-        allowed_architectures=ALLOWED_MODEL_TYPES_1,
-        tokenizer="distilgpt2",
-        eval_block_delay=0,
-        epsilon_func=FixedEpsilon(0.005),
-        max_bytes=5 * 1024 * 1024 * 1024,
-    ),
-    CompetitionId.B7_MODEL: ModelConstraints(
-        max_model_parameter_size=6_900_000_000,
-        min_model_parameter_size=6_700_000_000,
-        sequence_length=4096,
-        allowed_architectures=ALLOWED_MODEL_TYPES_2,
-        tokenizer="Xenova/gpt-4",
-        kwargs={
-            "torch_dtype": torch.bfloat16,
-            "attn_implementation": "flash_attention_2",
-        },
-        eval_block_delay=0,
-        epsilon_func=FixedEpsilon(0.005),
-        max_bytes=15 * 1024 * 1024 * 1024,
-    ),
-    CompetitionId.B3_MODEL: ModelConstraints(
-        max_model_parameter_size=3_400_000_000,
-        min_model_parameter_size=3_200_000_000,
-        sequence_length=4096,
-        allowed_architectures=ALLOWED_MODEL_TYPES_2,
-        tokenizer="Xenova/gpt-4",
-        kwargs={
-            "torch_dtype": torch.bfloat16,
-            "attn_implementation": "flash_attention_2",
-        },
-        eval_block_delay=0,
-        epsilon_func=FixedEpsilon(0.005),
-        max_bytes=15 * 1024 * 1024 * 1024,
-    ),
-    CompetitionId.B14_MODEL: ModelConstraints(
-        max_model_parameter_size=13_900_000_000,
-        min_model_parameter_size=13_700_000_000,
-        sequence_length=4096,
-        allowed_architectures=ALLOWED_MODEL_TYPES_2,
-        tokenizer="Xenova/gpt-4",
-        kwargs={
-            "torch_dtype": torch.bfloat16,
-            "attn_implementation": "flash_attention_2",
-        },
-        eval_block_delay=0,
-        epsilon_func=FixedEpsilon(0.005),
-        max_bytes=29 * 1024 * 1024 * 1024,
-    ),
-}
+# Synchronize on blocks roughly every 30 minutes.
+SYNC_BLOCK_CADENCE = 150
+# Delay at least as long as the sync block cadence with an additional buffer.
+EVAL_BLOCK_DELAY = SYNC_BLOCK_CADENCE + 100
 
 # Defined model constraints by competition id with decaying epsilon
-MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY: Dict[
-    CompetitionId, ModelConstraints
-] = {
+MODEL_CONSTRAINTS_BY_COMPETITION_ID: Dict[CompetitionId, ModelConstraints] = {
     CompetitionId.M772_MODEL: ModelConstraints(
         max_model_parameter_size=772_000_000,
         min_model_parameter_size=572_000_000,
         sequence_length=1024,
         allowed_architectures=ALLOWED_MODEL_TYPES_1,
         tokenizer="distilgpt2",
-        eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.001, 50400),
+        eval_block_delay=EVAL_BLOCK_DELAY,
+        epsilon_func=LinearDecay(0.005, 0.0001, 50400),
         max_bytes=5 * 1024 * 1024 * 1024,
     ),
     CompetitionId.B3_MODEL: ModelConstraints(
@@ -179,22 +131,8 @@
             "torch_dtype": torch.bfloat16,
             "attn_implementation": "flash_attention_2",
         },
-        eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.001, 50400),
-        max_bytes=15 * 1024 * 1024 * 1024,
-    ),
-    CompetitionId.B7_MODEL: ModelConstraints(
-        max_model_parameter_size=6_900_000_000,
-        min_model_parameter_size=6_700_000_000,
-        sequence_length=4096,
-        allowed_architectures=ALLOWED_MODEL_TYPES_2,
-        tokenizer="Xenova/gpt-4",
-        kwargs={
-            "torch_dtype": torch.bfloat16,
-            "attn_implementation": "flash_attention_2",
-        },
-        eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.001, 50400),
+        eval_block_delay=EVAL_BLOCK_DELAY,
+        epsilon_func=LinearDecay(0.005, 0.0001, 50400),
         max_bytes=15 * 1024 * 1024 * 1024,
     ),
     CompetitionId.B14_MODEL: ModelConstraints(
@@ -207,24 +145,21 @@
             "torch_dtype": torch.bfloat16,
             "attn_implementation": "flash_attention_2",
         },
-        eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.001, 100800),
+        eval_block_delay=EVAL_BLOCK_DELAY,
+        epsilon_func=LinearDecay(0.005, 0.0001, 72000),
         max_bytes=29 * 1024 * 1024 * 1024,
     ),
 }
 
-# Defined model constraints by competition id with decaying epsilon
-MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2: Dict[
-    CompetitionId, ModelConstraints
-] = {
+MODEL_CONSTRAINTS_BY_COMPETITION_ID_2: Dict[CompetitionId, ModelConstraints] = {
     CompetitionId.M772_MODEL: ModelConstraints(
         max_model_parameter_size=772_000_000,
         min_model_parameter_size=572_000_000,
         sequence_length=1024,
         allowed_architectures=ALLOWED_MODEL_TYPES_1,
         tokenizer="distilgpt2",
-        eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.0001, 50400),
+        eval_block_delay=EVAL_BLOCK_DELAY,
+        epsilon_func=LinearDecay(0.005, 0.0005, 50400),
         max_bytes=5 * 1024 * 1024 * 1024,
     ),
     CompetitionId.B3_MODEL: ModelConstraints(
@@ -237,8 +172,8 @@
             "torch_dtype": torch.bfloat16,
             "attn_implementation": "flash_attention_2",
         },
-        eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.0001, 50400),
+        eval_block_delay=EVAL_BLOCK_DELAY,
+        epsilon_func=LinearDecay(0.005, 0.0005, 50400),
         max_bytes=15 * 1024 * 1024 * 1024,
     ),
     CompetitionId.B14_MODEL: ModelConstraints(
@@ -251,42 +186,16 @@
             "torch_dtype": torch.bfloat16,
             "attn_implementation": "flash_attention_2",
         },
-        eval_block_delay=0,
-        epsilon_func=LinearDecay(0.005, 0.0001, 72000),
+        eval_block_delay=EVAL_BLOCK_DELAY,
+        epsilon_func=LinearDecay(0.005, 0.0005, 50400),
         max_bytes=29 * 1024 * 1024 * 1024,
     ),
 }
 
-
 # Schedule of competitions by block.
 COMPETITION_SCHEDULE_BY_BLOCK: List[Tuple[int, List[Competition]]] = [
     (
         0,
-        [
-            Competition(
-                CompetitionId.B7_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL],
-                1.0,
-            )
-        ],
-    ),
-    (
-        3_565_190,
-        [
-            Competition(
-                CompetitionId.M772_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.M772_MODEL],
-                0.35,
-            ),
-            Competition(
-                CompetitionId.B7_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL],
-                0.65,
-            ),
-        ],
-    ),
-    (
-        BLOCK_3B_7BSTAR_UNPACK,
         [
             Competition(
                 CompetitionId.M772_MODEL,
@@ -298,96 +207,31 @@
                 MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B3_MODEL],
                 0.29,
             ),
-            Competition(
-                CompetitionId.B7_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL],
-                0.57,
-            ),
-        ],
-    ),
-    (
-        3_750_683,
-        [
-            Competition(
-                CompetitionId.M772_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
-                    CompetitionId.M772_MODEL
-                ],
-                0.14,
-            ),
-            Competition(
-                CompetitionId.B3_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
-                    CompetitionId.B3_MODEL
-                ],
-                0.29,
-            ),
-            Competition(
-                CompetitionId.B7_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
-                    CompetitionId.B7_MODEL
-                ],
-                0.15,
-            ),
             Competition(
                 CompetitionId.B14_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
-                    CompetitionId.B14_MODEL
-                ],
-                0.42,
+                MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL],
+                0.57,
             ),
         ],
     ),
     (
-        3_849_722,
+        BLOCK_14B_STAR,
         [
-            Competition(
-                CompetitionId.M772_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
-                    CompetitionId.M772_MODEL
-                ],
-                0.14,
-            ),
             Competition(
                 CompetitionId.B3_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
-                    CompetitionId.B3_MODEL
-                ],
+                MODEL_CONSTRAINTS_BY_COMPETITION_ID_2[CompetitionId.B3_MODEL],
                 0.29,
             ),
             Competition(
                 CompetitionId.B14_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
-                    CompetitionId.B14_MODEL
-                ],
+                MODEL_CONSTRAINTS_BY_COMPETITION_ID_2[CompetitionId.B14_MODEL],
                 0.57,
             ),
-        ],
-    ),
-    (
-        BLOCK_SAMPLE_PACK,
-        [
             Competition(
-                CompetitionId.M772_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[
-                    CompetitionId.M772_MODEL
-                ],
+                CompetitionId.B14_MODEL_MULTI_DATASET,
+                MODEL_CONSTRAINTS_BY_COMPETITION_ID_2[CompetitionId.B14_MODEL],
                 0.14,
             ),
-            Competition(
-                CompetitionId.B3_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[
-                    CompetitionId.B3_MODEL
-                ],
-                0.29,
-            ),
-            Competition(
-                CompetitionId.B14_MODEL,
-                MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[
-                    CompetitionId.B14_MODEL
-                ],
-                0.57,
-            ),
         ],
     ),
 ]
@@ -418,18 +262,20 @@
 sample_pack_block = BLOCK_SAMPLE_PACK
 
 # validators number of pages to eval over miners on each step.
-pages_per_eval_unpack = 5  # With sample unpacking
-pages_per_eval_pack = 11
+pages_per_eval_unpack = 10  # With sample unpacking
+pages_per_eval_pack = 22
+
+# In a future release we will update the loaders to load a given number of tokens rather than a number of pages.
+# Until then, the page count for the 14B* evaluation must be set manually.
+pages_per_eval_14bstar_pack = 1
 
 # validator eval batch size.
 batch_size = 1
 # validator eval batch min to keep for next loop.
 sample_min = 5
 # Max number of uids that can be either pending eval or currently being evaluated.
 # We allow the sample_min per competition + 10 additional models to be held at any one time.
-updated_models_limit = (
-    sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2) + 10
-)
+updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
 # time required between updates to the chain.
 chain_update_cadence = dt.timedelta(minutes=20)
 # Number of blocks required between retrying evaluation of a model.

diff --git a/docs/miner.md b/docs/miner.md
@@ -125,6 +125,8 @@ You can manually upload with the following command:
 python scripts/upload_model.py --load_model_dir <path to model> --competition_id 0 --hf_repo_id my-username/my-project --wallet.name coldkey --wallet.hotkey hotkey
 ```
 
+Note: We recommend keeping your Hugging Face repo private until after you have committed your metadata to the chain. This ensures other miners are unable to upload your model as their own until a later block. Adding the `--update_repo_visibility` flag will also automatically attempt to make the Hugging Face repo public after committing to the chain.
+
 Note: If you are not sure about the competition ID, you can add the `--list_competitions` flag to get a list of all competitions. You can also check out competition IDs in [competitions/data.py](https://github.com/macrocosm-os/pretraining/blob/main/competitions/data.py).
 
 ## Running a custom Miner