Skip to content

Commit

Permalink
Merge pull request #198 from macrocosm-os/dev
Browse files Browse the repository at this point in the history
Release 4.6.0
  • Loading branch information
cryptal-mc authored Nov 8, 2024
2 parents dd068a2 + 6ddb779 commit 69458c0
Show file tree
Hide file tree
Showing 12 changed files with 976 additions and 717 deletions.
2 changes: 2 additions & 0 deletions competitions/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ class CompetitionId(IntEnum):

B14_MODEL = 4

B14_MODEL_MULTI_DATASET = 5

# Override the default __repr__, which doesn't work with
# bt.logging for some unknown reason.
def __repr__(self) -> str:
Expand Down
232 changes: 39 additions & 193 deletions constants/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@
# ---------------------------------

# Release
__version__ = "4.5.3"
__version__ = "4.6.0"

# Validator schema version
__validator_version__ = "3.4.0"
__validator_version__ = "4.6.0"
version_split = __validator_version__.split(".")
__spec_version__ = (
(1000 * int(version_split[0]))
Expand All @@ -64,6 +64,9 @@
# Starting block for activating sample unpacking
BLOCK_SAMPLE_PACK = 4_001_017

# Starting block for 14B* (multi dataset experiment).
BLOCK_14B_STAR = 4_252_646

# Minimum percent of weight on a validator for a miner to be considered a top miner.
# Since there can be multiple competitions at different reward percentages, we can't just check for the biggest weight.
WEIGHT_SYNC_MINER_MIN_PERCENT = 0.05
Expand Down Expand Up @@ -97,76 +100,25 @@
CompetitionId.B3_MODEL: pt.dataset.SubsetFalconLoader,
CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
# B14 model multi dataset adds the following dataset to the baseline b14 competition.
CompetitionId.B14_MODEL_MULTI_DATASET: pt.dataset.SubsetStackV1DedupLoader,
}

# Defined model constraints by competition id to ensure they are constant across blocks.
MODEL_CONSTRAINTS_BY_COMPETITION_ID: Dict[CompetitionId, ModelConstraints] = {
CompetitionId.M772_MODEL: ModelConstraints(
max_model_parameter_size=772_000_000,
min_model_parameter_size=572_000_000,
sequence_length=1024,
allowed_architectures=ALLOWED_MODEL_TYPES_1,
tokenizer="distilgpt2",
eval_block_delay=0,
epsilon_func=FixedEpsilon(0.005),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B7_MODEL: ModelConstraints(
max_model_parameter_size=6_900_000_000,
min_model_parameter_size=6_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=FixedEpsilon(0.005),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
max_model_parameter_size=3_400_000_000,
min_model_parameter_size=3_200_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=FixedEpsilon(0.005),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
max_model_parameter_size=13_900_000_000,
min_model_parameter_size=13_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=FixedEpsilon(0.005),
max_bytes=29 * 1024 * 1024 * 1024,
),
}
# Synchronize on blocks roughly every 30 minutes.
SYNC_BLOCK_CADENCE = 150
# Delay at least as long as the sync block cadence with an additional buffer.
EVAL_BLOCK_DELAY = SYNC_BLOCK_CADENCE + 100

# Defined model constraints by competition id with decaying epsilon
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY: Dict[
CompetitionId, ModelConstraints
] = {
MODEL_CONSTRAINTS_BY_COMPETITION_ID: Dict[CompetitionId, ModelConstraints] = {
CompetitionId.M772_MODEL: ModelConstraints(
max_model_parameter_size=772_000_000,
min_model_parameter_size=572_000_000,
sequence_length=1024,
allowed_architectures=ALLOWED_MODEL_TYPES_1,
tokenizer="distilgpt2",
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
eval_block_delay=EVAL_BLOCK_DELAY,
epsilon_func=LinearDecay(0.005, 0.0001, 50400),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
Expand All @@ -179,22 +131,8 @@
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B7_MODEL: ModelConstraints(
max_model_parameter_size=6_900_000_000,
min_model_parameter_size=6_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
eval_block_delay=EVAL_BLOCK_DELAY,
epsilon_func=LinearDecay(0.005, 0.0001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
Expand All @@ -207,24 +145,21 @@
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 100800),
eval_block_delay=EVAL_BLOCK_DELAY,
epsilon_func=LinearDecay(0.005, 0.0001, 72000),
max_bytes=29 * 1024 * 1024 * 1024,
),
}

# Defined model constraints by competition id with decaying epsilon
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2: Dict[
CompetitionId, ModelConstraints
] = {
MODEL_CONSTRAINTS_BY_COMPETITION_ID_2: Dict[CompetitionId, ModelConstraints] = {
CompetitionId.M772_MODEL: ModelConstraints(
max_model_parameter_size=772_000_000,
min_model_parameter_size=572_000_000,
sequence_length=1024,
allowed_architectures=ALLOWED_MODEL_TYPES_1,
tokenizer="distilgpt2",
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.0001, 50400),
eval_block_delay=EVAL_BLOCK_DELAY,
epsilon_func=LinearDecay(0.005, 0.0005, 50400),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
Expand All @@ -237,8 +172,8 @@
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.0001, 50400),
eval_block_delay=EVAL_BLOCK_DELAY,
epsilon_func=LinearDecay(0.005, 0.0005, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
Expand All @@ -251,42 +186,16 @@
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.0001, 72000),
eval_block_delay=EVAL_BLOCK_DELAY,
epsilon_func=LinearDecay(0.005, 0.0005, 50400),
max_bytes=29 * 1024 * 1024 * 1024,
),
}


# Schedule of competitions by block.
COMPETITION_SCHEDULE_BY_BLOCK: List[Tuple[int, List[Competition]]] = [
(
0,
[
Competition(
CompetitionId.B7_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL],
1.0,
)
],
),
(
3_565_190,
[
Competition(
CompetitionId.M772_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.M772_MODEL],
0.35,
),
Competition(
CompetitionId.B7_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL],
0.65,
),
],
),
(
BLOCK_3B_7BSTAR_UNPACK,
[
Competition(
CompetitionId.M772_MODEL,
Expand All @@ -298,96 +207,31 @@
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B3_MODEL],
0.29,
),
Competition(
CompetitionId.B7_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL],
0.57,
),
],
),
(
3_750_683,
[
Competition(
CompetitionId.M772_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
CompetitionId.M772_MODEL
],
0.14,
),
Competition(
CompetitionId.B3_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
CompetitionId.B3_MODEL
],
0.29,
),
Competition(
CompetitionId.B7_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
CompetitionId.B7_MODEL
],
0.15,
),
Competition(
CompetitionId.B14_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
CompetitionId.B14_MODEL
],
0.42,
MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL],
0.57,
),
],
),
(
3_849_722,
BLOCK_14B_STAR,
[
Competition(
CompetitionId.M772_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
CompetitionId.M772_MODEL
],
0.14,
),
Competition(
CompetitionId.B3_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
CompetitionId.B3_MODEL
],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_2[CompetitionId.B3_MODEL],
0.29,
),
Competition(
CompetitionId.B14_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY[
CompetitionId.B14_MODEL
],
MODEL_CONSTRAINTS_BY_COMPETITION_ID_2[CompetitionId.B14_MODEL],
0.57,
),
],
),
(
BLOCK_SAMPLE_PACK,
[
Competition(
CompetitionId.M772_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[
CompetitionId.M772_MODEL
],
CompetitionId.B14_MODEL_MULTI_DATASET,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_2[CompetitionId.B14_MODEL],
0.14,
),
Competition(
CompetitionId.B3_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[
CompetitionId.B3_MODEL
],
0.29,
),
Competition(
CompetitionId.B14_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[
CompetitionId.B14_MODEL
],
0.57,
),
],
),
]
Expand Down Expand Up @@ -418,18 +262,20 @@
sample_pack_block = BLOCK_SAMPLE_PACK

# Number of pages validators evaluate miners over on each step.
pages_per_eval_unpack = 5 # With sample unpacking
pages_per_eval_pack = 11
pages_per_eval_unpack = 10 # With sample unpacking
pages_per_eval_pack = 22

# In a future release we will update the loaders to load a certain number of tokens rather than pages.
# Until then, we need to set this manually.
pages_per_eval_14bstar_pack = 1

# validator eval batch size.
batch_size = 1
# validator eval batch min to keep for next loop.
sample_min = 5
# Max number of uids that can be either pending eval or currently being evaluated.
# We allow the sample_min per competition + 10 additional models to be held at any one time.
updated_models_limit = (
sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2) + 10
)
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
# time required between updates to the chain.
chain_update_cadence = dt.timedelta(minutes=20)
# Number of blocks required between retrying evaluation of a model.
Expand Down
2 changes: 2 additions & 0 deletions docs/miner.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ You can manually upload with the following command:
python scripts/upload_model.py --load_model_dir <path to model> --competition_id 0 --hf_repo_id my-username/my-project --wallet.name coldkey --wallet.hotkey hotkey
```

Note: We recommend keeping your Hugging Face repo private until after you have committed your metadata to the chain. This ensures other miners are unable to upload your model as their own until a later block. Adding the `--update_repo_visibility` flag will also automatically attempt to update the Hugging Face repo visibility to public after committing to the chain.

Note: If you are not sure about the competition ID, you can add the `--list_competitions` flag to get a list of all competitions. You can also check out competition IDs in [competitions/data.py](https://github.com/macrocosm-os/pretraining/blob/main/competitions/data.py).

## Running a custom Miner
Expand Down
Loading

0 comments on commit 69458c0

Please sign in to comment.