Skip to content

Commit

Permalink
Merge pull request #178 from macrocosm-os/dev
Browse files Browse the repository at this point in the history
Release 4.5.0
  • Loading branch information
cryptal-mc authored Oct 4, 2024
2 parents 721efa7 + 362f603 commit fde1e89
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 38 deletions.
81 changes: 75 additions & 6 deletions constants/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
# ---------------------------------

# Release
__version__ = "4.4.0"
__version__ = "4.5.0"

# Validator schema version
__validator_version__ = "3.1.0"
__validator_version__ = "3.2.0"
version_split = __validator_version__.split(".")
__spec_version__ = (
(1000 * int(version_split[0]))
Expand All @@ -64,6 +64,9 @@
# Starting block for 3B, 7B* (epsilon experiment) and sample unpacking
BLOCK_3B_7BSTAR_UNPACK = 3_601_190

# Starting block for activating sample unpacking
BLOCK_SAMPLE_PACK = 4_001_017

# Minimum percent of weight on a vali for a miner to be considered a top miner.
# Since there can be multiple competitions at different reward percentages we can't just check biggest.
WEIGHT_SYNC_MINER_MIN_PERCENT = 0.05
Expand Down Expand Up @@ -95,7 +98,7 @@
DATASET_BY_COMPETITION_ID: Dict[CompetitionId, str] = {
CompetitionId.M772_MODEL: pt.dataset.SubsetFalconLoader,
CompetitionId.B3_MODEL: pt.dataset.SubsetFalconLoader,
CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader,
}

Expand Down Expand Up @@ -167,6 +170,20 @@
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
max_model_parameter_size=3_400_000_000,
min_model_parameter_size=3_200_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B7_MODEL: ModelConstraints(
max_model_parameter_size=6_900_000_000,
min_model_parameter_size=6_700_000_000,
Expand All @@ -181,6 +198,34 @@
epsilon_func=LinearDecay(0.005, 0.001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
max_model_parameter_size=13_900_000_000,
min_model_parameter_size=13_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 100800),
max_bytes=29 * 1024 * 1024 * 1024,
),
}

# Defined model constraints by competition id with decaying epsilon
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2: Dict[CompetitionId, ModelConstraints] = {
CompetitionId.M772_MODEL: ModelConstraints(
max_model_parameter_size=772_000_000,
min_model_parameter_size=572_000_000,
sequence_length=1024,
allowed_architectures=ALLOWED_MODEL_TYPES_1,
tokenizer="distilgpt2",
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.0001, 50400),
max_bytes=5 * 1024 * 1024 * 1024,
),
CompetitionId.B3_MODEL: ModelConstraints(
max_model_parameter_size=3_400_000_000,
min_model_parameter_size=3_200_000_000,
Expand All @@ -192,7 +237,7 @@
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 50400),
epsilon_func=LinearDecay(0.005, 0.0001, 50400),
max_bytes=15 * 1024 * 1024 * 1024,
),
CompetitionId.B14_MODEL: ModelConstraints(
Expand All @@ -206,7 +251,7 @@
"attn_implementation": "flash_attention_2",
},
eval_block_delay=0,
epsilon_func=LinearDecay(0.005, 0.001, 100800),
epsilon_func=LinearDecay(0.005, 0.0001, 100800),
max_bytes=29 * 1024 * 1024 * 1024,
),
}
Expand Down Expand Up @@ -304,6 +349,27 @@
),
],
),
(
BLOCK_SAMPLE_PACK,
[
Competition(
CompetitionId.M772_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.M772_MODEL],
0.14,
),
Competition(
CompetitionId.B3_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.B3_MODEL],
0.29,
),
Competition(
CompetitionId.B14_MODEL,
MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2[CompetitionId.B14_MODEL],
0.57,
),
],
),


]

Expand All @@ -329,6 +395,9 @@
# 0.01 gives ~96% to best model with only ~3 receiving any weights.
temperature = 0.01

# block to activate sample packing
sample_pack_block = BLOCK_SAMPLE_PACK

# validators number of pages to eval over miners on each step.
pages_per_eval_unpack = 5 # With sample unpacking
pages_per_eval_pack = 18
Expand All @@ -339,7 +408,7 @@
sample_min = 5
# Max number of uids that can be either pending eval or currently being evaluated.
# We allow the sample_min per competition + 10 additional models to be held at any one time.
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_LINEAR_DECAY_2) + 10
# time required between updates to the chain.
chain_update_cadence = dt.timedelta(minutes=20)
# Number of blocks required between retrying evaluation of a model.
Expand Down
41 changes: 20 additions & 21 deletions neurons/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ class PerUIDEvalState:
# The losses per batch.
losses: typing.List[float] = dataclasses.field(default=None)

    def avg_loss(self) -> float:
        """Safely computes the average loss from a list of losses.

        Returns:
            float: The arithmetic mean of ``self.losses``, or ``math.inf``
                when ``self.losses`` is empty or ``None`` (the dataclass
                field defaults to ``None``), avoiding a ZeroDivisionError.
        """
        return sum(self.losses) / len(self.losses) if self.losses else math.inf


class Validator:
MODEL_TRACKER_FILENAME = "model_tracker.pickle"
Expand Down Expand Up @@ -795,11 +799,14 @@ async def run_step(self):
tokenizer = pt.model.load_tokenizer(
competition.constraints, cache_dir=self.config.model_dir
)

pack_samples = False
pages_per_eval = constants.pages_per_eval_unpack



if cur_block >= constants.sample_pack_block:
pack_samples = True
pages_per_eval = constants.pages_per_eval_pack
else:
pack_samples = False
pages_per_eval = constants.pages_per_eval_unpack

# If the option is set in the config, override
pages_per_eval = (
self.config.pages_per_eval
Expand Down Expand Up @@ -905,12 +912,15 @@ async def run_step(self):
)

# Compute wins and win rates per uid.
losses_per_uid = {uid: state.losses for uid, state in uid_to_state.items()}
# Take the average loss across all batches for comparison of best model.
# Keep it as a list of 1 for later calculations.
losses_per_uid = {
uid: [state.avg_loss()] for uid, state in uid_to_state.items()
}
uid_to_block = {uid: state.block for uid, state in uid_to_state.items()}
wins, win_rate = pt.validation.compute_wins(
uids,
losses_per_uid,
batches,
uid_to_block,
competition.constraints.epsilon_func,
cur_block,
Expand Down Expand Up @@ -1038,29 +1048,18 @@ def _record_eval_results(
curr_block (int): The current block.
uid_to_state (typing.Dict[int, PerUIDEvalState]): A dictionary mapping uids to their eval state.
"""
top_model_loss = self._compute_avg_loss(uid_to_state[top_uid].losses)
top_model_loss = uid_to_state[top_uid].avg_loss()
for _, state in uid_to_state.items():
self.model_tracker.on_model_evaluated(
state.hotkey,
EvalResult(
block=curr_block,
score=self._compute_avg_loss(state.losses),
score=state.avg_loss(),
winning_model_block=uid_to_state[top_uid].block,
winning_model_score=top_model_loss,
),
)

def _compute_avg_loss(self, losses: typing.List[float]) -> float:
"""Safely computes the average loss from a list of losses.
Args:
losses (typing.List[float]): A list of losses.
Returns:
float: The average loss.
"""
return sum(losses) / len(losses) if losses else math.inf

def log_step(
self,
competition_id: CompetitionId,
Expand Down Expand Up @@ -1098,7 +1097,7 @@ def log_step(
"block": uid_to_state[uid].block,
"hf": uid_to_state[uid].repo_name,
"competition_id": int(competition_id),
"average_loss": self._compute_avg_loss(uid_to_state[uid].losses),
"average_loss": uid_to_state[uid].avg_loss(),
"epsilon_adv": competition_epsilon_func.compute_epsilon(
current_block, uid_to_state[uid].block
),
Expand Down
22 changes: 11 additions & 11 deletions pretrain/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def iswin(
def compute_wins(
uids: typing.List[int],
losses_per_uid: typing.Dict[int, typing.List[float]],
batches: typing.List[torch.FloatTensor],
uid_to_block: typing.Dict[int, int],
epsilon_func: EpsilonFunc,
current_block: int,
Expand All @@ -78,7 +77,6 @@ def compute_wins(
Parameters:
uids (list): A list of uids to compare.
losses_per_uid (dict): A dictionary of losses for each uid by batch.
batches (List): A list of data batches.
uid_to_block (dict): A dictionary of blocks for each uid.
epsilon_func (EpsilonFunc): Function that determines how much advantage to give to the earlier block.
current_block: The current block.
Expand All @@ -88,20 +86,22 @@ def compute_wins(
"""
wins = {uid: 0 for uid in uids}
win_rate = {uid: 0 for uid in uids}
for i, uid_i in enumerate(uids):
for uid_i in uids:
total_matches = 0
block_i = uid_to_block[uid_i]
for j, uid_j in enumerate(uids):
if i == j:
for uid_j in uids:
if uid_i == uid_j:
continue
block_j = uid_to_block[uid_j]
for batch_idx, _ in enumerate(batches):
loss_i = losses_per_uid[uid_i][batch_idx]
loss_j = losses_per_uid[uid_j][batch_idx]

for loss_i, loss_j in zip(losses_per_uid[uid_i], losses_per_uid[uid_j]):
wins[uid_i] += (
1
if iswin(
loss_i, loss_j, block_i, block_j, epsilon_func, current_block
loss_i,
loss_j,
uid_to_block[uid_i],
uid_to_block[uid_j],
epsilon_func,
current_block,
)
else 0
)
Expand Down

0 comments on commit fde1e89

Please sign in to comment.