Skip to content

Commit

Permalink
Merge pull request #214 from macrocosm-os/dev
Browse files Browse the repository at this point in the history
Release 4.6.3
  • Loading branch information
cryptal-mc authored Dec 12, 2024
2 parents bc90394 + b29ea26 commit 75356e4
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 41 deletions.
27 changes: 23 additions & 4 deletions constants/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# ---------------------------------

# Release
__version__ = "4.6.2"
__version__ = "4.6.3"

# Validator schema version
__validator_version__ = "4.6.0"
Expand Down Expand Up @@ -173,7 +173,27 @@
epsilon_func=LinearDecay(0.005, 0.0002, 36000),
max_bytes=29 * 1024 * 1024 * 1024,
),

# This constraint is not actually used; it is a copy of the
# 14B-model competition constraint entry.
# It exists only to keep the size of the constraint dict equal
# to the number of competitions so that `updated_models_limit` is
# set correctly below.
# This hack will be removed once native support for multiple datasets
# is implemented in a future release.
CompetitionId.B14_MODEL_MULTI_DATASET: ModelConstraints(
max_model_parameter_size=13_900_000_000,
min_model_parameter_size=13_700_000_000,
sequence_length=4096,
allowed_architectures=ALLOWED_MODEL_TYPES_2,
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=EVAL_BLOCK_DELAY,
epsilon_func=LinearDecay(0.005, 0.0002, 36000),
max_bytes=29 * 1024 * 1024 * 1024,
),
}

# Schedule of competitions by block.
Expand Down Expand Up @@ -217,7 +237,6 @@
0.4,
),
],

),
]

Expand Down Expand Up @@ -258,7 +277,7 @@
sample_min = 5
# Max number of uids that can be either pending eval or currently being evaluated.
# We allow the sample_min per competition + 10 additional models to be held at any one time.
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_2) + 10
# time required between updates to the chain.
chain_update_cadence = dt.timedelta(minutes=20)
# Number of blocks required between retrying evaluation of a model.
Expand Down
38 changes: 23 additions & 15 deletions neurons/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import datetime as dt
import functools
import json
import logging
import math
import os
import pickle
Expand All @@ -35,8 +36,11 @@
import traceback
import typing
from collections import defaultdict
from retry import retry

import bittensor as bt
from bittensor.utils.btlogging.helpers import all_loggers
from bittensor.utils.btlogging.defines import BITTENSOR_LOGGER_NAME
import torch
import wandb

Expand Down Expand Up @@ -126,8 +130,16 @@ def state_path(self) -> str:

def __init__(self):
self.config = config.validator_config()
# Manually default to info before overriding with arguments.
# If this is not done then info logging does not work in the cases where other modes are not specified.
bt.logging.set_info()
bt.logging(config=self.config)

# Setting the logging level on bittensor also changes the level of every other logger, which we don't want, so explicitly set all non-bittensor loggers to WARNING here.
for logger in all_loggers():
if not logger.name.startswith(BITTENSOR_LOGGER_NAME):
logger.setLevel(logging.WARNING)

bt.logging.info(f"Starting validator with config: {self.config}")

# === Bittensor objects ====
Expand Down Expand Up @@ -172,7 +184,7 @@ def __init__(self):
self._new_wandb_run()

# === Running args ===
self.weights = torch.zeros_like(torch.tensor(self.metagraph.S))
self.weights = torch.zeros_like(torch.from_numpy(self.metagraph.S))
self.global_step = 0
self.last_epoch = self.metagraph.block.item()

Expand Down Expand Up @@ -380,6 +392,9 @@ def update_models(self):
# Track how recently we checked the list of top models.
last_checked_top_models_time = None

# Delay the first update loop by 60 seconds to give the metagraph time to sync.
time.sleep(60)

# The below loop iterates across all miner uids and checks to see
# if they should be updated.
while not self.stop_event.is_set():
Expand Down Expand Up @@ -713,7 +728,7 @@ async def _try_set_weights():
netuid=self.config.netuid,
wallet=self.wallet,
uids=uids,
weights=self.weights,
weights=self.weights.numpy(),
wait_for_inclusion=False,
version_key=constants.weights_version_key,
)
Expand All @@ -722,15 +737,6 @@ async def _try_set_weights():
except:
bt.logging.warning("Failed to set weights. Trying again later.")

ws, ui = self.weights.topk(len(self.weights))
table = Table(title="All Weights")
table.add_column("uid", justify="right", style="cyan", no_wrap=True)
table.add_column("weight", style="magenta")
for index, weight in list(zip(ui.tolist(), ws.tolist())):
table.add_row(str(index), str(round(weight, 4)))
console = Console()
console.print(table)

try:
bt.logging.debug(f"Setting weights.")
await asyncio.wait_for(_try_set_weights(), ttl)
Expand All @@ -740,8 +746,12 @@ async def _try_set_weights():

def _get_current_block(self) -> int:
"""Returns the current block."""
try:
@retry(tries=5, delay=1, backoff=2)
def _get_block_with_retry():
return self.subtensor.block

try:
return _get_block_with_retry()
except:
bt.logging.debug(
"Failed to get the latest block from the chain. Using the block from the cached metagraph."
Expand Down Expand Up @@ -854,8 +864,6 @@ async def run_step(self):

bt.logging.trace(f"Current block: {cur_block}")



if cur_block < constants.BLOCK_STACK_V2_DEDUP:
dataset_by_competition_id = constants.DATASET_BY_COMPETITION_ID
else:
Expand Down Expand Up @@ -1232,7 +1240,7 @@ def _compute_and_set_competition_weights(

# Fill in metagraph sized tensor with the step weights of the evaluated models.
with self.metagraph_lock:
competition_weights = torch.zeros_like(self.metagraph.S)
competition_weights = torch.zeros_like(torch.from_numpy(self.metagraph.S))

for i, uid_i in enumerate(uids):
competition_weights[uid_i] = step_weights[i]
Expand Down
36 changes: 18 additions & 18 deletions pretrain/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@
from pprint import pprint

import os
from dotenv import load_dotenv
from dotenv import load_dotenv
load_dotenv()

class SubsetLoader(IterableDataset):
"""Base class for data-specific subset loader classes."""

name: str = None # Dataset name
rows_base_url: str = "https://datasets-server.huggingface.co/rows"
size_base_url: str = "https://datasets-server.huggingface.co/size"
max_pages: int = None

def __init__(
self,
batch_size=None,
Expand Down Expand Up @@ -78,9 +78,9 @@ def __init__(
self._initialize_pages()
fetch_attempt += 1

# Exit if the buffer has at least one batch
# Exit if the buffer has at least one batch
if len(self.buffer) >= self.sequence_length:
break
break

bt.logging.warning(
f"All fetched pages seem to be empty or have an extremely low token count. "
Expand Down Expand Up @@ -139,14 +139,14 @@ def _fetch_data_for_page(self, page):
})
else:
self.params["offset"] = page

self.params["length"] = self.num_rows_per_page

attempt = 0
while attempt < self.retry_limit:
try:
response = requests.get(
self.rows_base_url,
self.rows_base_url,
params=self.params,
headers=self._get_request_headers()
)
Expand Down Expand Up @@ -183,9 +183,9 @@ def get_page_names(self):
"""Get page names in consistent format"""
if not hasattr(self, 'pages'):
return []

if isinstance(self.pages[0], tuple):
return [f"{cfg_name}_{num_rows}_{split}"
return [f"{cfg_name}_{num_rows}_{split}"
for cfg_name, num_rows, split in self.pages]
return self.pages

Expand Down Expand Up @@ -257,15 +257,15 @@ def __init__(self, **kwargs):
aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"])

self.s3_sess = session.client("s3")

super().__init__(requires_auth=True, **kwargs)


def _download_row_content(self, blob_id, src_encoding):
"""Download the row content from S3.
"""
s3_url = f"s3://softwareheritage/content/{blob_id}"

s3_url = f"https://softwareheritage.s3.amazonaws.com/content/{blob_id}"

with smart_open.open(s3_url, "rb", compression=".gz", transport_params={"client": self.s3_sess}) as fin:
content = fin.read().decode(src_encoding)
Expand All @@ -277,7 +277,7 @@ def _get_content_from_row(self, row):

content = self._download_row_content(row['row']['blob_id'], row['row']['src_encoding'])
return content


class SubsetFalconLoader(SubsetLoader):
max_pages: int = 968000015
Expand All @@ -286,14 +286,14 @@ class SubsetFalconLoader(SubsetLoader):

class SubsetFineWebEdu2Loader(SubsetLoader):
name: str = "HuggingFaceFW/fineweb-edu-score-2"

def fetch_dataset_configs(self) -> typing.Dict[str, typing.Dict]:
"""
Fetch dataset configs and their metadata.
Returns a dictionary with config names as keys and metadata as values.
"""
params = dict(dataset=self.name)

attempt = 0
while attempt < self.retry_limit:
try:
Expand Down Expand Up @@ -385,7 +385,7 @@ def get_random_pages(self, num_pages, initial_offset):
split = self.configs_data[config_name]["split"]
pages.append((config_name, selected_page_start, split))
return pages

def fetch_data_to_rows(self, num_pages):
"""Fetch data and return raw text rows instead of adding to buffer."""
downloaded_pages = set()
Expand Down
7 changes: 3 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
torch==2.4.1
bittensor==6.9.4
bittensor==8.4.3
huggingface-hub==0.25.2
matplotlib==3.9.2
pydantic==1.10
python-dotenv==1.0.1
rich==13.9.2
safetensors==0.4.5
numpy==2.1.2
numpy==2.0.1
transformers==4.44.1
wandb==0.18.3
datasets==3.0.1
flash-attn==2.6.3
smart-open[s3]==7.0.5
taoverse==1.0.9
taoverse==1.3.1

0 comments on commit 75356e4

Please sign in to comment.