From 7fa3977ba39df44ff6aba04f3ade53dfd95a4525 Mon Sep 17 00:00:00 2001
From: Sid
Date: Tue, 3 Dec 2024 19:27:27 -0800
Subject: [PATCH 01/12] Bump Bittensor to 8.4.3. and taoverse to 1.3.0.

---
 neurons/validator.py | 20 ++++++--------------
 requirements.txt     |  4 ++--
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/neurons/validator.py b/neurons/validator.py
index f19e442..b533e40 100644
--- a/neurons/validator.py
+++ b/neurons/validator.py
@@ -126,6 +126,9 @@ def state_path(self) -> str:
     def __init__(self):
         self.config = config.validator_config()
 
+        # Manually default to info before overriding with arguments.
+        # If this is not done then info logging does not work in the cases where other modes are not specified.
+        bt.logging.set_info()
         bt.logging(config=self.config)
 
         bt.logging.info(f"Starting validator with config: {self.config}")
@@ -172,7 +175,7 @@ def __init__(self):
             self._new_wandb_run()
 
         # === Running args ===
-        self.weights = torch.zeros_like(torch.tensor(self.metagraph.S))
+        self.weights = torch.zeros_like(torch.from_numpy(self.metagraph.S))
         self.global_step = 0
         self.last_epoch = self.metagraph.block.item()
@@ -713,7 +716,7 @@ async def _try_set_weights():
                     netuid=self.config.netuid,
                     wallet=self.wallet,
                     uids=uids,
-                    weights=self.weights,
+                    weights=self.weights.numpy(),
                     wait_for_inclusion=False,
                     version_key=constants.weights_version_key,
                 )
@@ -722,15 +725,6 @@ async def _try_set_weights():
             except:
                 bt.logging.warning("Failed to set weights. Trying again later.")
 
-        ws, ui = self.weights.topk(len(self.weights))
-        table = Table(title="All Weights")
-        table.add_column("uid", justify="right", style="cyan", no_wrap=True)
-        table.add_column("weight", style="magenta")
-        for index, weight in list(zip(ui.tolist(), ws.tolist())):
-            table.add_row(str(index), str(round(weight, 4)))
-        console = Console()
-        console.print(table)
-
         try:
             bt.logging.debug(f"Setting weights.")
             await asyncio.wait_for(_try_set_weights(), ttl)
@@ -854,8 +848,6 @@ async def run_step(self):
 
         bt.logging.trace(f"Current block: {cur_block}")
 
-
-
         if cur_block < constants.BLOCK_STACK_V2_DEDUP:
             dataset_by_competition_id = constants.DATASET_BY_COMPETITION_ID
         else:
@@ -1232,7 +1224,7 @@ def _compute_and_set_competition_weights(
 
         # Fill in metagraph sized tensor with the step weights of the evaluated models.
         with self.metagraph_lock:
-            competition_weights = torch.zeros_like(self.metagraph.S)
+            competition_weights = torch.zeros_like(torch.from_numpy(self.metagraph.S))
 
         for i, uid_i in enumerate(uids):
             competition_weights[uid_i] = step_weights[i]
diff --git a/requirements.txt b/requirements.txt
index f38ab49..dc670c1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 torch==2.4.1
-bittensor==6.9.4
+bittensor==8.4.3
 huggingface-hub==0.25.2
 matplotlib==3.9.2
 pydantic==1.10
@@ -12,4 +12,4 @@ wandb==0.18.3
 datasets==3.0.1
 flash-attn==2.6.3
 smart-open[s3]==7.0.5
-taoverse==1.0.9
+taoverse==1.3.0

From a6c4865960e7420d51ac5c6a617fbc730edf4db3 Mon Sep 17 00:00:00 2001
From: Sid
Date: Tue, 3 Dec 2024 19:36:50 -0800
Subject: [PATCH 02/12] Align numpy to bittensor requirements.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index dc670c1..6c6db92 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ pydantic==1.10
 python-dotenv==1.0.1
 rich==13.9.2
 safetensors==0.4.5
-numpy==2.1.2
+numpy==2.0.1
 transformers==4.44.1
 wandb==0.18.3
 datasets==3.0.1

From bb94a4459dd59ce0826fdadc3c0bc73873804379 Mon Sep 17 00:00:00 2001
From: Sid
Date: Tue, 3 Dec 2024 19:38:54 -0800
Subject: [PATCH 03/12] Remove legacy pydantic requirement.

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6c6db92..6b27e3b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,6 @@ torch==2.4.1
 bittensor==8.4.3
 huggingface-hub==0.25.2
 matplotlib==3.9.2
-pydantic==1.10
 python-dotenv==1.0.1
 rich==13.9.2
 safetensors==0.4.5

From 205eba60f8b2d6ab6d15237176db28c1fbfb2ad7 Mon Sep 17 00:00:00 2001
From: Sid
Date: Tue, 3 Dec 2024 20:09:58 -0800
Subject: [PATCH 04/12] Bump taoverse to 1.3.1.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6b27e3b..febd768 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,4 @@ wandb==0.18.3
 datasets==3.0.1
 flash-attn==2.6.3
 smart-open[s3]==7.0.5
-taoverse==1.3.0
+taoverse==1.3.1

From 17f0985a81e513ca649194cc7cb99599d5588e12 Mon Sep 17 00:00:00 2001
From: Sid
Date: Tue, 3 Dec 2024 20:51:04 -0800
Subject: [PATCH 05/12] Cleanup overly verbose logging.

---
 neurons/validator.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/neurons/validator.py b/neurons/validator.py
index b533e40..f831f03 100644
--- a/neurons/validator.py
+++ b/neurons/validator.py
@@ -27,6 +27,7 @@
 import datetime as dt
 import functools
 import json
+import logging
 import math
 import os
 import pickle
@@ -37,6 +38,8 @@
 from collections import defaultdict
 
 import bittensor as bt
+from bittensor.utils.btlogging.helpers import all_loggers
+from bittensor.utils.btlogging.defines import BITTENSOR_LOGGER_NAME
 import torch
 import wandb
@@ -131,6 +134,11 @@ def __init__(self):
         bt.logging.set_info()
         bt.logging(config=self.config)
 
+        # Setting logging level on bittensor messes with all loggers, which we don't want, so set explicitly to warning here.
+        for logger in all_loggers():
+            if not logger.name.startswith(BITTENSOR_LOGGER_NAME):
+                logger.setLevel(logging.WARNING)
+
         bt.logging.info(f"Starting validator with config: {self.config}")

From d6cc9b4f012750f4150c7c1684e01e0c6fe30fb9 Mon Sep 17 00:00:00 2001
From: Sid
Date: Tue, 3 Dec 2024 21:18:43 -0800
Subject: [PATCH 06/12] Delay update loop.

---
 neurons/validator.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/neurons/validator.py b/neurons/validator.py
index f831f03..1368de9 100644
--- a/neurons/validator.py
+++ b/neurons/validator.py
@@ -391,6 +391,9 @@ def update_models(self):
         # Track how recently we checked the list of top models.
         last_checked_top_models_time = None
 
+        # Delay the first update loop until the metagraph has been synced.
+        time.sleep(60)
+
         # The below loop iterates across all miner uids and checks to see
         # if they should be updated.
         while not self.stop_event.is_set():

From 1c7cc5bbcd6fa611bc2500da6596f3a458ff841d Mon Sep 17 00:00:00 2001
From: Sid
Date: Tue, 3 Dec 2024 22:06:03 -0800
Subject: [PATCH 07/12] Retry getting the block.

---
 neurons/validator.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/neurons/validator.py b/neurons/validator.py
index 1368de9..5adc456 100644
--- a/neurons/validator.py
+++ b/neurons/validator.py
@@ -745,8 +745,12 @@ async def _try_set_weights():
     def _get_current_block(self) -> int:
         """Returns the current block."""
-        try:
+        @retry(tries=5, delay=1, backoff=2)
+        def _get_block_with_retry():
             return self.subtensor.block
+
+        try:
+            return _get_block_with_retry()
         except:
             bt.logging.debug(
                 "Failed to get the latest block from the chain. Using the block from the cached metagraph."
             )

From e2c58438bca8752deae76bfcf975969957873b3e Mon Sep 17 00:00:00 2001
From: Sid
Date: Tue, 3 Dec 2024 22:07:14 -0800
Subject: [PATCH 08/12] Import retry annotation.

---
 neurons/validator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/neurons/validator.py b/neurons/validator.py
index 5adc456..13c53f8 100644
--- a/neurons/validator.py
+++ b/neurons/validator.py
@@ -36,6 +36,7 @@
 import traceback
 import typing
 from collections import defaultdict
+from retry import retry
 
 import bittensor as bt
 from bittensor.utils.btlogging.helpers import all_loggers

From 849f6e58e142ecd9278b4e9eeab49c006e0db313 Mon Sep 17 00:00:00 2001
From: Sid
Date: Fri, 6 Dec 2024 08:42:05 -0800
Subject: [PATCH 09/12] Bump release version as well.

---
 constants/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/constants/__init__.py b/constants/__init__.py
index 43c9573..131db00 100644
--- a/constants/__init__.py
+++ b/constants/__init__.py
@@ -34,7 +34,7 @@
 # ---------------------------------
 
 # Release
-__version__ = "4.6.2"
+__version__ = "4.6.3"
 
 # Validator schema version
 __validator_version__ = "4.6.0"
@@ -173,7 +173,6 @@
         epsilon_func=LinearDecay(0.005, 0.0002, 36000),
         max_bytes=29 * 1024 * 1024 * 1024,
     ),
-
 }
 
 # Schedule of competitions by block.
@@ -217,7 +216,7 @@
                     0.4,
                 ),
             ],
-
         ),
 ]

From 2b25449e4e8ea8bc4414b797a958868a63b76579 Mon Sep 17 00:00:00 2001
From: cryptal-mc
Date: Wed, 11 Dec 2024 23:49:48 +0000
Subject: [PATCH 10/12] Changed s3 url for the stack v2 data loading

---
 pretrain/dataset.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/pretrain/dataset.py b/pretrain/dataset.py
index de19704..fbc50bf 100644
--- a/pretrain/dataset.py
+++ b/pretrain/dataset.py
@@ -12,17 +12,17 @@
 from pprint import pprint
 import os
-from dotenv import load_dotenv 
+from dotenv import load_dotenv
 load_dotenv()
 
 
 class SubsetLoader(IterableDataset):
     """Base class for data-specific subset loader classes."""
-    
+
     name: str = None  # Dataset name
     rows_base_url: str = "https://datasets-server.huggingface.co/rows"
     size_base_url: str = "https://datasets-server.huggingface.co/size"
     max_pages: int = None
-    
+
     def __init__(
         self,
         batch_size=None,
@@ -78,9 +78,9 @@ def __init__(
                 self._initialize_pages()
                 fetch_attempt += 1
 
-                # Exit if the buffer has at least one batch 
+                # Exit if the buffer has at least one batch
                 if len(self.buffer) >= self.sequence_length:
-                    break 
+                    break
 
             bt.logging.warning(
                 f"All fetched pages seem to be empty or have an extremely low token count. "
@@ -139,14 +139,14 @@ def _fetch_data_for_page(self, page):
             })
         else:
             self.params["offset"] = page
-        
+
         self.params["length"] = self.num_rows_per_page
-        
+
         attempt = 0
         while attempt < self.retry_limit:
             try:
                 response = requests.get(
-                    self.rows_base_url, 
+                    self.rows_base_url,
                     params=self.params,
                     headers=self._get_request_headers()
                 )
@@ -183,9 +183,9 @@
     def get_page_names(self):
         """Get page names in consistent format"""
         if not hasattr(self, 'pages'):
             return []
-        
+
         if isinstance(self.pages[0], tuple):
-            return [f"{cfg_name}_{num_rows}_{split}" 
+            return [f"{cfg_name}_{num_rows}_{split}"
                     for cfg_name, num_rows, split in self.pages]
         return self.pages
@@ -257,15 +257,15 @@ def __init__(self, **kwargs):
                                 aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"])
 
         self.s3_sess = session.client("s3")
-        
+
         super().__init__(requires_auth=True, **kwargs)
-        
+
     def _download_row_content(self, blob_id, src_encoding):
         """Download the row content from S3.
        """
-        
-        s3_url = f"s3://softwareheritage/content/{blob_id}"
+
+        s3_url = f"https://softwareheritage.s3.amazonaws.com/content/{blob_id}"
 
         with smart_open.open(s3_url, "rb", compression=".gz", transport_params={"client": self.s3_sess}) as fin:
             content = fin.read().decode(src_encoding)
@@ -277,7 +277,7 @@ def _get_content_from_row(self, row):
         content = self._download_row_content(row['row']['blob_id'], row['row']['src_encoding'])
         return content
-    
+
 
 class SubsetFalconLoader(SubsetLoader):
     max_pages: int = 968000015
@@ -286,14 +286,14 @@
 class SubsetFineWebEdu2Loader(SubsetLoader):
     name: str = "HuggingFaceFW/fineweb-edu-score-2"
-    
+
     def fetch_dataset_configs(self) -> typing.Dict[str, typing.Dict]:
         """
         Fetch dataset configs and their metadata.
         Returns a dictionary with config names as keys and metadata as values.
         """
         params = dict(dataset=self.name)
-        
+
         attempt = 0
         while attempt < self.retry_limit:
             try:
@@ -385,7 +385,7 @@ def get_random_pages(self, num_pages, initial_offset):
             split = self.configs_data[config_name]["split"]
             pages.append((config_name, selected_page_start, split))
         return pages
-    
+
     def fetch_data_to_rows(self, num_pages):
         """Fetch data and return raw text rows instead of adding to buffer."""
         downloaded_pages = set()

From efde34b45dff54d9c229b677ddb7108b69d0a735 Mon Sep 17 00:00:00 2001
From: cryptal-mc
Date: Thu, 12 Dec 2024 00:46:32 +0000
Subject: [PATCH 11/12] Aligned model update limit to number of active competitions

---
 constants/__init__.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/constants/__init__.py b/constants/__init__.py
index 131db00..072d10b 100644
--- a/constants/__init__.py
+++ b/constants/__init__.py
@@ -173,6 +173,20 @@
         epsilon_func=LinearDecay(0.005, 0.0002, 36000),
         max_bytes=29 * 1024 * 1024 * 1024,
     ),
+    CompetitionId.B14_MODEL_MULTI_DATASET: ModelConstraints(
+        max_model_parameter_size=13_900_000_000,
+        min_model_parameter_size=13_700_000_000,
+        sequence_length=4096,
+        allowed_architectures=ALLOWED_MODEL_TYPES_2,
+        tokenizer="Xenova/gpt-4",
+        kwargs={
+            "torch_dtype": torch.bfloat16,
+            "attn_implementation": "flash_attention_2",
+        },
+        eval_block_delay=EVAL_BLOCK_DELAY,
+        epsilon_func=LinearDecay(0.005, 0.0002, 36000),
+        max_bytes=29 * 1024 * 1024 * 1024,
+    ),
 }
 
 # Schedule of competitions by block.
@@ -256,7 +270,7 @@
 sample_min = 5
 # Max number of uids that can be either pending eval or currently being evaluated.
 # We allow the sample_min per competition + 10 additional models to be held at any one time.
-updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID) + 10
+updated_models_limit = sample_min * len(MODEL_CONSTRAINTS_BY_COMPETITION_ID_2) + 10
 # time required between updates to the chain.
 chain_update_cadence = dt.timedelta(minutes=20)
 # Number of blocks required between retrying evaluation of a model.

From 571e4970e350b5de4cc8b6681a9abc81e98b5fe6 Mon Sep 17 00:00:00 2001
From: cryptal-mc
Date: Thu, 12 Dec 2024 01:24:46 +0000
Subject: [PATCH 12/12] Added comment to explain unused model constraints entry

---
 constants/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/constants/__init__.py b/constants/__init__.py
index 072d10b..1b386a8 100644
--- a/constants/__init__.py
+++ b/constants/__init__.py
@@ -173,6 +173,13 @@
         epsilon_func=LinearDecay(0.005, 0.0002, 36000),
         max_bytes=29 * 1024 * 1024 * 1024,
     ),
+    # This constraint is not actually used, it is added as a copy
+    # of the 14B-model competition constraint entry.
+    # This is just to keep the size of the constraint dict equal
+    # to the number of competitions so `update_models_limit` is
+    # set correctly below.
+    # This hack will be removed once native support for multi datasets
+    # is implemented in a future release.
     CompetitionId.B14_MODEL_MULTI_DATASET: ModelConstraints(
         max_model_parameter_size=13_900_000_000,
         min_model_parameter_size=13_700_000_000,