diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..634fb1e --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,106 @@ +name: "CodeQL Advanced" + +on: + push: + branches: [ "dev", "main" ] + pull_request: + branches: [ "dev", "main" ] + schedule: + - cron: '22 6 * * 5' + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + # CodeQL supports the following values for 'language': + # 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # If analyzing a compiled language, you can modify the 'build-mode' (e.g. `full` or `manual`). + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # NOTE: The following steps are optional if you have a Python project + # with dependencies. This sets up Python and caches dependencies to speed up future analyses. 
+ - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Cache Python dependencies + uses: actions/cache@v3 + with: + # NOTE: Adjust the path and key if your dependencies are stored elsewhere. + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + # NOTE: This step checks if a requirements.txt file exists before installing. + if: ${{ hashFiles('**/requirements.txt') != '' }} + run: pip install --upgrade pip && pip install -r requirements.txt + + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # + # For more details on CodeQL's query packs, refer to: + # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # + # NOTE: Enabling security-extended and security-and-quality queries can provide more comprehensive analysis. + queries: security-extended,security-and-quality + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + - if: matrix.build-mode == 'manual' + shell: bash + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/.gitignore b/.gitignore index de09ad9..26e7eba 100644 --- a/.gitignore +++ b/.gitignore @@ -247,3 +247,6 @@ database.db neurons/Miner/app test_speed_file.dat + +# cert +cert/ \ No newline at end of file diff --git a/README.md b/README.md index bc7f439..10d4231 100644 --- a/README.md +++ b/README.md @@ -42,40 +42,51 @@ python3 -m pip install -e . ## Extra dependencies - Miners -### Hashcat +### Cuda + +To ensure **optimal performance and compatibility**, it is **strongly recommended** to install the **latest available CUDA version** from NVIDIA. ```bash -# Minimal hashcat version >= v6.2.6 -wget https://hashcat.net/files/hashcat-6.2.6.tar.gz -tar xzvf hashcat-6.2.6.tar.gz -cd hashcat-6.2.6/ -make -make install # prefixed by sudo if not in the sudoers -hashcat --version -``` +# Visit NVIDIA's official CUDA download page to get the latest version: +# https://developer.nvidia.com/cuda-downloads -### Cuda +# Select your operating system, architecture, distribution, and version to get the appropriate installer. 
-```bash -# Recommended cuda version: 12.3 -wget https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda-repo-ubuntu2204-12-3-local_12.3.1-545.23.08-1_amd64.deb -dpkg -i cuda-repo-ubuntu2204-12-3-local_12.3.1-545.23.08-1_amd64.deb -cp /var/cuda-repo-ubuntu2204-12-3-local/cuda-*-keyring.gpg /usr/share/keyrings/ -apt-get update -apt-get -y install cuda-toolkit-12-3 -apt-get -y install -y cuda-drivers +# Example for Ubuntu 22.04 (replace with the latest version as needed): + +# Download the CUDA repository package (update the URL to the latest version) +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-repo-ubuntu2204-latest_amd64.deb + +# Install the CUDA repository package +sudo dpkg -i cuda-repo-ubuntu2204-latest_amd64.deb + +# Import the GPG key +sudo cp /var/cuda-repo-ubuntu2204-latest/cuda-*-keyring.gpg /usr/share/keyrings/ + +# Update the package lists +sudo apt-get update + +# Install CUDA Toolkit and drivers +sudo apt-get -y install cuda-toolkit +sudo apt-get -y install cuda-drivers -# Valid for x64 architecture. Consult nvidia documentation for any other architecture. 
-export CUDA_VERSION=cuda-12.3 -export PATH=$PATH:/usr/local/$CUDA_VERSION/bin -export LD_LIBRARY_PATH=/usr/local/$CUDA_VERSION/lib64 +# Set environment variables +export CUDA_PATH=/usr/local/cuda +export PATH=$PATH:$CUDA_PATH/bin +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_PATH/lib64 -echo "">>~/.bashrc -echo "PATH=$PATH">>~/.bashrc -echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH">>~/.bashrc +# Persist environment variables by adding them to ~/.bashrc +echo "export CUDA_PATH=/usr/local/cuda" >> ~/.bashrc +echo "export PATH=\$PATH:\$CUDA_PATH/bin" >> ~/.bashrc +echo "export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:\$CUDA_PATH/lib64" >> ~/.bashrc -reboot # Changes might need a restart depending on the system +# Apply the changes +source ~/.bashrc +# Reboot the system to finalize the installation +sudo reboot + +# Verify the installation nvidia-smi nvcc --version @@ -137,24 +148,11 @@ Once you have done so, you can run the miner and validator with the following co ## Running Miner -A dedicated medium article is available [here](https://medium.com/@neuralinternet/how-to-run-a-compute-miner-82498b93e7e1) - -Miners contribute processing resources, notably GPU (Graphics Processing Unit) and CPU (Central Processing Unit) -instances, to facilitate optimal performance in essential GPU and CPU-based computing tasks. The system operates on a -performance-based reward mechanism, where miners are incentivized through a tiered reward structure correlated to the -processing capability of their hardware. High-performance devices are eligible for increased compensation, reflecting -their greater contribution to the network's computational throughput. Emphasizing the integration of GPU instances is -critical due to their superior computational power, particularly in tasks demanding parallel processing capabilities. 
-Consequently, miners utilizing GPU instances are positioned to receive substantially higher rewards compared to their -CPU counterparts, in alignment with the greater processing power and efficiency GPUs bring to the network. - -The primary contribution of miners lies in providing their resources to the validator. The management of these resources' -reservations is entirely handled on the validator's side. A validator has the capability to allocate and deallocate a miner's -resource based on availability and demand. Currently, the maximum duration of allocation for each reservation is limited to 60 days. -This mechanism guarantees a dynamic and efficient distribution of computational power, accommodating the fluctuating demands across the network. +Miners contribute processing resources, specifically GPU (Graphics Processing Unit) to enable optimal performance in essential GPU-based computing tasks. The system uses a hardware specification-based reward mechanism that incentivizes miners through a tiered structure, with rewards directly correlated to the processing capabilities of their hardware. High-performance devices receive greater compensation, reflecting their significant contributions to the network's overall computational throughput. Detailed scoring metrics and supported GPUs can be found in the config.yaml file under the gpu_scores section. A comprehensive explanation of scoring is provided below in the section titled "Understanding the Score Calculation Process". -Important: It's crucial to ensure that port 4444 is open on the host machine to grant validators access to the allocated resource on the miner. +The primary role of miners is to provide their resources to validators. The allocation and management of these resources are entirely handled on the validator's side. Validators dynamically allocate and deallocate a miner's resources based on availability and network demand. 
This ensures an efficient and flexible distribution of computational power, meeting the fluctuating needs of the network. +It is important to ensure that port 4444 is open on the host machine or that an alternative open port is specified. This allows validators to access the miner's allocated resources and retrieve GPU specifications seamlessly. Changing the miner's hardware while it is allocated is possible, but it will result in the validator deallocating your miner ```bash # To run the miner @@ -164,94 +162,91 @@ python -m miner.py --subtensor.network # blockchain endpoint you want to connect --wallet.name # name of your wallet --wallet.hotkey # hotkey name of your wallet + --ssh.port # The port you want to provide for allocations, default: 4444 --logging.debug # Run in debug mode, alternatively --logging.trace for trace mode ``` ## Running Validator -Validators hold the critical responsibility of rigorously assessing and verifying the computational capabilities of -miners. This multifaceted evaluation process commences with validators requesting miners to provide comprehensive -performance data, which includes not only processing speeds and efficiencies but also critical metrics like Random -Access Memory (RAM) capacity and disk space availability. +Validators play a crucial role in meticulously evaluating and verifying the computational capabilities of miners. This thorough assessment begins with validators requesting detailed performance data from miners, encompassing hardware specifications, efficiencies, and critical metrics such as Random Access Memory (RAM) capacity and disk space availability. -The inclusion of RAM and disk space measurements is vital, as these components significantly impact the overall -performance and reliability of the miners' hardware. RAM capacity influences the ability to handle large or multiple -tasks simultaneously, while adequate disk space ensures sufficient storage. 
+The inclusion of RAM and disk space metrics is essential, as these components significantly influence the overall performance and reliability of a miner's hardware. RAM capacity determines the ability to manage large or multiple tasks simultaneously, while sufficient disk space ensures adequate storage for sustained operations. -Following the receipt of this detailed hardware and performance information, validators proceed to test the miners' -computational integrity. This is achieved by presenting them with complex hashing challenges, designed to evaluate the -processing power and reliability of the miners' systems. Validators adjust the difficulty of these problems based on the -comprehensive performance profile of each miner, including their RAM and disk space metrics. +Once this comprehensive hardware and performance information is received, validators test the computational integrity of miners using torch-based computational tasks, such as matrix multiplications. These tests are designed to accurately determine the hardware specifications and performance capabilities of the miners' systems. -In addition to measuring the time taken by miners to resolve these problems, validators meticulously verify the accuracy -of the responses. This thorough examination of both speed and precision, complemented by the assessment of RAM and disk -space utilization, forms the crux of the evaluation process. +Based on the results of this hardware identification process, validators update the miners' scores. These scores determine the miners' weight within the network, directly affecting their potential rewards and standing in the system. -Based on this extensive analysis, validators update the miners' scores, reflecting a holistic view of their -computational capacity, efficiency, and hardware quality. This score then determines the miner's weight within the -network, directly influencing their potential rewards and standing. 
-This scoring process, implemented through a Python script, considers various factors including CPU, GPU, hard disk, and -RAM performance. The script's structure and logic are outlined below: +```bash +# To run the validator +cd neurons +python -m validator.py + --netuid # The subnet id you want to connect to + --subtensor.network # blockchain endpoint you want to connect + --wallet.name # name of your wallet + --wallet.hotkey # hotkey name of your wallet + --logging.debug # Run in debug mode, alternatively --logging.trace for trace mode +``` ## Understanding the Score Calculation Process -**The scoring system has been updated, if you want to check the old hardware mechanism:** [Hardware scoring](docs/hardware_scoring.md) - -The score calculation function determines a miner's performance based on various factors: - -**Successful Problem Resolution**: The success rate of solving challenges in the last 24 hours. Score range: (0,100). +**The scoring system has been updated!** -**Problem Difficulty**: This measures the complexity of the solved tasks. The code restricts this difficulty to a minimum and maximum allowed value. Score range: (0,100). +The score calculation function now determines a miner's performance primarily based on their GPU hardware and resource allocation. Only the GPUs listed below are supported and scored correctly. -**Elapsed Time**: The time taken to solve the problem impacts the score. A shorter time results in a higher score. Score range: (0,100). 
+**GPU Base Scores**: The following GPUs are assigned specific base scores, reflecting their relative performance: +- NVIDIA H200: 4.00 +- NVIDIA H100 80GB HBM3: 3.30 +- NVIDIA H100: 2.80 +- NVIDIA A100-SXM4-80GB: 1.90 +- NVIDIA A100 80GB PCIe: 1.65 +- NVIDIA L40s: 1.10 +- NVIDIA L40: 1.00 +- NVIDIA RTX 6000 Ada Generation: 0.90 +- NVIDIA RTX A6000: 0.78 +- NVIDIA RTX 4090: 0.68 +- NVIDIA GeForce RTX 3090: 0.43 +- NVIDIA L4: 0.43 +- NVIDIA A40: 0.39 +- NVIDIA RTX A5000: 0.36 +- NVIDIA RTX A4500: 0.34 -**Failure Penalty**: The failure rate of solving the last 20 challenges. Score range: (0,100). +**Scaling Factor**: Determine the highest GPU base score, multiply it by 8 (the maximum number of GPUs), and set this scenario as the 100-point baseline. A scaling factor is derived so that using eight of the top GPU models equals 100 points. -**Allocation Score**: Miners that have allocated machine resources receive the maximum challenge score and an additional allocation score, which is proportional to their average challenge difficulty. Score range: (0,100). +**GPU Score**: Multiply the chosen GPU’s base score by the number of GPUs (up to 8) and by the scaling factor to find the miner’s GPU score (0–100). -**Scoring Weights**: Each score component is weighted with the corresponding weight before being added to the total score. - -- Successful Problem Resolution Weight = 1.0 -- Problem Difficulty Weight = 1.0 -- Elapsed Time Weight = 0.5 -- Failure Penalty Weight = 0.5 -- Allocation Weight = 0.4 +**Allocation Bonus**: If a miner has allocated machine resources, add 100 points to the GPU score, allowing a maximum score of up to 200. 
**Total Score**: -- Score (not allocated) = (Successful Problem Resolution * Resolution Weight) + (Problem Difficulty * Difficulty Weight) + (Elapsed Time * Time Weight) - (Failure Penalty * Penalty Weight) -- Score (allocated) = Maximum Challenge Score + (Allocation Score * Allocation Weight) +- Score (not allocated) = GPU Score (0–100) +- Score (allocated) = GPU Score + 100 (up to 200) -### Example 1: Miner A's Weighted Total Score +### Example 1: Miner A's Total Score -- **Successful Problem Resolution**: 95% -- **Problem Difficulty**: 7 -- **Elapsed Time**: 4.6 seconds -- **Failure Penalty**: 2.6% +- **GPU**: NVIDIA H200 (Base Score: 4.00) +- **Number of GPUs**: 8 - **Allocation**: True -Total Score = Score (allocated) = 264.6 +Step-by-step calculation: +1. Highest scenario: 4.00 * 8 = 32 +2. Scaling factor: 100 / 32 = 3.125 +3. GPU Score: 4.00 * 8 * 3.125 = 100 +4. Allocation Bonus: 100 + 100 = 200 + +Total Score = 200 -### Example 2: Miner B's Weighted Total Score +### Example 2: Miner B's Total Score -- **Successful Problem Resolution**: 92% -- **Problem Difficulty**: 9 -- **Elapsed Time**: 16 seconds -- **Failure Penalty**: 3.1% +- **GPU**: NVIDIA RTX 4090 (Base Score: 0.68) +- **Number of GPUs**: 2 - **Allocation**: False -Total Score = Score (not allocated) = 193.7 +Step-by-step calculation: +1. Scaling factor (same as above): 3.125 +2. GPU Score: 0.68 * 2 * 3.125 = 4.25 +3. No allocation bonus applied. -```bash -# To run the validator -cd neurons -python -m validator.py - --netuid # The subnet id you want to connect to - --subtensor.network # blockchain endpoint you want to connect - --wallet.name # name of your wallet - --wallet.hotkey # hotkey name of your wallet - --logging.debug # Run in debug mode, alternatively --logging.trace for trace mode -``` +Total Score = 4.25 ## Resource Allocation Mechanism @@ -301,7 +296,6 @@ Flags that you can use with the validator script.
- `--validator.whitelist.unrecognized`: (Optional) Whitelist the unrecognized miners. Default: False. - `--validator.perform.hardware.query `: (Optional) Perform the specs query - useful to register to a miner's machine. Default: True. -- `--validator.challenge.batch.size `: (Optional) Batch size that perform the challenge queries - For lower hardware specifications you might want to use a different batch_size than default. Keep in mind the lower is the batch_size the longer it will take to perform all challenge queries. Default: 256. - `--validator.specs.batch.size `: (Optional) Batch size that perform the specs queries - For lower hardware specifications you might want to use a different batch_size than default. Keep in mind the lower is the batch_size the longer it will take to perform all challenge queries. Default: 64. - `--validator.force.update.prometheus`: (Optional) Force the try-update of prometheus version. Default: False. - `--validator.whitelist.updated.threshold`: (Optional) Total quorum before starting the whitelist. Default: 60. (%) @@ -310,14 +304,14 @@ Flags that you can use with the validator script. --- -- `--miner.hashcat.path `: (Optional) The path of the hashcat binary. Default: hashcat. -- `--miner.hashcat.workload.profile `: (Optional) Performance to apply with hashcat profile: 1 Low, 2 Economic, 3 High, 4 Insane. Run `hashcat -h` for more information. Default: 3. -- `--miner.hashcat.extended.options `: (Optional) Any extra options you found usefull to append to the hascat runner (I'd perhaps recommend -O). Run `hashcat -h` for more information. Default: ''. - `--miner.whitelist.not.enough.stake`: (Optional) Whitelist the validators without enough stake. Default: False. - `--miner.whitelist.not.updated`: (Optional) Whitelist validators not using the last version of the code. Default: False. - `--miner.whitelist.updated.threshold`: (Optional) Total quorum before starting the whitelist. Default: 60. 
(%) ## Benchmarking the machine + +**Note**: Starting from v1.6.0, hashcat benchmarking is no longer performed. The information below is provided purely as legacy reference and will be updated in future releases. + ### Benchmarking hashcat's performance directly: ```bash hashcat -b -m 610 @@ -374,8 +368,9 @@ Enter any additional options for hashcat to use. It's recommended to use the ``` ## Troubleshooting -> "I don't receive any request, 'Challenge' or 'Specs', what could be the reason ?" +> "I don't receive any request, 'Challenge' or 'Specs' or 'Allocation', what could be the reason ?" +Starting from v1.6.0, hashcat challenge benchmarking is no longer performed. Most probably you are running into a **network issue**. - check your ports - check your firewall diff --git a/compute/__init__.py b/compute/__init__.py index 0c211e1..f883f7d 100644 --- a/compute/__init__.py +++ b/compute/__init__.py @@ -18,9 +18,9 @@ import string # Define the version of the template module. -__version__ = "1.5.2" -__minimal_miner_version__ = "1.5.1" -__minimal_validator_version__ = "1.5.2" +__version__ = "1.6.0" +__minimal_miner_version__ = "1.6.0" +__minimal_validator_version__ = "1.6.0" version_split = __version__.split(".") __version_as_int__ = (100 * int(version_split[0])) + (10 * int(version_split[1])) + (1 * int(version_split[2])) @@ -33,6 +33,11 @@ # Validators static vars # Time before the specs requests will time out. time unit = seconds specs_timeout = 60 + +# Proof of GPU settings +pog_retry_limit = 30 +pog_retry_interval = 80 # seconds + # Time before the proof of work requests will time out. time unit = seconds pow_timeout = 30 # Initial and minimal proof of work difficulty. Needs benchmark and adjustment. 
@@ -73,14 +78,14 @@ TRUSTED_VALIDATORS_HOTKEYS = [ "5F4tQyWrhfGVcNhoqeiNsR6KjD4wMZ2kfhLj4oHYuyHbZAc3", # Opentensor Foundation - "5Hddm3iBFD2GLT5ik7LZnT3XJUnRnN8PoeCFgGQgawUVKNm8", # τaosτaτs & Corcel + "5GKH9FPPnWSUoeeTJp19wVtd84XqFW4pyK2ijV2GsFbhTrP1", # τaosτaτs & Corcel "5HEo565WAy4Dbq3Sv271SAi7syBSofyfhhwRNjFNSM2gP9M2", # Foundry "5HK5tp6t2S59DywmHRWPBVJeJ86T61KjurYqeooqj8sREpeN", # Bittensor Guru Podcast "5EhvL1FVkQPpMjZX4MAADcW42i3xPSF1KiCpuaxTYVr28sux", # TAO-Validator.com "5FFApaS75bv5pJHfAp2FVLBj9ZaXuFDjEypsaBNc1wCfe52v", # RoundTable21 "5DvTpiniW9s3APmHRYn8FroUWyfnLtrsid5Mtn5EwMXHN2ed", # FirstTensor - "5HbLYXUBy1snPR8nfioQ7GoA9x76EELzEq9j7F32vWUQHm1x", # Tensorplex - "5CXRfP2ekFhe62r7q3vppRajJmGhTi7vwvb2yr79jveZ282w", # Rizzo - "5HNQURvmjjYhTSksi8Wfsw676b4owGwfLR2BFAQzG7H3HhYf", # Neural Inτerneτ - "5DnXm2tBGAD57ySJv5SfpTfLcsQbSKKp6xZKFWABw3cYUgqg", # Love + "5E4z3h9yVhmQyCFWNbY9BPpwhx4xFiPwq3eeqmBgVF6KULde", # Tensorplex + "5F2CsUDVbRbVMXTh9fAzF9GacjVX7UapvRxidrxe7z8BYckQ", # Rizzo + "5GmvyePN9aYErXBBhBnxZKGoGk4LKZApE4NkaSzW62CYCYNA", # Neural Inτerneτ + "5F27Eqz2PhyMtGMEce898x31DokNqRVxkm5AhDDe6rDGNvoY", # Love ] diff --git a/compute/utils/db.py b/compute/utils/db.py index 52dccab..6e26d75 100644 --- a/compute/utils/db.py +++ b/compute/utils/db.py @@ -41,6 +41,18 @@ def init(self): cursor.execute("CREATE INDEX IF NOT EXISTS idx_uid ON challenge_details (uid)") cursor.execute("CREATE INDEX IF NOT EXISTS idx_ss58_address ON challenge_details (ss58_address)") cursor.execute("CREATE TABLE IF NOT EXISTS wandb_runs (hotkey TEXT PRIMARY KEY, run_id TEXT NOT NULL)") + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS pog_stats ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + hotkey TEXT, + gpu_name TEXT, + num_gpus INTEGER, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (hotkey) REFERENCES miner_details (hotkey) ON DELETE CASCADE + ) + """ + ) self.conn.commit() except Exception as e: diff --git a/compute/wandb/wandb.py b/compute/wandb/wandb.py 
index bda0ee0..f9a0f1e 100644 --- a/compute/wandb/wandb.py +++ b/compute/wandb/wandb.py @@ -196,10 +196,28 @@ def update_allocated_hotkeys(self, hotkey_list): This function updates the allocated hotkeys on validator side. It's useless to alter this information as it needs to be signed by a valid validator hotkey. """ + self.api.flush() + + # Step 1: Read penalized hotkeys from the file (penalized_hotkeys.json in the root directory) + penalized_hotkeys = [] + try: + with open("penalized_hotkeys.json", 'r') as file: + penalized_hotkeys_data = json.load(file) + penalized_hotkeys = [entry["hotkey"] for entry in penalized_hotkeys_data] # Extract hotkeys + except FileNotFoundError: + bt.logging.trace("Penalized hotkeys file not found.") + except json.JSONDecodeError: + bt.logging.trace("Error decoding JSON from penalized hotkeys file.") + # Update the configuration with the new keys + # update_dict = { + # "allocated_hotkeys": hotkey_list + # } + update_dict = { - "allocated_hotkeys": hotkey_list - } + "allocated_hotkeys": hotkey_list, # Update allocated hotkeys + "penalized_hotkeys_checklist": penalized_hotkeys # Add penalized hotkeys to the config + } self.run.config.update(update_dict, allow_val_change=True) # Track allocated hotkeys over time @@ -213,6 +231,8 @@ def update_penalized_hotkeys_checklist(self, hotkey_list): This function updates the penalized hotkeys checklist on validator side. It's useless to alter this information as it needs to be signed by a valid validator hotkey. """ + self.api.flush() + # Update the configuration with the new keys update_dict = { "penalized_hotkeys_checklist": hotkey_list @@ -230,6 +250,8 @@ def update_penalized_hotkeys(self, hotkey_list): This function updates the allocated hotkeys on validator side. It's useless to alter this information as it needs to be signed by a valid validator hotkey. 
""" + self.api.flush() + # Update the configuration with the new keys update_dict = { "penalized_hotkeys": hotkey_list @@ -349,7 +371,7 @@ def get_penalized_hotkeys(self, valid_validator_hotkeys, flag): return penalized_keys_list - def get_penalized_hotkeys_checklist(self, valid_validator_hotkeys, flag): + def get_penalized_hotkeys_checklist_bak(self, valid_validator_hotkeys, flag): """ This function gets all penalized hotkeys checklist from all validators. Only relevant for validators. @@ -470,14 +492,16 @@ def verify_run(self, run): return False - def sync_allocated(self, hotkey): - """ - This function syncs the allocated status of the miner with the wandb run. - """ - # Fetch allocated hotkeys - allocated_hotkeys = self.get_allocated_hotkeys([], False) - - if hotkey in allocated_hotkeys: - return True + def get_penalized_hotkeys_checklist(self, valid_validator_hotkeys, flag): + """ This function gets penalized hotkeys checklist from your validator run """ + if self.run: + try: + run_config = self.run.config + penalized_hotkeys_checklist = run_config.get('penalized_hotkeys_checklist') + return penalized_hotkeys_checklist + except Exception as e: + bt.logging.info(f"Run ID: {self.run.id}, Name: {self.run.name}, Error: {e}") + return [] else: - return False + bt.logging.info(f"No run info found") + return [] \ No newline at end of file diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..67774cc --- /dev/null +++ b/config.yaml @@ -0,0 +1,94 @@ +gpu_performance: + GPU_TFLOPS_FP16: + NVIDIA H200: 610 + NVIDIA H100 80GB HBM3: 570 + NVIDIA H100: 330 + NVIDIA A100-SXM4-80GB: 238.8 + NVIDIA A100 80GB PCIe: 197 + NVIDIA A100-SXM4-40GB: 257 + NVIDIA L40s: 171 + NVIDIA L40: 116 + NVIDIA RTX 6000 Ada Generation: 112 + NVIDIA RTX A6000: 48.5 + NVIDIA RTX A5000: 60.0 + NVIDIA RTX A4500: 44.0 + NVIDIA RTX 4000 Ada Generation: 70 + NVIDIA A40: 40.0 + NVIDIA RTX 4090: 157 + NVIDIA GeForce RTX 3090: 66.4 + NVIDIA L4: 51 + GPU_TFLOPS_FP32: + NVIDIA H200: 49.6 
+ NVIDIA H100 80GB HBM3: 49.0 + NVIDIA H100: 37.2 + NVIDIA A100-SXM4-80GB: 18.2 + NVIDIA A100 80GB PCIe: 16.9 + NVIDIA A100-SXM4-40GB: 18.2 + NVIDIA L40s: 35.5 + NVIDIA L40: 27.0 + NVIDIA RTX 6000 Ada Generation: 26.0 + NVIDIA RTX A6000: 21.28 + NVIDIA GeForce RTX 3090: 21.7 + NVIDIA RTX A5000: 15.8 + NVIDIA RTX A4500: 14.3 + NVIDIA RTX 4000 Ada Generation: 14.6 + NVIDIA A40: 22.8 + NVIDIA RTX 4090: 48.5 + NVIDIA L4: 9 + GPU_AVRAM: + NVIDIA H200: 68.72 + NVIDIA H100 80GB HBM3: 34.36 + NVIDIA H100: 34.36 + NVIDIA A100-SXM4-80GB: 34.36 + NVIDIA A100 80GB PCIe: 34.36 + NVIDIA A100-SXM4-40GB: 17.18 + NVIDIA L40s: 17.18 + NVIDIA L40: 17.18 + NVIDIA RTX 6000 Ada Generation: 17.18 + NVIDIA RTX A6000: 17.18 + NVIDIA RTX A5000: 8.59 + NVIDIA RTX A4500: 8.59 + NVIDIA RTX 4000 Ada Generation: 8.59 + NVIDIA A40: 17.18 + NVIDIA RTX 4090: 8.59 + NVIDIA GeForce RTX 3090: 8.59 + NVIDIA L4: 8.59 + + gpu_tolerance_pairs: + NVIDIA L40: NVIDIA RTX 6000 Ada Generation + NVIDIA RTX 6000 Ada Generation: NVIDIA L40 + NVIDIA RTX 4000 Ada Generation: NVIDIA RTX A5000 + NVIDIA RTX A5000: NVIDIA RTX 4000 Ada Generation + NVIDIA A100 80GB PCIe: NVIDIA A100-SXM4-80GB + NVIDIA A100-SXM4-80GB: NVIDIA A100 80GB PCIe + NVIDIA H100 80GB HBM3: NVIDIA H100 + NVIDIA H100: NVIDIA H100 80GB HBM3 + NVIDIA A40: NVIDIA RTX A6000 + NVIDIA RTX A6000: NVIDIA A40 + + gpu_scores: + NVIDIA H200: 4.0 + NVIDIA H100 80GB HBM3: 3.30 + NVIDIA H100: 2.80 + NVIDIA A100-SXM4-80GB: 1.90 + NVIDIA A100 80GB PCIe: 1.65 + NVIDIA L40s: 1.10 + NVIDIA RTX 6000 Ada Generation: 0.90 + NVIDIA L40: 1.0 + NVIDIA RTX A6000: 0.78 + NVIDIA RTX 4090: 0.68 + NVIDIA A40: 0.39 + NVIDIA GeForce RTX 3090: 0.43 + NVIDIA L4: 0.43 + NVIDIA RTX A5000: 0.36 + NVIDIA RTX A4500: 0.34 + +merkle_proof: + miner_script_path: "neurons/Validator/miner_script_m_merkletree.py" + time_tolerance: 5 + submatrix_size: 512 + hash_algorithm: 'sha256' + pog_retry_limit: 22 + pog_retry_interval: 60 # seconds + max_workers: 64 + max_random_delay: 900 # 900 seconds 
diff --git a/neurons/Miner/container.py b/neurons/Miner/container.py index 2d9ed9a..52ff3d8 100644 --- a/neurons/Miner/container.py +++ b/neurons/Miner/container.py @@ -22,6 +22,7 @@ import secrets import string import subprocess +import psutil import docker from io import BytesIO @@ -69,7 +70,7 @@ def kill_container(): client.images.prune(filters={"dangling": True}) bt.logging.info("Container was killed successfully") else: - bt.logging.info("Unable to find container") + bt.logging.info("No running container.") return True except Exception as e: bt.logging.info(f"Error killing container {e}") @@ -93,24 +94,46 @@ def run_container(cpu_usage, ram_usage, hard_disk_usage, gpu_usage, public_key, docker_ssh_port = docker_requirement.get("ssh_port") docker_appendix = docker_requirement.get("dockerfile") + bt.logging.info(f"Image: {docker_image}") + if docker_appendix is None or docker_appendix == "": docker_appendix = "echo 'Hello World!'" - # Step 1: Build the Docker image with an SSH server - dockerfile_content = ( - """ - FROM {} - RUN apt-get update && apt-get install -y openssh-server - RUN mkdir -p /run/sshd && echo 'root:'{}'' | chpasswd - RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ - sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \ - sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \ - sed -i 's/#ListenAddress 0.0.0.0/ListenAddress 0.0.0.0/' /etc/ssh/sshd_config - RUN {} - RUN mkdir -p /root/.ssh/ && echo '{}' > /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys - CMD ["/usr/sbin/sshd", "-D"] - """.format(docker_image, password, docker_appendix, docker_ssh_key) - ) + # Calculate 90% of free memory for shm_size + available_memory = psutil.virtual_memory().available + shm_size_gb = int(0.9 * available_memory / (1024**3)) # Convert to GB + bt.logging.trace(f"Allocating {shm_size_gb}GB to /dev/shm") + + 
dockerfile_content = f""" + FROM {docker_image} + + # Install OpenSSH Server + RUN apt-get update && apt-get install -y openssh-server + + # Create SSH directory and set root password + RUN mkdir -p /var/run/sshd && echo 'root:{password}' | chpasswd + + # Configure SSHD to allow root login and password authentication + RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \\ + sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \\ + sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \\ + sed -i 's/#ListenAddress 0.0.0.0/ListenAddress 0.0.0.0/' /etc/ssh/sshd_config + + # Run additional Docker appendix commands + RUN {docker_appendix} + + # Setup SSH authorized keys + RUN mkdir -p /root/.ssh/ && echo '{docker_ssh_key}' > /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys + + # Activate Conda environment on shell startup + RUN echo "source /opt/conda/etc/profile.d/conda.sh && conda activate base" >> /root/.bashrc + + # Ensure PATH includes Conda binaries + ENV PATH=/opt/conda/bin:$PATH + + # Start SSHD + CMD ["/usr/sbin/sshd", "-D"] + """ # Ensure the tmp directory exists within the current directory tmp_dir_path = os.path.join('.', 'tmp') @@ -139,6 +162,7 @@ def run_container(cpu_usage, ram_usage, hard_disk_usage, gpu_usage, public_key, environment=["NVIDIA_VISIBLE_DEVICES=all"], ports={22: docker_ssh_port}, init=True, + shm_size=f"{shm_size_gb}g", # Set the shared memory size to 2GB restart_policy={"Name": "on-failure", "MaximumRetryCount": 3}, # volumes={ docker_volume: {'bind': '/root/workspace/', 'mode': 'rw'}}, ) @@ -202,30 +226,41 @@ def password_generator(length): random_str = "".join(secrets.choice(alphabet) for _ in range(length)) return random_str - def build_check_container(image_name: str, container_name: str): - client = docker.from_env() - dockerfile = ''' - FROM alpine:latest - CMD echo "compute-subnet" - ''' try: + client 
= docker.from_env() + dockerfile = ''' + FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime + CMD echo "compute-subnet" + ''' + # Create a file-like object from the Dockerfile f = BytesIO(dockerfile.encode('utf-8')) # Build the Docker image + bt.logging.info("Building the Docker image... this may take a few minutes during the initial installation.") image, _ = client.images.build(fileobj=f, tag=image_name) + bt.logging.trace(f"Docker image '{image_name}' built successfully.") # Create the container from the built image container = client.containers.create(image_name, name=container_name) + bt.logging.trace(f"Container '{container_name}' created successfully.") return container - except docker.errors.BuildError: + + except docker.errors.BuildError as e: pass - except docker.errors.APIError: + except docker.errors.APIError as e: pass + except Exception as e: + bt.logging.error( + "Insufficient permissions to execute Docker commands. Please ensure the current user is added to the 'docker' group " + "and has the necessary privileges. Run 'sudo usermod -aG docker $USER' and restart your session." 
+ ) finally: - client.close() - + try: + client.close() + except Exception as close_error: + bt.logging.warning(f"Error closing the Docker client: {close_error}") def build_sample_container(): """ @@ -244,19 +279,46 @@ def build_sample_container(): password = password_generator(10) # Step 1: Build the Docker image with an SSH server - dockerfile_content = ( - """ - FROM ubuntu - RUN apt-get update && apt-get install -y openssh-server - RUN mkdir -p /run/sshd && echo 'root:'{}'' | chpasswd - RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ - sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \ - sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \ - sed -i 's/#ListenAddress 0.0.0.0/ListenAddress 0.0.0.0/' /etc/ssh/sshd_config - RUN mkdir -p /root/.ssh/ && echo '{}' > /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys - CMD ["/usr/sbin/sshd", "-D"] - """.format(password, "") - ) + # Step 1: Build the Docker image with SSH server and install numpy + dockerfile_content = f""" + FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime + + # Prevent interactive prompts during package installation + ENV DEBIAN_FRONTEND=noninteractive + + # Install SSH server and necessary packages + RUN apt-get update && \\ + apt-get install -y --no-install-recommends openssh-server python3-pip build-essential && \\ + mkdir /var/run/sshd && \\ + echo 'root:{password}' | chpasswd && \\ + sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \\ + sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \\ + sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \\ + sed -i 's/#ListenAddress 0.0.0.0/ListenAddress 0.0.0.0/' /etc/ssh/sshd_config + RUN mkdir -p /root/.ssh/ && echo '{""}' > /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys + + # 
Install numpy + RUN pip3 install --upgrade pip && \\ + pip3 install numpy==1.24.3 && \\ + apt-get clean && \\ + rm -rf /var/lib/apt/lists/* + + # Start SSH daemon + CMD ["/usr/sbin/sshd", "-D"] + """ + # dockerfile_content = ( + # """ + # FROM ubuntu + # RUN apt-get update && apt-get install -y openssh-server + # RUN mkdir -p /run/sshd && echo 'root:'{}'' | chpasswd + # RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + # sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \ + # sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \ + # sed -i 's/#ListenAddress 0.0.0.0/ListenAddress 0.0.0.0/' /etc/ssh/sshd_config + # RUN mkdir -p /root/.ssh/ && echo '{}' > /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys + # CMD ["/usr/sbin/sshd", "-D"] + # """.format(password, "") + # ) # Ensure the tmp directory exists within the current directory tmp_dir_path = os.path.join('.', 'tmp') @@ -296,10 +358,10 @@ def restart_container(): running_container.restart() return {"status": True} else: - bt.logging.info("Unable to find container") + bt.logging.info("No running container.") return {"status": False} except Exception as e: - bt.logging.info(f"Error restart container {e}") + bt.logging.info(f"Error restart container: {e}") return {"status": False} def pause_container(): diff --git a/neurons/Validator/calculate_pow_score.py b/neurons/Validator/calculate_pow_score.py index e2e011c..667e8f0 100644 --- a/neurons/Validator/calculate_pow_score.py +++ b/neurons/Validator/calculate_pow_score.py @@ -29,111 +29,34 @@ def normalize(val, min_value, max_value): return (val - min_value) / (max_value - min_value) - def prevent_none(val): return 0 if not val else val - -# Calculate score based on the performance information -def calc_score(response, hotkey, allocated_hotkeys, penalized_hotkeys, validator_hotkeys, mock=False): - """ - Method to calculate the 
score attributed to this miner dual uid - hotkey - :param response: - { - 'challenge_attempts': 7, - 'challenge_successes': 6, - 'challenge_failed': 0, - 'challenge_elapsed_time_avg': 5.804196675618489, - 'challenge_difficulty_avg': 2.0, - 'has_docker': True, - } - challenge_failed is batched over the last 10 challenges only - :param hotkey: - :param mock: During testing phase - :return: - """ +def calc_score_pog(gpu_specs, hotkey, allocated_hotkeys, config_data, mock=False): try: - challenge_attempts = prevent_none(response.get("challenge_attempts",1)) - challenge_successes = prevent_none(response.get("challenge_successes",0)) - last_20_challenge_failed = prevent_none(response.get("last_20_challenge_failed",0)) - challenge_elapsed_time_avg = prevent_none(response.get("challenge_elapsed_time_avg", compute.pow_timeout)) - challenge_difficulty_avg = prevent_none(response.get("last_20_difficulty_avg", compute.pow_min_difficulty)) - has_docker = response.get("has_docker", False) - - # Define base weights for the PoW - success_weight = 1 - difficulty_weight = 1 - time_elapsed_weight = 0.3 - failed_penalty_weight = 0.4 - allocation_weight = 0.21 - - # Just in case but in theory, it is not possible to fake the difficulty as it is sent by the validator - # Same occurs for the time, it's calculated by the validator so miners cannot fake it - - # Difficulty, score range: [0,100] * difficulty_weight - difficulty_val = max(min(challenge_difficulty_avg, compute.pow_max_difficulty),compute.pow_min_difficulty) - difficulty_modifier = percent(difficulty_val,compute.pow_max_difficulty) - - difficulty = difficulty_modifier * difficulty_weight - - # Success ratio, score range: [0,100] * success_weight - successes_ratio = percent(challenge_successes, challenge_attempts) - successes = successes_ratio * success_weight - - # Time elapsed, score range: [0,100] * time_elapsed_weight - # Modifier for elapsed time effect - time_elapsed_modifier = percent_yield(challenge_elapsed_time_avg, 
compute.pow_timeout) - time_elapsed = time_elapsed_modifier * time_elapsed_weight - - # Failed penalty, score range [0,100] * failed_penalty_weight - # Failed penalty has exponential weigt, the higher the failure rate, the higher the penalty - failed_penalty_exp = 1.5 - last_20_challenge_failed_modifier = percent(last_20_challenge_failed, 20) #Normalize with defined limits (0,100) - failed_penalty = failed_penalty_weight * (last_20_challenge_failed_modifier/100)**failed_penalty_exp*100 - - # Allocation, score range [0, 100] * allocation_weight - # The score for allocation is proportional to the average difficulty reached before allocation - allocation_score = difficulty_modifier * allocation_weight - allocation_status = hotkey in allocated_hotkeys - - # Calculate the score - max_score_challenge = 100 * (success_weight + difficulty_weight + time_elapsed_weight) - max_score_allocation = 100 * allocation_weight - max_score = max_score_challenge + max_score_allocation - final_score = difficulty + successes + time_elapsed - failed_penalty - - # Docker and specs penalty - penalty = not(has_docker) - - if allocation_status: - final_score = max_score_challenge * (1-allocation_weight) + allocation_score - else: - final_score = difficulty + successes + time_elapsed - failed_penalty - if penalty: - final_score = final_score/2 + gpu_data = config_data["gpu_performance"] + gpu_scores = gpu_data.get("gpu_scores", {}) + # Get the GPU with the maximum score + max_gpu = max(gpu_scores, key=gpu_scores.get) + max_score = gpu_scores[max_gpu]*8 + score_factor = 100/max_score - if (last_20_challenge_failed >= 19 or challenge_successes == 0) and not allocation_status: - return 0 + gpu_name = gpu_specs.get("gpu_name") + num_gpus = min(gpu_specs.get("num_gpus"), 8) - # Penalize miners if their hotkey is in the penalized_hotkeys list - if hotkey in penalized_hotkeys: - # Calculate the penalty factor based on the proportion of penalized hotkeys - penalty_count = 
penalized_hotkeys.count(hotkey) - half_validators = len(validator_hotkeys) / 2 + # Get GPU score + score = gpu_scores.get(gpu_name) * num_gpus * score_factor - # If penalty count equals or exceeds half of the validators, set score to 0 - if penalty_count >= half_validators: - final_score = 0 - else: - penalty_multiplier = max(1 - (penalty_count / half_validators), 0) - final_score *= penalty_multiplier + # Add allocation score, i.e. max un-allocated score = 100 + if hotkey in allocated_hotkeys: + score += 100 - # Final score is > 0 - final_score = max(0, final_score) + # Logging score + bt.logging.info(f"Score - {hotkey}: {score:.2f}/200") # Normalize the score - normalized_score = normalize(final_score, 0, max_score) + normalized_score = normalize(score, 0, 200) return normalized_score except Exception as e: bt.logging.error(f"An error occurred while calculating score for the following hotkey - {hotkey}: {e}") - return 0 + return 0 \ No newline at end of file diff --git a/neurons/Validator/database/miner.py b/neurons/Validator/database/miner.py index c7e3f39..1654e9f 100644 --- a/neurons/Validator/database/miner.py +++ b/neurons/Validator/database/miner.py @@ -79,6 +79,10 @@ def purge_miner_entries(db: ComputeDb, uid: int, hotkey: str): "DELETE FROM challenge_details WHERE uid = ? 
AND ss58_address = ?", (uid, hotkey), ) + cursor.execute( + "DELETE FROM miner_details WHERE hotkey = ?", + (hotkey,), + ) db.conn.commit() if cursor.rowcount > 0: diff --git a/neurons/Validator/database/pog.py b/neurons/Validator/database/pog.py new file mode 100644 index 0000000..a3bdba1 --- /dev/null +++ b/neurons/Validator/database/pog.py @@ -0,0 +1,97 @@ +# The MIT License (MIT) +# Copyright © 2023 Rapiiidooo +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the “Software”), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of +# the Software. +# +# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +import datetime + +import bittensor as bt + +from compute.utils.db import ComputeDb + +def update_pog_stats(db: ComputeDb, hotkey, gpu_name, num_gpus): + """ + Inserts a new GPU spec entry for a given hotkey and ensures that only + the latest three entries are retained. + + :param hotkey: The miner's hotkey identifier. + :param gpu_name: The name/model of the GPU. + :param num_gpus: The number of GPUs. 
+ """ + cursor = db.get_cursor() + try: + # Insert the new GPU spec + cursor.execute( + """ + INSERT INTO pog_stats (hotkey, gpu_name, num_gpus) + VALUES (?, ?, ?) + """, + (hotkey, gpu_name, num_gpus) + ) + + # Delete older entries if more than 4 exist for the hotkey + cursor.execute( + """ + DELETE FROM pog_stats + WHERE id NOT IN ( + SELECT id FROM pog_stats + WHERE hotkey = ? + ORDER BY created_at DESC + LIMIT 4 + ) + AND hotkey = ? + """, + (hotkey, hotkey) + ) + + db.conn.commit() + # bt.logging.info(f"Updated pog_stats for hotkey: {hotkey}") + except Exception as e: + db.conn.rollback() + # bt.logging.error(f"Error updating pog_stats for {hotkey}: {e}") + finally: + cursor.close() + +def get_pog_specs(db: ComputeDb, hotkey): + """ + Retrieves the most recent GPU spec entry for a given hotkey where gpu_name is not None. + + :param hotkey: The miner's hotkey identifier. + :return: A dictionary with 'gpu_name' and 'num_gpus' or None if no valid entries exist. + """ + cursor = db.get_cursor() + try: + cursor.execute( + """ + SELECT gpu_name, num_gpus + FROM pog_stats + WHERE hotkey = ? 
AND gpu_name IS NOT NULL AND num_gpus IS NOT NULL + ORDER BY created_at DESC + LIMIT 1 + """, + (hotkey,) + ) + row = cursor.fetchone() + if row: + gpu_name, num_gpus = row + # bt.logging.info(f"Retrieved pog_stats for hotkey {hotkey}: GPU Name={gpu_name}, Num GPUs={num_gpus}") + return {"gpu_name": gpu_name, "num_gpus": num_gpus} + else: + # bt.logging.warning(f"No valid pog_stats found for hotkey {hotkey}") + return None + except Exception as e: + # bt.logging.error(f"Error retrieving pog_stats for {hotkey}: {e}") + return None + finally: + cursor.close() diff --git a/neurons/Validator/miner_script_m_merkletree.py b/neurons/Validator/miner_script_m_merkletree.py new file mode 100644 index 0000000..2342e72 --- /dev/null +++ b/neurons/Validator/miner_script_m_merkletree.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import torch +import time +import sys +import os +import numpy as np +import hashlib +from multiprocessing.pool import ThreadPool +from concurrent.futures import ThreadPoolExecutor, as_completed +import argparse +import json +import gc + +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512" + +import subprocess +import sys + +def get_gpu_info(): + """ + Detect the number and types of GPUs available on the system. + + Returns: + dict: Dictionary containing the number of GPUs and their names. 
+ """ + if not torch.cuda.is_available(): + return {"num_gpus": 0, "gpu_names": []} + + num_gpus = torch.cuda.device_count() + gpu_names = [torch.cuda.get_device_name(i) for i in range(num_gpus)] + + gpu_info = {"num_gpus": num_gpus, "gpu_names": gpu_names} + + print(json.dumps(gpu_info, indent=2)) + +def estimate_vram_size(buffer_factor=0.9, precision="fp16"): + dtype = torch.float16 if precision == "fp16" else torch.float32 + element_size = 2 if precision == "fp16" else 4 # Size of each element in bytes + total_elements = 1024 * 1024 # Start with a 1MB array + + try: + while True: + arr = torch.empty((total_elements,), dtype=dtype, device="cuda") + total_elements *= 2 + except RuntimeError: + total_elements //= 2 # Step back to last successful allocation + vram_bytes = total_elements * element_size + usable_vram = vram_bytes / (buffer_factor * 1e9) # Convert to GB + return usable_vram + +def adjust_matrix_size(vram, element_size=2, buffer_factor=0.8): + usable_vram = vram * buffer_factor * 1e9 # Usable VRAM in bytes + max_size = int((usable_vram / (2 * element_size)) ** 0.5) # Max size fitting in VRAM + aligned_size = (max_size // 32) * 32 # Ensure alignment to multiple of 32 + return aligned_size + +def get_seeds(): + """Read n and seeds from /tmp/seeds.txt.""" + if not os.path.exists('/tmp/seeds.txt'): + print("Seeds file not found.") + sys.exit(1) + with open('/tmp/seeds.txt', 'r') as f: + content = f.read().strip() + lines = content.split('\n') + n = int(lines[0]) + seeds = {} + for line in lines[1:]: + gpu_id, s_A, s_B = line.strip().split() + gpu_id = int(gpu_id) + s_A = int(s_A) + s_B = int(s_B) + seeds[gpu_id] = (s_A, s_B) + return n, seeds + +def get_challenge_indices(): + """Read challenge indices from /tmp/challenge_indices.txt.""" + if not os.path.exists('/tmp/challenge_indices.txt'): + print("Challenge indices file not found.") + sys.exit(1) + with open('/tmp/challenge_indices.txt', 'r') as f: + content = f.read().strip() + lines = 
content.split('\n') + indices = {} + for line in lines: + gpu_id, idx_str = line.strip().split() + gpu_id = int(gpu_id) + idx_list = [tuple(map(int, idx.split(','))) for idx in idx_str.split(';')] + indices[gpu_id] = idx_list + return indices + + +def build_merkle_tree_rows(C, hash_func=hashlib.sha256, num_threads=None): + if num_threads is None: + num_threads = 8 + + n = C.shape[0] + + # Hash each row of C using the specified hash function + def hash_row(i): + return hash_func(C[i, :].tobytes()).digest() + + # Parallelize row hashing + with ThreadPool(num_threads) as pool: + leaves = pool.map(hash_row, range(n)) + + tree = leaves.copy() + num_leaves = len(leaves) + offset = 0 + + # Function to hash pairs of nodes using the specified hash function + def hash_pair(i): + left = tree[offset + i] + if i + 1 < num_leaves: + right = tree[offset + i + 1] + else: + right = left # Duplicate if odd number of leaves + return hash_func(left + right).digest() + + # Build the Merkle tree + while num_leaves > 1: + with ThreadPool(num_threads) as pool: + # Process pairs of leaves + new_level = pool.map(hash_pair, range(0, num_leaves, 2)) + tree.extend(new_level) + offset += num_leaves + num_leaves = len(new_level) + + root_hash = tree[-1] + return root_hash, tree + +def get_merkle_proof_row(tree, row_index, total_leaves): + proof = [] + idx = row_index + num_leaves = total_leaves + offset = 0 + while num_leaves > 1: + sibling_idx = idx ^ 1 + if sibling_idx < num_leaves: + sibling_hash = tree[offset + sibling_idx] + else: + sibling_hash = tree[offset + idx] # Duplicate if sibling is missing + proof.append(sibling_hash) + idx = idx // 2 + offset += num_leaves + num_leaves = (num_leaves + 1) // 2 + return proof + +def xorshift32_torch(state): + state = state.type(torch.int64) + x = state & 0xFFFFFFFF + x = x ^ ((x << 13) & 0xFFFFFFFF) + x = x ^ ((x >> 17) & 0xFFFFFFFF) + x = x ^ ((x << 5) & 0xFFFFFFFF) + x = x & 0xFFFFFFFF + return x + +def generate_matrix_torch(s, n): + device = 
torch.device('cuda' if torch.cuda.is_available() else 'cpu') + dtype = torch.int64 + + # Prepare indices + i_indices = torch.arange(n, dtype=dtype, device=device).repeat_interleave(n) + j_indices = torch.arange(n, dtype=dtype, device=device).repeat(n) + + # Convert s to signed 64-bit integer + s_signed = (s + 2**63) % 2**64 - 2**63 + s_tensor = torch.tensor(s_signed, dtype=dtype, device=device) + + # Modify the computation to prevent overflow + # Since 4294967296 mod 2^32 is 0, we can adjust the computation + i_mod = i_indices % (2**32) + states = (s_tensor + i_mod + j_indices) & 0xFFFFFFFF + + for _ in range(10): + states = xorshift32_torch(states) + + matrix = (states.float() / float(0xFFFFFFFF)).reshape(n, n) + return matrix + +def run_benchmark(): + # Detect number of GPUs + num_gpus = torch.cuda.device_count() + + # Estimate available VRAM + estimated_vram = estimate_vram_size(buffer_factor=1.0, precision="fp16") + + # Adjust matrix sizes + matrix_size_fp16 = adjust_matrix_size(estimated_vram, element_size=2, buffer_factor=1.0) + matrix_size_fp32 = adjust_matrix_size(estimated_vram, element_size=4, buffer_factor=0.5) + + # Run benchmarks + elapsed_time_fp16 = benchmark_matrix_multiplication(matrix_size_fp16, precision="fp16") + elapsed_time_fp32 = benchmark_matrix_multiplication(matrix_size_fp32, precision="fp32") + + # Output results + print(f"{num_gpus} {estimated_vram:.2f} {matrix_size_fp16} {elapsed_time_fp16:.6f} {matrix_size_fp32} {elapsed_time_fp32:.6f}") + +def benchmark_matrix_multiplication(size, precision="fp16"): + dtype = torch.float16 if precision == "fp16" else torch.float32 + A = torch.randn(size, size, dtype=dtype, device="cuda") + B = torch.randn(size, size, dtype=dtype, device="cuda") + + torch.cuda.synchronize() + start_time = time.time() + torch.matmul(A, B) + torch.cuda.synchronize() + elapsed_time = time.time() - start_time + return elapsed_time + +def process_gpu(gpu_id, s_A, s_B, n): + """ + Process computations for a single GPU. 
+ + Args: + gpu_id (int): ID of the GPU to use. + s_A (int): Seed for matrix A. + s_B (int): Seed for matrix B. + n (int): Size of the matrices. + + Returns: + tuple: (root_hash_result, gpu_timing_result) + """ + try: + # Set the current device + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Initialize timing dictionary + gpu_timing = {} + + # Step 2: Generate A and B with received seeds using PRNG + start_time_generation = time.time() + + # Clear cache before allocation + torch.cuda.empty_cache() + + # Generate A and B matrices + A_torch = generate_matrix_torch(s_A, n) + B_torch = generate_matrix_torch(s_B, n) + + end_time_generation = time.time() + generation_time = end_time_generation - start_time_generation + gpu_timing['generation_time'] = generation_time + gpu_timing['n'] = n + + # Step 3: Compute C on GPU + start_time_multiplication = time.time() + C_torch = torch.matmul(A_torch, B_torch) + torch.cuda.synchronize(device) # Ensure computation is finished + end_time_multiplication = time.time() + multiplication_time = end_time_multiplication - start_time_multiplication + gpu_timing['multiplication_time'] = multiplication_time + print(f"GPU {gpu_id}: Matrix multiplication time on GPU: {multiplication_time:.2f} seconds") + + # Step 4: Move C back to CPU for Merkle tree construction + start_time_transfer_back = time.time() + C = C_torch.cpu().numpy() + end_time_transfer_back = time.time() + transfer_back_time = end_time_transfer_back - start_time_transfer_back + gpu_timing['transfer_back_time'] = transfer_back_time + # Optional: Uncomment to log transfer time + # print(f"GPU {gpu_id}: Data transfer from GPU time: {transfer_back_time:.2f} seconds") + + # Step 5: Construct Merkle tree over rows of C + start_time_merkle = time.time() + root_hash, merkle_tree = build_merkle_tree_rows(C) + end_time_merkle = time.time() + merkle_tree_time = end_time_merkle - start_time_merkle + gpu_timing['merkle_tree_time'] = merkle_tree_time + # 
Optional: Uncomment to log Merkle tree construction time and root hash + # print(f"GPU {gpu_id}: Merkle tree over rows construction time: {merkle_tree_time:.2f} seconds") + # print(f"GPU {gpu_id}: Root hash: {root_hash.hex()}") + + # Save root hash and timings + root_hash_result = (gpu_id, root_hash.hex()) + gpu_timing_result = (gpu_id, gpu_timing) + + # Save Merkle tree and C for later proof generation + np.save(f'/dev/shm/merkle_tree_gpu_{gpu_id}.npy', merkle_tree) + np.save(f'/dev/shm/C_gpu_{gpu_id}.npy', C) + + # Free GPU memory + del A_torch, B_torch, C_torch, C, merkle_tree + torch.cuda.empty_cache() + gc.collect() + + return root_hash_result, gpu_timing_result + + except Exception as e: + print(f"Error processing GPU {gpu_id}: {e}") + return None, None + +def run_compute(): + """ + Run compute operations on all available GPUs in parallel. + """ + if not torch.cuda.is_available(): + print("Error: No GPU detected.") + sys.exit(1) + + # Detect number of GPUs + num_gpus = torch.cuda.device_count() + + # Read n and seeds + n, seeds = get_seeds() + + # Initialize lists to store root hashes and timings per GPU + root_hashes = [] + gpu_timings = [] + + # Use ThreadPoolExecutor to parallelize GPU tasks + with ThreadPoolExecutor(max_workers=num_gpus) as executor: + # Submit tasks for each GPU + futures = [] + for gpu_id in range(num_gpus): + s_A, s_B = seeds[gpu_id] + futures.append(executor.submit(process_gpu, gpu_id, s_A, s_B, n)) + + for future in as_completed(futures): + root_hash_result, gpu_timing_result = future.result() + if root_hash_result: + root_hashes.append(root_hash_result) + if gpu_timing_result: + gpu_timings.append(gpu_timing_result) + + # Output root hashes and timings + print(f"Root hashes: {json.dumps(root_hashes)}") + print(f"Timings: {json.dumps(gpu_timings)}") + +def run_proof_gpu(gpu_id, indices, num_gpus): + # Set the GPU device + torch.cuda.set_device(gpu_id) + + # Load data for the specific GPU + gpu_indices = indices[gpu_id] + merkle_tree 
= np.load(f'/dev/shm/merkle_tree_gpu_{gpu_id}.npy', allow_pickle=True) + C = np.load(f'/dev/shm/C_gpu_{gpu_id}.npy') + + # Start proof generation + start_time_proof = time.time() + responses = {'rows': [], 'proofs': [], 'indices': gpu_indices} + total_leaves = C.shape[0] + + for idx, (i, j) in enumerate(gpu_indices): + row = C[i, :] + proof = get_merkle_proof_row(merkle_tree, i, total_leaves) + responses['rows'].append(row) + responses['proofs'].append(proof) + + end_time_proof = time.time() + proof_time = end_time_proof - start_time_proof + print(f"GPU {gpu_id}: Proof generation time: {proof_time:.2f} seconds") + + # Save responses to shared memory + np.save(f'/dev/shm/responses_gpu_{gpu_id}.npy', responses) + +def run_proof(): + # Get the challenge indices + indices = get_challenge_indices() + num_gpus = torch.cuda.device_count() + + # Use ThreadPoolExecutor for parallel GPU processing + with ThreadPoolExecutor(max_workers=num_gpus) as executor: + futures = [ + executor.submit(run_proof_gpu, gpu_id, indices, num_gpus) + for gpu_id in range(num_gpus) + ] + # Wait for all threads to complete + for future in futures: + future.result() # To raise any exceptions that occurred in the threads + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Miner script for GPU proof.') + parser.add_argument('--mode', type=str, default='benchmark', + choices=['benchmark', 'compute', 'proof', 'gpu_info'], + help='Mode to run: benchmark, compute, proof, or gpu_info') + args = parser.parse_args() + + if args.mode == 'benchmark': + run_benchmark() + elif args.mode == 'compute': + run_compute() + elif args.mode == 'proof': + run_proof() + elif args.mode == 'gpu_info': + get_gpu_info() diff --git a/neurons/Validator/pog.py b/neurons/Validator/pog.py new file mode 100644 index 0000000..8b39073 --- /dev/null +++ b/neurons/Validator/pog.py @@ -0,0 +1,366 @@ +import paramiko +import hashlib +import numpy as np +import os +import time +import blake3 +import secrets # 
For secure random seed generation +import json +import tempfile +import yaml +import torch +import bittensor as bt + +def load_yaml_config(file_path): + """ + Load GPU performance data from a YAML file. + """ + try: + with open(file_path, "r") as f: + data = yaml.safe_load(f) + return data + except FileNotFoundError: + raise FileNotFoundError(f"The file {file_path} does not exist.") + except yaml.YAMLError as e: + raise ValueError(f"Error decoding YAML file {file_path}: {e}") + +def identify_gpu(fp16_tflops, fp32_tflops, estimated_avram, gpu_data, reported_name=None, tolerance_pairs=None): + """ + Identify GPU based on TFLOPS and AVRAM with a tolerance check for GPUs with similar fingerprints. + + Parameters: + fp16_tflops (float): Measured FP16 TFLOPS. + fp32_tflops (float): Measured FP32 TFLOPS. + estimated_avram (float): Estimated available VRAM in GB. + reported_name (str): GPU name reported by the system (optional). + tolerance_pairs (dict): Dictionary of GPUs with similar performance to apply tolerance adjustments. + + Returns: + str: Identified GPU name with tolerance handling. 
+ """ + tolerance_pairs = tolerance_pairs or {} # Default to empty dict if not provided + GPU_TFLOPS_FP16 = gpu_data["GPU_TFLOPS_FP16"] + GPU_TFLOPS_FP32 = gpu_data["GPU_TFLOPS_FP32"] + GPU_AVRAM = gpu_data["GPU_AVRAM"] + + combined_scores = [] + for gpu in GPU_TFLOPS_FP16.keys(): + fp16_theoretical = GPU_TFLOPS_FP16[gpu] + fp32_theoretical = GPU_TFLOPS_FP32[gpu] + avram_theoretical = GPU_AVRAM[gpu] + + fp16_deviation = abs(fp16_tflops - fp16_theoretical) / fp16_theoretical + fp32_deviation = abs(fp32_tflops - fp32_theoretical) / fp32_theoretical + avram_deviation = abs(estimated_avram - avram_theoretical) / avram_theoretical + + combined_score = (fp16_deviation + fp32_deviation + avram_deviation) / 3 + combined_scores.append((gpu, combined_score)) + + # Sort by the lowest deviation + identified_gpu = sorted(combined_scores, key=lambda x: x[1])[0][0] + + # Tolerance handling for nearly identical GPUs + if reported_name: + # Check if identified GPU matches the tolerance pair + if identified_gpu in tolerance_pairs and reported_name == tolerance_pairs.get(identified_gpu): + bt.logging.trace(f"[Tolerance Adjustment] Detected GPU {identified_gpu} matches reported GPU {reported_name}.") + identified_gpu = reported_name + # Check if reported GPU matches the tolerance pair in reverse + elif reported_name in tolerance_pairs and identified_gpu == tolerance_pairs.get(reported_name): + bt.logging.trace(f"[Tolerance Adjustment] Reported GPU {reported_name} matches detected GPU {identified_gpu}.") + identified_gpu = reported_name + + return identified_gpu + +def compute_script_hash(script_path): + with open(script_path, "rb") as f: + return hashlib.sha256(f.read()).hexdigest() + +def send_script_and_request_hash(ssh_client, script_path): + sftp = ssh_client.open_sftp() + sftp.put(script_path, "/tmp/miner_script.py") + sftp.close() + + # Command to compute the hash on the remote server + hash_command = """ + /opt/conda/bin/python -c " +import hashlib +with 
open('/tmp/miner_script.py', 'rb') as f: + computed_hash = hashlib.sha256(f.read()).hexdigest() +print(computed_hash) +" + """ + stdin, stdout, stderr = ssh_client.exec_command(hash_command) + computed_hash = stdout.read().decode().strip() + hash_error = stderr.read().decode().strip() + + if hash_error: + raise RuntimeError(f"Hash computation failed: {hash_error}") + return computed_hash + +def execute_script_on_miner(ssh_client, mode): + execution_command = f"/opt/conda/bin/python /tmp/miner_script.py --mode {mode}" + stdin, stdout, stderr = ssh_client.exec_command(execution_command) + execution_output = stdout.read().decode().strip() + execution_error = stderr.read().decode().strip() + + if execution_error: + raise RuntimeError(f"Script execution failed: {execution_error}") + return execution_output + +def parse_benchmark_output(output): + try: + parts = output.strip().split() + num_gpus = int(parts[0]) # First value is the number of GPUs + vram = float(parts[1]) + size_fp16 = int(parts[2]) + time_fp16 = float(parts[3]) + size_fp32 = int(parts[4]) + time_fp32 = float(parts[5]) + return num_gpus, vram, size_fp16, time_fp16, size_fp32, time_fp32 + except (ValueError, IndexError) as e: + raise ValueError(f"Failed to parse execution output: {output}") from e + +def parse_merkle_output(output): + try: + lines = output.strip().split('\n') + root_hashes_line = None + timings_line = None + for line in lines: + if line.startswith('Root hashes:'): + root_hashes_line = line + elif line.startswith('Timings:'): + timings_line = line + if root_hashes_line is None or timings_line is None: + raise ValueError("Output does not contain root hashes or timings") + # Parse root hashes + root_hashes_str = root_hashes_line.split(': ', 1)[1] + root_hashes = json.loads(root_hashes_str) # List of tuples (gpu_id, root_hash) + + # Parse timings + timings_str = timings_line.split(': ', 1)[1] + gpu_timings = json.loads(timings_str) # List of tuples (gpu_id, timings_dict) + + return 
root_hashes, gpu_timings + except (ValueError, IndexError, json.JSONDecodeError) as e: + raise ValueError(f"Failed to parse execution output: {output}") from e + +def get_random_seeds(num_gpus): + seeds = {} + for gpu_id in range(num_gpus): + s_A = secrets.randbits(64) + s_B = secrets.randbits(64) + seeds[gpu_id] = (s_A, s_B) + return seeds + +def send_seeds(ssh_client, seeds, n): + lines = [str(n)] # First line is n + for gpu_id in seeds.keys(): + s_A, s_B = seeds[gpu_id] + line = f"{gpu_id} {s_A} {s_B}" + lines.append(line) + content = '\n'.join(lines) + command = f"echo '{content}' > /tmp/seeds.txt" + stdin, stdout, stderr = ssh_client.exec_command(command) + stdout.channel.recv_exit_status() + +def send_challenge_indices(ssh_client, indices): + lines = [] + for gpu_id in indices.keys(): + idx_list = indices[gpu_id] + indices_str = ';'.join([f"{i},{j}" for i, j in idx_list]) + line = f"{gpu_id} {indices_str}" + lines.append(line) + content = '\n'.join(lines) + command = f"echo '{content}' > /tmp/challenge_indices.txt" + stdin, stdout, stderr = ssh_client.exec_command(command) + stdout.channel.recv_exit_status() + +def receive_responses(ssh_client, num_gpus): + responses = {} + try: + with ssh_client.open_sftp() as sftp, tempfile.TemporaryDirectory() as temp_dir: + for gpu_id in range(num_gpus): + remote_path = f'/dev/shm/responses_gpu_{gpu_id}.npy' + local_path = f'{temp_dir}/responses_gpu_{gpu_id}.npy' + + try: + sftp.get(remote_path, local_path) + response = np.load(local_path, allow_pickle=True) + responses[gpu_id] = response.item() + except Exception as e: + print(f"Error processing GPU {gpu_id}: {e}") + responses[gpu_id] = None + except Exception as e: + print(f"SFTP connection error: {e}") + + return responses + +def xorshift32_numpy(state): + state = np.uint64(state) + x = state & np.uint64(0xFFFFFFFF) + x ^= (np.uint64((x << np.uint64(13)) & np.uint64(0xFFFFFFFF))) + x ^= (np.uint64((x >> np.uint64(17)) & np.uint64(0xFFFFFFFF))) + x ^= (np.uint64((x << 
np.uint64(5)) & np.uint64(0xFFFFFFFF))) + x = x & np.uint64(0xFFFFFFFF) + return x + +def generate_prng_value(s, i, j): + s = np.uint64(s) + i = np.uint64(i % np.uint64(2**32)) + j = np.uint64(j) + state = (s + i + j) & np.uint64(0xFFFFFFFF) + + for _ in range(10): + state = xorshift32_numpy(state) + + return state / float(0xFFFFFFFF) + +def verify_responses(seeds, root_hashes, responses, indices, n): + """ + Verifies the responses from GPUs by checking computed values and Merkle proofs. + + Parameters: + seeds (dict): Seeds used for generating PRNG values for each GPU. + root_hashes (dict): Merkle root hashes for each GPU. + responses (dict): Responses from each GPU containing computed rows and proofs. + indices (dict): Challenge indices for each GPU. + n (int): Total number of leaves in the Merkle tree. + + Returns: + bool: True if verification passes within the allowed failure threshold, False otherwise. + """ + verification_passed = True + failed_gpus = [] + num_gpus = len(root_hashes.keys()) + + # Define the minimum number of GPUs that must pass verification + if num_gpus == 4: + required_passes = 3 + elif num_gpus > 4: + # For systems with more than 4 GPUs, adjust the required_passes as needed + # Example: Require at least 75% to pass + required_passes = int(np.ceil(0.75 * num_gpus)) + else: + # For systems with 2 or fewer GPUs, require all to pass + required_passes = num_gpus + + for gpu_id in root_hashes.keys(): + s_A, s_B = seeds[gpu_id] + gpu_indices = indices[gpu_id] + response = responses[gpu_id] + root_hash = root_hashes[gpu_id] + total_leaves = n + + gpu_failed = False # Flag to track if the current GPU has failed + + for idx, (i, j) in enumerate(gpu_indices): + # Generate only the necessary row and column entries using PRNG + A_row = np.array([generate_prng_value(s_A, i, col) for col in range(n)], dtype=np.float32) + B_col = np.array([generate_prng_value(s_B, row, j) for row in range(n)], dtype=np.float32) + + # Compute C_{i,j} as the dot product of 
A_row and B_col + value_validator = np.dot(A_row, B_col) + + # Retrieve miner's computed value and corresponding Merkle proof + row_miner = response['rows'][idx] + proof = response['proofs'][idx] + value_miner = row_miner[j] + + # Check if the miner's value matches the expected value + if not np.isclose(value_miner, value_validator, atol=1e-5): + bt.logging.trace(f"[Verification] GPU {gpu_id}: Value mismatch at index ({i}, {j}).") + gpu_failed = True + break # Exit the loop for this GPU as it has already failed + + # Verify the Merkle proof for the row + if not verify_merkle_proof_row(row_miner, proof, bytes.fromhex(root_hash), i, total_leaves): + bt.logging.trace(f"[Verification] GPU {gpu_id}: Invalid Merkle proof at index ({i}).") + gpu_failed = True + break # Exit the loop for this GPU as it has already failed + + if gpu_failed: + failed_gpus.append(gpu_id) + bt.logging.trace(f"[Verification] GPU {gpu_id} failed verification.") + else: + bt.logging.trace(f"[Verification] GPU {gpu_id} passed verification.") + + # Calculate the number of GPUs that passed verification + passed_gpus = num_gpus - len(failed_gpus) + + # Determine if verification passes based on the required_passes + if passed_gpus >= required_passes: + verification_passed = True + bt.logging.trace(f"[Verification] SUCCESS: {passed_gpus} out of {num_gpus} GPUs passed verification.") + if len(failed_gpus) > 0: + bt.logging.trace(f" Note: {len(failed_gpus)} GPU(s) failed verification but within allowed threshold.") + else: + verification_passed = False + bt.logging.trace(f"[Verification] FAILURE: Only {passed_gpus} out of {num_gpus} GPUs passed verification.") + if len(failed_gpus) > 0: + bt.logging.trace(f" {len(failed_gpus)} GPU(s) failed verification which exceeds the allowed threshold.") + + return verification_passed + +def verify_merkle_proof_row(row, proof, root_hash, index, total_leaves, hash_func=hashlib.sha256): + """ + Verifies a Merkle proof for a given row. 
+ + Parameters: + - row (np.ndarray): The data row to verify. + - proof (list of bytes): The list of sibling hashes required for verification. + - root_hash (bytes): The root hash of the Merkle tree. + - index (int): The index of the row in the tree. + - total_leaves (int): The total number of leaves in the Merkle tree. + - hash_func (callable): The hash function to use (default: hashlib.sha256). + + Returns: + - bool: True if the proof is valid, False otherwise. + """ + # Initialize the computed hash with the hash of the row using the specified hash function + computed_hash = hash_func(row.tobytes()).digest() + idx = index + num_leaves = total_leaves + + # Iterate through each sibling hash in the proof + for sibling_hash in proof: + if idx % 2 == 0: + # If the current index is even, concatenate computed_hash + sibling_hash + combined = computed_hash + sibling_hash + else: + # If the current index is odd, concatenate sibling_hash + computed_hash + combined = sibling_hash + computed_hash + # Compute the new hash using the specified hash function + computed_hash = hash_func(combined).digest() + # Move up to the next level + idx = idx // 2 + + # Compare the computed hash with the provided root hash + return computed_hash == root_hash + +def adjust_matrix_size(vram, element_size=2, buffer_factor=0.8): + usable_vram = vram * buffer_factor * 1e9 # Usable VRAM in bytes + max_size = int((usable_vram / (2 * element_size)) ** 0.5) # Max size fitting in VRAM + aligned_size = (max_size // 32) * 32 # Ensure alignment to multiple of 32 + return aligned_size + +def get_remote_gpu_info(ssh_client): + """ + Execute the miner script in gpu_info mode to get GPU information from the remote miner. + + Args: + ssh_client (paramiko.SSHClient): SSH client connected to the miner. + + Returns: + dict: Dictionary containing GPU information (number and names). 
+ """ + command = "/opt/conda/bin/python /tmp/miner_script.py --mode gpu_info" + stdin, stdout, stderr = ssh_client.exec_command(command) + + output = stdout.read().decode().strip() + error = stderr.read().decode().strip() + + if error: + raise RuntimeError(f"Failed to get GPU info: {error}") + + return json.loads(output) \ No newline at end of file diff --git a/neurons/miner.py b/neurons/miner.py index 37b2946..57b0656 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -103,7 +103,7 @@ def subtensor(self) -> ComputeSubnetSubtensor: return self._subtensor @property - def metagraph(self) -> bt.metagraph: + def metagraph(self) -> bt.metagraph: # type: ignore return self._metagraph @property @@ -171,8 +171,6 @@ def __init__(self): self.hashcat_workload_profile = self.config.miner_hashcat_workload_profile self.hashcat_extended_options = self.config.miner_hashcat_extended_options - check_hashcat_version(hashcat_path=self.hashcat_path) - self.uids: list = self.metagraph.uids.tolist() self.sync_status() @@ -183,29 +181,27 @@ def __init__(self): self.wandb.update_specs() # check allocation status + self.allocation_status = False self.__check_alloaction_errors() - # Disable the Spec request and replaced with WanDB - # self.request_specs_processor = RequestSpecsProcessor() - self.last_updated_block = self.current_block - (self.current_block % 100) self.allocate_action = False - # if ( - # not self.wandb.sync_allocated(self.wallet.hotkey.ss58_address) - # or not allocation_key_encoded - # ): - # self.miner_http_server = start_server(self.config.ssh.port) def __check_alloaction_errors(self): file_path = "allocation_key" allocation_key_encoded = None + valid_validator_hotkeys = self.get_valid_validator_hotkeys() + + allocated_hotkeys = self.wandb.get_allocated_hotkeys(valid_validator_hotkeys, True) + self.allocation_status = self.wallet.hotkey.ss58_address in allocated_hotkeys + if os.path.exists(file_path): # Open the file in read mode ('r') and read the data with 
open(file_path, "r") as file: allocation_key_encoded = file.read() if ( - not self.wandb.sync_allocated(self.wallet.hotkey.ss58_address) + not self.allocation_status and allocation_key_encoded ): # Decode the base64-encoded public key from the file @@ -222,6 +218,7 @@ def __check_alloaction_errors(self): bt.logging.info( "Container is already running without allocated. Killing the container." ) + def init_axon(self): # Step 6: Build and link miner functions to the axon. # The axon handles request processing, allowing validators to send this process requests. @@ -396,12 +393,6 @@ def base_priority(self, synapse: typing.Union[Specs, Allocate, Challenge]) -> fl # def priority_specs(self, synapse: Specs) -> float: # return self.base_priority(synapse) + miner_priority_specs - # This is the PerfInfo function, which decides the miner's response to a valid, high-priority request. - # def specs(self, synapse: Specs) -> Specs: - # app_data = synapse.specs_input - # synapse.specs_output = self.request_specs_processor.get_respond(app_data) - # return synapse - # The blacklist function decides if a request should be ignored. 
def blacklist_allocate(self, synapse: Allocate) -> typing.Tuple[bool, str]: return self.base_blacklist(synapse) @@ -481,7 +472,7 @@ def allocate(self, synapse: Allocate) -> Allocate: else: result = deregister_allocation(public_key) # self.miner_http_server = start_server(self.config.ssh.port) - synapse.output = result + synapse.output = result self.update_allocation(synapse) synapse.output["port"] = int(self.config.ssh.port) return synapse @@ -508,18 +499,18 @@ def challenge(self, synapse: Challenge) -> Challenge: f"{v_id}/{synapse.challenge_difficulty}/{synapse.challenge_hash[10:20]}" ) - result = run_miner_pow( - run_id=run_id, - _hash=synapse.challenge_hash, - salt=synapse.challenge_salt, - mode=synapse.challenge_mode, - chars=synapse.challenge_chars, - mask=synapse.challenge_mask, - hashcat_path=self.hashcat_path, - hashcat_workload_profile=self.hashcat_workload_profile, - hashcat_extended_options=self.hashcat_extended_options, - ) - synapse.output = result + # result = run_miner_pow( + # run_id=run_id, + # _hash=synapse.challenge_hash, + # salt=synapse.challenge_salt, + # mode=synapse.challenge_mode, + # chars=synapse.challenge_chars, + # mask=synapse.challenge_mask, + # hashcat_path=self.hashcat_path, + # hashcat_workload_profile=self.hashcat_workload_profile, + # hashcat_extended_options=self.hashcat_extended_options, + # ) + # synapse.output = result return synapse def get_updated_validator(self): @@ -596,6 +587,15 @@ def get_valid_validator(self) -> typing.List[typing.Tuple[int, str, int]]: valid_validator.append((uid, hotkey, version)) return valid_validator + def get_valid_validator_hotkeys(self): + valid_hotkeys = [] + valid_validator_uids = self.get_valid_validator_uids() + for uid in valid_validator_uids: + neuron = self.subtensor.neuron_for_uid(uid, self.config.netuid) + hotkey = neuron.hotkey + valid_hotkeys.append(hotkey) + return valid_hotkeys + def next_info(self, cond, next_block): if cond: return calculate_next_block_time(self.current_block, 
next_block) @@ -651,23 +651,9 @@ async def start(self): or block_next_sync_status < self.current_block ): block_next_sync_status = ( - self.current_block + 25 - ) # 25 ~ every 5 minutes + self.current_block + 75 + ) # 75 ~ every 15 minutes self.sync_status() - - # Check port open - # port = int(self.config.ssh.port) - # if port: - # result = check_port('localhost', port) - # if result is True: - # bt.logging.info(f"API: Port {port} on the server is open") - # elif result is False: - # bt.logging.info(f"API: Port {port} on the server is closed") - # else: - # bt.logging.warning(f"API: Could not determine status of port {port} on the server") - # else: - # bt.logging.warning(f"API: Could not find the server port that was provided to validator") - # self.wandb.update_miner_port_open(result) # check allocation status self.__check_alloaction_errors() @@ -695,8 +681,9 @@ async def start(self): f"Consensus: {self.metagraph.C[self.miner_subnet_uid]:.6f} | " f"Incentive: {self.metagraph.I[self.miner_subnet_uid]:.6f} | " f"Emission: {self.metagraph.E[self.miner_subnet_uid]:.6f} | " - f"update_validator: #{block_next_updated_validator} ~ {time_next_updated_validator} | " - f"sync_status: #{block_next_sync_status} ~ {time_next_sync_status}" + #f"update_validator: #{block_next_updated_validator} ~ {time_next_updated_validator} | " + f"Sync_status: #{block_next_sync_status} ~ {time_next_sync_status} | " + f"Allocated: {'Yes' if self.allocation_status else 'No'}" ) time.sleep(5) @@ -723,4 +710,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/neurons/miner_checker.py b/neurons/miner_checker.py new file mode 100644 index 0000000..e6cc5e6 --- /dev/null +++ b/neurons/miner_checker.py @@ -0,0 +1,194 @@ +import argparse +import os +import json +import threading +import time +import base64 +import RSAEncryption as rsa +import bittensor as bt +import paramiko # For SSH functionality +from compute.protocol import Allocate # Allocate is 
still needed for the actual allocation process +from compute.wandb.wandb import ComputeWandb # Importing ComputeWandb + +VALID_VALIDATOR_HOTKEYS = ["5GmvyePN9aYErXBBhBnxZKGoGk4LKZApE4NkaSzW62CYCYNA"] + +class MinerChecker: + def __init__(self, config): + self.config = config + self.metagraph = self.get_metagraph() # Retrieve metagraph state + self.axons = self.get_miners() # Retrieve the list of axons (miners) + self.validator_challenge_batch_size = 50 + self.threads = [] + self.wandb = ComputeWandb(config, bt.wallet(config=config), "validator.py") # Added ComputeWandb integration + self.penalized_hotkeys_checklist = [] # List of dictionaries for penalized miners + self.allocated_hotkeys = [] # Allocated miners that shouldn't be checked + + def get_metagraph(self): + """Retrieves the metagraph from subtensor.""" + subtensor = bt.subtensor(config=self.config) + return subtensor.metagraph(self.config.netuid) + + def get_miners(self): + """Retrieves a list of miners (axons) from the metagraph.""" + return self.metagraph.axons + + def sync_checklist(self): + """Batch process miners using threads, and generate a new penalized hotkey list.""" + self.threads = [] + self.penalized_hotkeys_checklist.clear() # Reset the penalized list for each run + self.metagraph = self.get_metagraph() # Retrieve metagraph state + self.axons = self.get_miners() # Retrieve the list of axons (miners) + + #Step 1: Fetch allocated hotkeys from wandb with an empty validator list and flag set to False + self.allocated_hotkeys = self.wandb.get_allocated_hotkeys(VALID_VALIDATOR_HOTKEYS, True) # Get allocated miners + # Step 2: Create threads for miners that are NOT allocated + for i in range(0, len(self.axons), self.validator_challenge_batch_size): + for axon in self.axons[i: i + self.validator_challenge_batch_size]: + if axon.hotkey in self.allocated_hotkeys: + bt.logging.info(f"Skipping allocated miner: {axon.hotkey}") + continue # skip this miner since it's allocated + + thread = 
threading.Thread(target=self.miner_checking_thread, args=(axon,), name=f"th_miner_checking_request-{axon.hotkey}", daemon=True) + self.threads.append(thread) + + # Start and join all threads + for thread in self.threads: + thread.start() + for thread in self.threads: + thread.join() + + # Update penalized hotkeys via wandb + # self.wandb.update_penalized_hotkeys_checklist(self.penalized_hotkeys_checklist) + self.write_penalized_hotkeys_to_file() + + bt.logging.info(f"Length of penalized hotkeys checklist: {len(self.penalized_hotkeys_checklist)}") + + def write_penalized_hotkeys_to_file(self, file_path="penalized_hotkeys.json"): + """Writes the penalized hotkeys checklist to a file on the disk.""" + try: + with open(file_path, 'w') as file: + json.dump(self.penalized_hotkeys_checklist, file, indent=4) + bt.logging.info(f"Penalized hotkeys written to {file_path}") + except Exception as e: + bt.logging.error(f"Error writing penalized hotkeys to file: {e}") + + def penalize_miner(self, hotkey, status_code, description): + """Adds a miner to the penalized list if it's not already penalized.""" + if not any(p['hotkey'] == hotkey for p in self.penalized_hotkeys_checklist): + self.penalized_hotkeys_checklist.append({ "hotkey": hotkey, "status_code": status_code, "description": description}) + bt.logging.info(f"Penalized miner {hotkey}: {status_code} - {description}") + else: + bt.logging.info(f"Miner {hotkey} already penalized, skipping.") + + + def miner_checking_thread(self, axon): + """Handles allocation, SSH access, and deallocation of a miner.""" + wallet = bt.wallet(config=self.config) + dendrite = bt.dendrite(wallet=wallet) + bt.logging.info(f"Quering for miner: {axon.hotkey}") + + is_ssh_access = True + allocation_status = False + private_key, public_key = rsa.generate_key_pair() + + device_requirement = {"cpu": {"count": 1}, "gpu": {}, "hard_disk": {"capacity": 1073741824}, "ram": {"capacity": 1073741824}} + + try: + check_allocation = dendrite.query(axon, 
Allocate(timeline=30, device_requirement=device_requirement, checking=True,), timeout=30) + + if check_allocation and check_allocation["status"] is True: + bt.logging.info(f"Successfully passed allocaton check: miner {axon.hotkey}") + # Simulate an allocation query with Allocate + response = dendrite.query(axon, Allocate(timeline=1, device_requirement=device_requirement, checking=False, public_key=public_key), timeout=60) + if response and response["status"] is True: + allocation_status = True + bt.logging.info(f"Successfully allocated miner {axon.hotkey}") + private_key = private_key.encode("utf-8") + decrypted_info_str = rsa.decrypt_data(private_key, base64.b64decode(response["info"])) + info = json.loads(decrypted_info_str) + # Use the SSH check function + is_ssh_access = self.check_ssh_login(axon.ip, info['port'], info['username'], info['password']) + else: + # Penalize if the allocation failed + self.penalize_miner(axon.hotkey, "ALLOCATION_FAILED", "Allocation failed during resource allocation") + else: + # Penalize if the allocation failed + self.penalize_miner(axon.hotkey, "ALLOCATION_FAILED", "Allocation check failed: not reachable/running container") + except Exception as e: + bt.logging.error(f"Error during allocation for {axon.hotkey}: {e}") + self.penalize_miner(axon.hotkey, "ALLOCATION_ERROR", f"Error during allocation: {str(e)}") + + # Deallocate resources if allocated, with a max retry count of 3 + retry_count = 0 + max_retries = 3 + while allocation_status and retry_count < max_retries: + try: + # Deallocation query + deregister_response = dendrite.query(axon, Allocate(timeline=0, checking=False, public_key=public_key), timeout=60) + if deregister_response and deregister_response["status"] is True: + allocation_status = False + bt.logging.info(f"Deallocated miner {axon.hotkey}") + break + else: + retry_count += 1 + bt.logging.error(f"Failed to deallocate miner {axon.hotkey} (attempt {retry_count}/{max_retries})") + if retry_count >= max_retries: + 
bt.logging.error(f"Max retries reached for deallocating miner {axon.hotkey}.") + self.penalize_miner(axon.hotkey, "DEALLOCATION_FAILED", "Failed to deallocate after max retries") + time.sleep(5) + except Exception as e: + retry_count += 1 + bt.logging.error(f"Error while trying to deallocate miner {axon.hotkey} (attempt {retry_count}/{max_retries}): {e}") + if retry_count >= max_retries: + bt.logging.error(f"Max retries reached for deallocating miner {axon.hotkey}.") + self.penalize_miner(axon.hotkey, "DEALLOCATION_FAILED", "Failed to deallocate after max retries") + time.sleep(5) + + if not is_ssh_access: + # Penalize if SSH access fails + self.penalize_miner(axon.hotkey, "SSH_ACCESS_DISABLED", "Failed SSH access") + + def check_ssh_login(self, host, port, username, password): + """Check SSH login using Paramiko.""" + try: + ssh_client = paramiko.SSHClient() + ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + ssh_client.connect(hostname=host, port=port, username=username, password=password, timeout=10) + bt.logging.info(f"SSH login successful for {host}") + return True + except paramiko.AuthenticationException: + bt.logging.error(f"Authentication failed for {host}") + return False + except paramiko.SSHException as ssh_exception: + bt.logging.error(f"Unable to establish SSH connection: {ssh_exception}") + return False + except Exception as e: + bt.logging.error(f"Exception in connecting to the server: {e}") + return False + finally: + ssh_client.close() + +def get_config(): + """Set up configuration using argparse.""" + parser = argparse.ArgumentParser() + parser.add_argument("--netuid", type=int, default=1, help="The chain subnet uid.") + bt.subtensor.add_args(parser) + bt.logging.add_args(parser) + bt.wallet.add_args(parser) + config = bt.config(parser) + # Ensure the logging directory exists + config.full_path = os.path.expanduser( "{}/{}/{}/netuid{}/{}".format( config.logging.logging_dir, config.wallet.name, config.wallet.hotkey, config.netuid, 
"validator",)) + return config + +def main(): + """Main function to run the miner checker loop.""" + config = get_config() + miner_checker = MinerChecker(config) + + while True: + miner_checker.sync_checklist() + bt.logging.info("Sleeping before next loop...") + time.sleep(900) # Sleep for 10 minutes before re-checking miners + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/neurons/register.py b/neurons/register.py index 3ea2407..e331038 100644 --- a/neurons/register.py +++ b/neurons/register.py @@ -32,7 +32,7 @@ from compute.protocol import Allocate from compute.utils.db import ComputeDb from compute.wandb.wandb import ComputeWandb -from neurons.Validator.database.allocate import select_allocate_miners_hotkey, update_allocation_db, update_blacklist_db, get_miner_details +from neurons.Validator.database.allocate import select_allocate_miners_hotkey, update_allocation_db, get_miner_details, update_blacklist_db from compute.utils.version import get_local_version from compute.utils.db import ComputeDb @@ -344,8 +344,10 @@ def allocate_hotkey(wandb): def deallocate(wandb): - # Get hotkey: - hotkey = input("Enter the hotkey of the resource to de-allocate: ") + # Get hotkey(s): * + hotkeys_input = input("Enter the hotkey(s) to de-allocate (comma-separated for multiple): ") # * + # Split the input by commas and strip any extra whitespace to create a list of hotkeys * + hotkey_list = [hotkey.strip() for hotkey in hotkeys_input.split(',')] # * config = get_config() @@ -376,45 +378,66 @@ def deallocate(wandb): db = ComputeDb() cursor = db.get_cursor() - try: - # Retrieve the allocation details for the given hotkey - cursor.execute("SELECT details, hotkey FROM allocation WHERE hotkey = ?", (hotkey,)) - row = cursor.fetchone() - - if row: - # Parse the JSON string in the 'details' column - info = json.loads(row[0]) - result_hotkey = row[1] - - username = info['username'] - password = info['password'] - port = info['port'] - ip = info['ip'] - 
regkey = info['regkey'] - - index = metagraph.hotkeys.index(hotkey) - axon = metagraph.axons[index] - deregister_response = dendrite.query( - axon, - Allocate(timeline=0, device_requirement="", checking=False, public_key=regkey), - timeout=60, - ) - if deregister_response and deregister_response["status"] is True: - print("Resource de-allocated successfully.") - else: - print("No Response from axon server, Resource de-allocated successfully .") + debug = False + last_batch = False - update_allocation_db(result_hotkey, info, False) - update_allocation_wandb(wandb) + try: + for hotkey in hotkey_list: # * + # Retrieve the allocation details for the given hotkey + cursor.execute("SELECT details, hotkey FROM allocation WHERE hotkey = ?", (hotkey,)) + row = cursor.fetchone() + + if row: + # Parse the JSON string in the 'details' column + info = json.loads(row[0]) + result_hotkey = row[1] + + username = info['username'] + password = info['password'] + port = info['port'] + ip = info['ip'] + regkey = info['regkey'] + + # Update database and wandb first to mark as deallocated + if not debug: + update_allocation_db(result_hotkey, info, False) # Mark as deallocated immediately + update_allocation_wandb(wandb) # Update `wandb` as well + print(f"Resource with hotkey {hotkey} marked as de-allocated in sql/wandb.") + + # Find the index in metagraph and locate the corresponding axon + try: + index = metagraph.hotkeys.index(hotkey) + axon = metagraph.axons[index] + + deregister_response = None + + if not debug: + deregister_response = dendrite.query( + axon, + Allocate(timeline=0, device_requirement="", checking=False, public_key=regkey), + timeout=60, + ) + if deregister_response and deregister_response["status"] is True: + print(f"Resource with hotkey {hotkey} de-allocated successfully.") # * + else: + print(f"No response from axon server for hotkey {hotkey}.") # * + except ValueError: + print(f"Hotkey {hotkey} not found in the metagraph.") # * + + 
#update_allocation_db(result_hotkey, info, False) + #update_allocation_wandb(wandb) - else: - print("No allocation details found for the provided hotkey.") + else: + print(f"No allocation details found for hotkey {hotkey}.") # * + print() except Exception as e: print(f"An error occurred during de-allocation: {e}") finally: cursor.close() db.close() + if last_batch: + update_allocation_wandb(wandb) # Update `wandb` as well def list_allocations(wandb): # Instantiate the connection to the db @@ -459,6 +482,47 @@ def list_allocations(wandb): cursor.close() db.close() +def list_allocations_hotkeys(wandb): + + config = get_config() + subtensor = bt.subtensor(config=config) + metagraph = subtensor.metagraph(config.netuid) + + # Instantiate the connection to the db + db = ComputeDb() + cursor = db.get_cursor() + + try: + # Retrieve all records from the allocation table + cursor.execute("SELECT id, hotkey, details FROM allocation") + rows = cursor.fetchall() + + # ANSI escape code for blue text + BLUE = '\033[94m' + RESET = '\033[0m' + print("=" * 80) # Print a separator line for the title + print(f"{BLUE}LIST OF ALLOCATED HOTKEYS IN METAGRAPH{RESET}") + print("=" * 80) # Print a separator line for the title + + # Filter hotkeys based on metagraph + metagraph_hotkeys = [axon.hotkey for axon in metagraph.axons] + filtered_hotkeys = [row[1] for row in rows if row[1] in metagraph_hotkeys] + + if not filtered_hotkeys: + print("No resources allocated that match the metagraph hotkeys.") + else: + for hotkey in filtered_hotkeys: + print(hotkey) + + # Print the total number of hotkeys at the end + print("=" * 80) + print(f"Total hotkeys: {len(filtered_hotkeys)}") + + except Exception as e: + print(f"An error occurred while retrieving allocation details: {e}") + finally: + cursor.close() + db.close() def list_resources(wandb): db = ComputeDb() @@ -704,7 +768,6 @@ def list_penalizations(wandb): else: print("No hotkeys are currently penalized.") - def print_welcome_message(): 
welcome_text = pyfiglet.figlet_format("Compute Subnet 27", width=120) print(welcome_text) @@ -739,9 +802,9 @@ def main(): parser_list = subparsers.add_parser('list_a', help='List allocated resources') parser_list.set_defaults(func=list_allocations) - # Subparser for the 'list_resources' - parser_list = subparsers.add_parser('list_r', help='List resources') - parser_list.set_defaults(func=list_resources) + # Subparser for the 'list_allocations_hotkeys' + parser_list = subparsers.add_parser('list_ah', help='List allocated resources, hotkeys') + parser_list.set_defaults(func=list_allocations_hotkeys) # Subparser for the 'penalize_hotkey' command parser_penalize_hotkey = subparsers.add_parser('p_hotkey', help='Penalize resource via hotkey') @@ -755,6 +818,10 @@ def main(): parser_list_penalizations = subparsers.add_parser('list_p', help='List penalized hotkeys') parser_list_penalizations.set_defaults(func=list_penalizations) + # Subparser for the 'list_resources' + parser_list = subparsers.add_parser('list_r', help='List resources') + parser_list.set_defaults(func=list_resources) + # Print help before entering the command loop parser.print_help() diff --git a/neurons/register_api.py b/neurons/register_api.py index 1893a5f..0daff4e 100644 --- a/neurons/register_api.py +++ b/neurons/register_api.py @@ -34,9 +34,12 @@ import socket from urllib3.exceptions import InsecureRequestWarning import urllib3 + +from neurons.Validator.database.pog import get_pog_specs urllib3.disable_warnings(InsecureRequestWarning) from dotenv import load_dotenv import math +from ipwhois import IPWhois # Import Compute Subnet Libraries import RSAEncryption as rsa from compute.axon import ComputeSubnetSubtensor @@ -65,15 +68,17 @@ from fastapi.concurrency import run_in_threadpool from pydantic import BaseModel, Field from typing import Optional, Union, List +from compute import (TRUSTED_VALIDATORS_HOTKEYS) # Constants DEFAULT_SSL_MODE = 2 # 1 for client CERT optional, 2 for client CERT_REQUIRED 
DEFAULT_API_PORT = 8903 # default port for the API DATA_SYNC_PERIOD = 600 # metagraph resync time -ALLOCATE_CHECK_PERIOD = 300 # timeout check period -ALLOCATE_CHECK_COUNT = 120 # maximum timeout count +ALLOCATE_CHECK_PERIOD = 180 # timeout check period +ALLOCATE_CHECK_COUNT = 20 # maximum timeout count MAX_NOTIFY_RETRY = 3 # maximum notify count NOTIFY_RETRY_PERIOD = 15 # notify retry interval +MAX_ALLOCATION_RETRY = 8 PUBLIC_WANDB_NAME = "opencompute" PUBLIC_WANDB_ENTITY = "neuralinternet" @@ -107,6 +112,7 @@ class Allocation(BaseModel): ssh_username: str = "" ssh_password: str = "" ssh_command: str = "" + status: str = "" ssh_key: str = "" uuid_key: str = "" @@ -136,6 +142,8 @@ class Resource(BaseModel): gpu_name: str = "" gpu_capacity: str = "" gpu_count: int = 1 + ip: str = "" + geo: str = "" ram: str = "0" hard_disk: str = "0" allocate_status: str = "" # "Avail." or "Res." @@ -177,7 +185,7 @@ def __init__( wallet: Optional[bt.wallet] = None, subtensor: Optional[bt.subtensor] = None, dendrite: Optional[bt.dendrite] = None, - metagraph: Optional[bt.metagraph] = None, + metagraph: Optional[bt.metagraph] = None, # type: ignore wandb: Optional[ComputeWandb] = None, ): @@ -664,30 +672,31 @@ async def deallocate(hotkey: str, uuid_key: str, request: Request, notify_flag: uuid_key_db = info["uuid"] if uuid_key_db == uuid_key: - index = self.metagraph.hotkeys.index(hotkey) - axon = self.metagraph.axons[index] - run_start = time.time() - retry_count = 0 - - while retry_count < MAX_NOTIFY_RETRY: - allocate_class = Allocate(timeline=0, device_requirement={}, checking=False, public_key=regkey) - deregister_response = await run_in_threadpool( - self.dendrite.query, axon, allocate_class, timeout=60 - ) - run_end = time.time() - time_eval = run_end - run_start - # bt.logging.info(f"API: Stop docker container in: {run_end - run_start:.2f} seconds") + if hotkey in self.metagraph.hotkeys: + index = self.metagraph.hotkeys.index(hotkey) + axon = self.metagraph.axons[index] + 
run_start = time.time() + retry_count = 0 + + while retry_count < MAX_NOTIFY_RETRY: + allocate_class = Allocate(timeline=0, device_requirement={}, checking=False, public_key=regkey) + deregister_response = await run_in_threadpool( + self.dendrite.query, axon, allocate_class, timeout=60 + ) + run_end = time.time() + time_eval = run_end - run_start + # bt.logging.info(f"API: Stop docker container in: {run_end - run_start:.2f} seconds") - if deregister_response and deregister_response["status"] is True: - bt.logging.info(f"API: Resource {hotkey} deallocated successfully") - break - else: - retry_count += 1 - bt.logging.info(f"API: Resource {hotkey} no response to deallocated signal - retry {retry_count}") - await asyncio.sleep(1) + if deregister_response and deregister_response["status"] is True: + bt.logging.info(f"API: Resource {hotkey} deallocated successfully") + break + else: + retry_count += 1 + bt.logging.info(f"API: Resource {hotkey} no response to deallocated signal - retry {retry_count}") + await asyncio.sleep(1) - if retry_count == MAX_NOTIFY_RETRY: - bt.logging.error(f"API: Resource {hotkey} deallocated successfully without response.") + if retry_count == MAX_NOTIFY_RETRY: + bt.logging.error(f"API: Resource {hotkey} deallocated successfully without response.") deallocated_at = datetime.now(timezone.utc) update_allocation_db(result_hotkey, info, False) @@ -1098,8 +1107,7 @@ async def unpause_docker(hotkey: str, uuid_key: str) -> JSONResponse: else: bt.logging.info(f"API: No allocation details found for the provided hotkey") - return JSON - Response( + return JSONResponse( status_code=status.HTTP_404_NOT_FOUND, content={ "success": False, @@ -1223,7 +1231,7 @@ async def exchange_docker_key(hotkey: str, uuid_key: str, ssh_key: str) -> JSONR finally: cursor.close() db.close() - + @self.app.post( "/list/allocations_sql", tags=["SQLite"], @@ -1297,6 +1305,12 @@ async def list_allocations() -> JSONResponse: ) entry.uuid_key = info["uuid"] entry.ssh_key = 
info["ssh_key"] + # check the online status in self.checking_allocated + entry.status = "online" + for item in self.checking_allocated: + if item.get("hotkey") == hotkey: + entry.status = "offline" + break allocation_list.append(entry) except Exception as e: @@ -1324,7 +1338,6 @@ async def list_allocations() -> JSONResponse: "data": jsonable_encoder(allocation_list), }, ) - @self.app.post( "/list/resources_sql", tags=["SQLite"], @@ -1492,6 +1505,7 @@ async def list_resources(query: ResourceQuery = None, resource.hard_disk = float(hard_disk) resource.allocate_status = allocate_status resource_list.append(resource) + resource_list = await map_axon_ip_to_resources(resource_list) except (KeyError, IndexError, TypeError, ValueError) as e: bt.logging.error(f"API: Error occurred while filtering resources: {e}") continue @@ -1549,7 +1563,21 @@ async def list_resources(query: ResourceQuery = None, "err_detail": "No resources found.", }, ) - + async def map_axon_ip_to_resources(resources): + """ + Map axon IPs to the list of resources based on hotkeys. 
+ """ + for resource in resources: + hotkey = resource.hotkey + if hotkey: + for axon in self.metagraph.axons: + if axon.hotkey == hotkey: + resource.ip = axon.ip + obj = IPWhois(resource.ip) + result = obj.lookup_rdap() + resource.geo = result.get("asn_country_code","Unknown") + break + return resources async def get_wandb_running_miners(): """ Get the running miners from wandb @@ -1571,27 +1599,32 @@ async def get_wandb_running_miners(): filter_rule, ) penalized_hotkeys = await run_in_threadpool( - self.wandb.get_penalized_hotkeys, [], False + self.wandb.get_penalized_hotkeys_checklist, [], False ) + # bt.logging.info(penalized_hotkeys) + for run in runs: run_config = run.config run_hotkey = run_config.get("hotkey") - running_hotkey.append(run_hotkey) specs = run_config.get("specs") configs = run_config.get("config") is_active = any(axon.hotkey == run_hotkey for axon in self.metagraph.axons) - if is_active: - bt.logging.info(f"DEBUG - This hotkey is active - {run_hotkey}") + #if is_active: + #bt.logging.info(f"DEBUG - This hotkey is active - {run_hotkey}") # check the signature + is_penalized = run_hotkey in penalized_hotkeys + if ( run_hotkey and configs - and run_hotkey not in penalized_hotkeys + and not is_penalized and is_active ): + # bt.logging.info(f"DEBUG - This hotkey is OK - {run_hotkey}") + running_hotkey.append(run_hotkey) if specs: specs_details[run_hotkey] = specs else: @@ -1693,15 +1726,19 @@ async def count_all_model(model: str , cpu_count: Optional[int] = None, ram_size flag = 0 if details : gpu_miner = details["gpu"] - gpu_name = str(gpu_miner["details"][0]["name"]).lower() + gpu_details = gpu_miner.get("details", []) + + # Check if details exist and is non-empty + if gpu_details and isinstance(gpu_details, list) and len(gpu_details) > 0: + gpu_name = str(gpu_details[0].get("name", "")).lower() if model.lower() == gpu_name: if cpu_count is not None: cpu_miner = details["cpu"] - if cpu_miner["count"] == cpu_count: + if cpu_miner.get("count") == 
cpu_count: flag += 1 elif ram_size is not None: - ram_miner = details["ram"] - ram = ram_miner["total"] / 1024.0 ** 3 + ram_miner = details.get("ram", {}) + ram = ram_miner.get("total", 0) / 1024.0 ** 3 if int(math.ceil(ram)) == int(ram_size): flag += 1 else: @@ -1761,18 +1798,30 @@ async def list_resources_wandb(query: ResourceQuery = None, specs_details,running_hotkey = await get_wandb_running_miners() + bt.logging.info(f"Number of running miners: {len(running_hotkey)}") + # Initialize a dictionary to keep track of GPU instances resource_list = [] gpu_instances = {} total_gpu_counts = {} # Get the allocated hotkeys from wandb - allocated_hotkeys = await run_in_threadpool(self.wandb.get_allocated_hotkeys, [], False) + allocated_hotkeys = await run_in_threadpool(self.wandb.get_allocated_hotkeys, TRUSTED_VALIDATORS_HOTKEYS, True) + bt.logging.info(f"Allocated hotkeys: {allocated_hotkeys}") + bt.logging.info(f"Number of allocated hotkeys: {len(allocated_hotkeys)}") + + db = ComputeDb() + + # penalized_hotkeys = await run_in_threadpool(self.get_penalized_hotkeys_checklist, valid_validator_hotkeys=[], flag=False) if specs_details: # Iterate through the miner specs details and print the table for hotkey, details in specs_details.items(): - if hotkey in running_hotkey: + + miner_older_than = self.miner_is_older_than(db, 48, hotkey) + miner_pog_ok = self.miner_pog_ok((db, 48, hotkey)) + + if hotkey in running_hotkey and miner_pog_ok: if details: # Check if details are not empty resource = Resource() try: @@ -1880,6 +1929,7 @@ async def list_resources_wandb(query: ResourceQuery = None, resource.hard_disk = float(hard_disk) resource.allocate_status = allocate_status resource_list.append(resource) + resource_list = await map_axon_ip_to_resources(resource_list) except (KeyError, IndexError, TypeError, ValueError) as e: bt.logging.error(f"API: Error occurred while filtering resources: {e}") continue @@ -1906,6 +1956,10 @@ async def list_resources_wandb(query: ResourceQuery = 
None, }, ) else: + bt.logging.info(f"Number of resources returned: {len(resource_list)}") + bt.logging.trace("Resource List Contents:") + for resource in resource_list: + bt.logging.trace(vars(resource)) if page_number: page_size = page_size if page_size else 50 result = self._paginate_list(resource_list, page_number, page_size) @@ -2415,9 +2469,9 @@ async def list_allocated_hotkeys() -> JSONResponse: if not validator_runs: bt.logging.info(f"API: No validator with allocated info in the project opencompute.") return JSONResponse( - status_code=status.HTTP_200_OK, + status_code=status.HTTP_404_NOT_FOUND, content={ - "success": True, + "success": False, "message": "No validator with allocated info in the project opencompute.", "data": {}, }, @@ -2701,34 +2755,55 @@ def _allocate_container_hotkey(self, requirements, hotkey, timeline, public_key, "type": requirements.gpu_type, }, "hard_disk": {"capacity": 1073741824}, "ram": {"capacity": 1073741824}} + # Start of allocation process + bt.logging.info(f"API: Starting container allocation with hotkey: {hotkey}") + + bt.logging.trace(f"Docker Requirement: {docker_requirement}") + # Instantiate the connection to the db for axon in self.metagraph.axons: if axon.hotkey == hotkey: - check_allocation = self.dendrite.query( - axon, - Allocate( - timeline=timeline, - device_requirement=device_requirement, - checking=True, - ), - timeout=60, - ) - if check_allocation and check_allocation["status"] is True: - register_response = self.dendrite.query( + attempt = 0 + # Retry allocation up to max_retries times + + while attempt < MAX_ALLOCATION_RETRY: + attempt += 1 + check_allocation = self.dendrite.query( axon, Allocate( timeline=timeline, device_requirement=device_requirement, - checking=False, - public_key=public_key, - docker_requirement=docker_requirement, + checking=True, ), - timeout=100, + timeout=30, ) - if register_response and register_response["status"] is True: - register_response["ip"] = axon.ip - 
register_response["hotkey"] = axon.hotkey - return register_response + + if not check_allocation or check_allocation.get("status") is not True: + bt.logging.warning(f"API: Allocation check failed for hotkey: {hotkey}") + continue # Move to the next axon if allocation check failed + + bt.logging.info(f"API: Allocation check passed for hotkey: {hotkey}") + + if check_allocation and check_allocation["status"] is True: + register_response = self.dendrite.query( + axon, + Allocate( + timeline=60, + device_requirement=device_requirement, + checking=False, + public_key=public_key, + docker_requirement=docker_requirement, + ), + timeout=60, + ) + if register_response and register_response["status"] is True: + register_response["ip"] = axon.ip + register_response["hotkey"] = axon.hotkey + return register_response + + # Log or print retry attempt (optional) + bt.logging.trace(f"Attempt {attempt} failed for hotkey {hotkey}, retrying...") if attempt < MAX_ALLOCATION_RETRY else None + time.sleep(10) # Sleep before the next retry attempt return {"status": False, "msg": "Requested resource is not available."} @@ -2856,40 +2931,43 @@ async def _check_allocation(self): for row in rows: id, hotkey, details = row info = json.loads(details) - - index = self.metagraph.hotkeys.index(hotkey) - axon = self.metagraph.axons[index] uuid_key = info.get("uuid") - register_response = await run_in_threadpool(self.dendrite.query, axon, + # Check if hotkey exists in self.metagraph.hotkeys and uuid_key is valid + if hotkey in self.metagraph.hotkeys and uuid_key: + index = self.metagraph.hotkeys.index(hotkey) + axon = self.metagraph.axons[index] + + register_response = await run_in_threadpool(self.dendrite.query, axon, Allocate(timeline=1, checking=True, ), timeout=60) - if register_response and register_response["status"] is False: + if register_response and register_response["status"] is False: + + if hotkey in self.checking_allocated: + response = await self._notify_allocation_status( + 
event_time=deallocated_at, + hotkey=hotkey, + uuid=uuid_key, + event="ONLINE", + details=f"GPU Resume for {ALLOCATE_CHECK_PERIOD} seconds" + ) + bt.logging.info(f"API: Allocation ONLINE notification for hotkey: {hotkey}") + self.checking_allocated = [x for x in self.checking_allocated if x != hotkey] - if hotkey in self.checking_allocated: + else: + # handle the case when no response is received or the docker is not running + self.checking_allocated.append(hotkey) + # bt.logging.info(f"API: No response timeout is triggered for hotkey: {hotkey}") + deallocated_at = datetime.now(timezone.utc) response = await self._notify_allocation_status( event_time=deallocated_at, hotkey=hotkey, uuid=uuid_key, - event="ONLINE", - details=f"GPU Resume for {ALLOCATE_CHECK_PERIOD} seconds" + event="OFFLINE", + details=f"No response timeout for {ALLOCATE_CHECK_PERIOD} seconds" ) - self.checking_allocated = [x for x in self.checking_allocated if x != hotkey] - - # bt.logging.info(f"API: Allocation is still running for hotkey: {hotkey}") - else: - # handle the case when no response is received or the docker is not running - self.checking_allocated.append(hotkey) - # bt.logging.info(f"API: No response timeout is triggered for hotkey: {hotkey}") - deallocated_at = datetime.now(timezone.utc) - response = await self._notify_allocation_status( - event_time=deallocated_at, - hotkey=hotkey, - uuid=uuid_key, - event="OFFLINE", - details=f"No response timeout for {ALLOCATE_CHECK_PERIOD} seconds" - ) - if not response: - pass + bt.logging.info(f"API: Allocation OFFLINE notification for hotkey: {hotkey}") + if not response: + pass if self.checking_allocated.count(hotkey) >= ALLOCATE_CHECK_COUNT: deallocated_at = datetime.now(timezone.utc) @@ -2965,6 +3043,30 @@ def check_port_open(host, port, hotkey): bt.logging.warning(f"API: Could not determine status of port {port} on {host} for {hotkey}") return False + def miner_is_older_than(self, db: ComputeDb, hours: int, ss58_address: str) -> bool: + 
cursor = db.get_cursor() + try: + cursor.execute("SELECT MIN(created_at) FROM challenge_details WHERE ss58_address = ?", (ss58_address,)) + oldest_timestamp = cursor.fetchone()[0] + if oldest_timestamp: + if (datetime.now() - datetime.fromisoformat(oldest_timestamp)).total_seconds() <= hours * 3600: + print(f"Hotkey not old enough: {ss58_address}") + return False + return True + return False + except Exception as e: + bt.logging.info(f"Error occurred: {e}") + return False + finally: + cursor.close() + + def miner_pog_ok(self, db: ComputeDb, hours: int, ss58_address: str) -> bool: + gpu_specs = get_pog_specs(self.db, ss58_address) + if gpu_specs is not None: + return True + else: + return False + def run(self): """ Run the FastAPI app.
@@ -3005,4 +3107,4 @@ def stop(self): if __name__ == "__main__": os.environ["WANDB_SILENT"] = "true" register_app = RegisterAPI() - register_app.run() + register_app.run() \ No newline at end of file diff --git a/neurons/validator.py b/neurons/validator.py index 37d2199..2057569 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -25,34 +25,34 @@ import threading import traceback import hashlib +import numpy as np +import yaml +import multiprocessing from asyncio import AbstractEventLoop from typing import Dict, Tuple, List import bittensor as bt import math import time +import paramiko -from compute.utils.socket import check_port import cryptography import torch from cryptography.fernet import Fernet -from torch._C._te import Tensor +from torch._C._te import Tensor # type: ignore import RSAEncryption as rsa -from neurons.Validator.script import check_ssh_login +from neurons.Validator.database.pog import get_pog_specs, update_pog_stats +import concurrent.futures +from collections import defaultdict import Validator.app_generator as ag -from Validator.pow import gen_hash, run_validator_pow from compute import ( - pow_min_difficulty, - pow_max_difficulty, - pow_timeout, SUSPECTED_EXPLOITERS_HOTKEYS, SUSPECTED_EXPLOITERS_COLDKEYS, __version_as_int__, validator_permit_stake, - weights_rate_limit, - specs_timeout, -) + weights_rate_limit + ) from compute.axon import ComputeSubnetSubtensor from compute.protocol import Allocate, Challenge, Specs from compute.utils.db import ComputeDb @@ -61,10 +61,11 @@ from compute.utils.subtensor import is_registered, get_current_block, calculate_next_block_time from compute.utils.version import try_update, get_local_version, version2number, get_remote_version from compute.wandb.wandb import ComputeWandb -from neurons.Validator.calculate_pow_score import calc_score +from neurons.Validator.calculate_pow_score import calc_score_pog from neurons.Validator.database.allocate import update_miner_details, 
select_has_docker_miners_hotkey, get_miner_details from neurons.Validator.database.challenge import select_challenge_stats, update_challenge_details from neurons.Validator.database.miner import select_miners, purge_miner_entries, update_miners +from neurons.Validator.pog import adjust_matrix_size, compute_script_hash, execute_script_on_miner, get_random_seeds, load_yaml_config, parse_merkle_output, receive_responses, send_challenge_indices, send_script_and_request_hash, parse_benchmark_output, identify_gpu, send_seeds, verify_merkle_proof_row, get_remote_gpu_info, verify_responses class Validator: @@ -103,7 +104,7 @@ def dendrite(self) -> bt.dendrite: return self._dendrite @property - def metagraph(self) -> bt.metagraph: + def metagraph(self) -> bt.metagraph: # type: ignore return self._metagraph @property @@ -181,6 +182,17 @@ def __init__(self): # Initialize wandb self.wandb = ComputeWandb(self.config, self.wallet, os.path.basename(__file__)) + # STEP 2B: Init Proof of GPU + # Load configuration from YAML + config_file = "config.yaml" + self.config_data = load_yaml_config(config_file) + cpu_cores = os.cpu_count() or 1 + configured_max_workers = self.config_data["merkle_proof"].get("max_workers", 32) + safe_max_workers = min((cpu_cores + 4)*4, configured_max_workers) + self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=safe_max_workers) + self.results = {} + self.gpu_task = None # Track the GPU task + # Step 3: Set up initial scoring weights for validation bt.logging.info("Building validation weights.") self.uids: list = self.metagraph.uids.tolist() @@ -196,6 +208,9 @@ def __init__(self): # Initialize penalized_hotkeys as an empty list self.penalized_hotkeys = [] + # Initialize penalized_hotkeys_checklist as an empty list + self.penalized_hotkeys_checklist = [] + # Init the thread. 
self.lock = threading.Lock() self.threads: List[threading.Thread] = [] @@ -275,14 +290,39 @@ def pretty_print_dict_values(items: dict): bt.logging.trace(log) + def update_allocation_wandb(self): + hotkey_list = [] + # Instantiate the connection to the db + cursor = self.db.get_cursor() + try: + # Retrieve all records from the allocation table + cursor.execute("SELECT id, hotkey, details FROM allocation") + rows = cursor.fetchall() + for row in rows: + id, hotkey, details = row + hotkey_list.append(hotkey) + except Exception as e: + bt.logging.info(f"An error occurred while retrieving allocation details: {e}") + finally: + cursor.close() + + # Update wandb + try: + self.wandb.update_allocated_hotkeys(hotkey_list) + except Exception as e: + bt.logging.info(f"Error updating wandb : {e}") + def sync_scores(self): # Fetch scoring stats self.stats = select_challenge_stats(self.db) valid_validator_hotkeys = self.get_valid_validator_hotkeys() + self.update_allocation_wandb() + # Fetch allocated hotkeys self.allocated_hotkeys = self.wandb.get_allocated_hotkeys(valid_validator_hotkeys, True) + # bt.logging.info(f"Allocated hotkeys: {self.allocated_hotkeys}") # Fetch penalized hotkeys self.penalized_hotkeys = self.wandb.get_penalized_hotkeys(valid_validator_hotkeys, True) @@ -310,14 +350,20 @@ def sync_scores(self): else: self.stats[uid]["has_docker"] = False - score = calc_score(self.stats[uid], hotkey, self.allocated_hotkeys, self.penalized_hotkeys, valid_validator_hotkeys) + gpu_specs = get_pog_specs(self.db, hotkey) + + if gpu_specs is not None: + score = calc_score_pog(gpu_specs, hotkey, self.allocated_hotkeys, self.config_data) + else: + score = 0 + self.stats[uid]["score"] = score except KeyError as e: - # bt.logging.info(f"KeyError occurred for UID {uid}: {str(e)}") + bt.logging.trace(f"KeyError occurred for UID {uid}: {str(e)}") score = 0 except Exception as e: - # bt.logging.info(f"An unexpected exception occurred for UID {uid}: {str(e)}") + bt.logging.trace(f"An 
unexpected exception occurred for UID {uid}: {str(e)}") score = 0 self.scores[uid] = score @@ -354,48 +400,6 @@ def sync_status(self): current_version = __version_as_int__ if subnet_prometheus_version != current_version: self.init_prometheus(force_update=True) - def remove_duplicate_penalized_hotkeys(self): - """ - Removes any duplicate entries in the penalized_hotkeys_checklist - based on the 'hotkey' field. - """ - seen = set() - unique_penalized_list = [] - - for item in self.penalized_hotkeys_checklist: - if item['hotkey'] not in seen: - unique_penalized_list.append(item) - seen.add(item['hotkey']) - - self.penalized_hotkeys_checklist = unique_penalized_list - bt.logging.info("Removed duplicate hotkeys from penalized_hotkeys_checklist.") - - def sync_checklist(self): - self.threads = [] - self.penalized_hotkeys_checklist = self.wandb.get_penalized_hotkeys_checklist(self.get_valid_validator_hotkeys(), True) - for i in range(0, len(self.uids), self.validator_challenge_batch_size): - for _uid in self.uids[i : i + self.validator_challenge_batch_size]: - try: - axon = self._queryable_uids[_uid] - self.threads.append( - threading.Thread( - target=self.execute_miner_checking_request, - args=(_uid, axon), - name=f"th_execute_miner_checking_request-{_uid}", - daemon=True, - ) - ) - except KeyError: - continue - self.remove_duplicate_penalized_hotkeys() - - for thread in self.threads: - thread.start() - - for thread in self.threads: - thread.join() - - self.wandb.update_penalized_hotkeys_checklist(self.penalized_hotkeys_checklist) def sync_miners_info(self, queryable_tuple_uids_axons: List[Tuple[int, bt.AxonInfo]]): if queryable_tuple_uids_axons: @@ -412,32 +416,6 @@ def sync_miners_info(self, queryable_tuple_uids_axons: List[Tuple[int, bt.AxonIn else: bt.logging.warning(f"❌ No queryable miners.") - def calc_difficulty(self, uid): - difficulty = pow_min_difficulty - try: - stat = self.stats[uid] - current_difficulty = 
math.ceil(force_to_float_or_default(stat.get("last_20_difficulty_avg"), default=pow_min_difficulty)) - last_20_challenge_failed = force_to_float_or_default(stat.get("last_20_challenge_failed")) - challenge_successes = force_to_float_or_default(stat.get("challenge_successes")) - - # Adjust difficulty based on failure rates with more nuanced increments - if challenge_successes > 4: # Adjusts the threshold from 20 to 4 for faster response - failure_rate = last_20_challenge_failed / 20 - if failure_rate < 0.1: - difficulty = min(current_difficulty + 2, pow_max_difficulty) - elif failure_rate < 0.2: - difficulty = min(current_difficulty + 1, pow_max_difficulty) - elif failure_rate > 0.25: - difficulty = max(current_difficulty - 1, pow_min_difficulty) - else: - difficulty = current_difficulty - except KeyError: - pass - except Exception as e: - bt.logging.error(f"{e} => difficulty minimal: {pow_min_difficulty} attributed for {uid}") - - return max(difficulty, pow_min_difficulty) - @staticmethod def filter_axons(queryable_tuple_uids_axons: List[Tuple[int, bt.AxonInfo]]): """Filter the axons with uids_list, remove those with the same IP address.""" @@ -495,7 +473,6 @@ def is_blacklisted(self, neuron: bt.NeuronInfoLite): # Add the coldkey attached to this hotkey in the blacklisted coldkeys self.exploiters_hotkeys.add(coldkey) return True - return False def get_valid_tensors(self, metagraph): @@ -546,179 +523,465 @@ def get_valid_validator_hotkeys(self): valid_hotkeys.append(hotkey) return valid_hotkeys - def execute_pow_request(self, uid, axon: bt.AxonInfo, _hash, _salt, mode, chars, mask, difficulty): - dendrite = bt.dendrite(wallet=self.wallet) - start_time = time.time() - bt.logging.info(f"Querying for {Challenge.__name__} - {uid}/{axon.hotkey}/{_hash}/{difficulty}") - response = dendrite.query( - axon, - Challenge( - challenge_hash=_hash, - challenge_salt=_salt, - challenge_mode=mode, - challenge_chars=chars, - challenge_mask=mask, - challenge_difficulty=difficulty, - 
), - timeout=pow_timeout, - ) - elapsed_time = time.time() - start_time - response_password = response.get("password", "") - hashed_response = gen_hash(response_password, _salt)[0] if response_password else "" - success = True if _hash == hashed_response else False - result_data = { - "ss58_address": axon.hotkey, - "success": success, - "elapsed_time": elapsed_time, - "difficulty": difficulty, - } - with self.lock: - self.pow_responses[uid] = response - self.new_pow_benchmark[uid] = result_data - - def execute_miner_checking_request(self, uid, axon: bt.AxonInfo): - dendrite = bt.dendrite(wallet=self.wallet) - bt.logging.info(f"Querying for {Allocate.__name__} - {uid}/{axon.hotkey}") - - response = dendrite.query(axon, Allocate(timeline=1, checking=True), timeout=30) - port = response.get("port", "") - status = response.get("status", False) - checklist_hotkeys = [item['hotkey'] for item in self.penalized_hotkeys_checklist] - if port: - if not status: - is_port_open = check_port(axon.ip, port) - if (axon.hotkey not in checklist_hotkeys )and (not is_port_open): - self.penalized_hotkeys_checklist.append({"hotkey": axon.hotkey, "status_code": "PORT_CLOSED", "description": "The port of ssh server is closed"}) - bt.logging.info( - f"Debug {Allocate.__name__} - status of Checking allocation - {status} {uid} - Port is closed and not usable, even though it has been allocated." - ) - else: - bt.logging.info(f"Debug {Allocate.__name__} - status of Checking allocation - {status} {uid} - Port is open and the miner is allocated") + def get_specs_wandb(self): + """ + Retrieves hardware specifications from Wandb, updates the miner_details table, + and checks for differences in GPU specs, logging changes only for allocated hotkeys. 
+ """ + bt.logging.info(f"💻 Hardware list of uids queried (Wandb): {list(self._queryable_uids.keys())}") + + # Retrieve specs from Wandb + specs_dict = self.wandb.get_miner_specs(self._queryable_uids) + + # Fetch current specs from miner_details using the existing function + current_miner_details = get_miner_details(self.db) + + # Compare and detect GPU spec changes for allocated hotkeys + for hotkey, new_specs in specs_dict.values(): + if hotkey in self.allocated_hotkeys: # Check if hotkey is allocated + current_specs = current_miner_details.get(hotkey, {}) + current_gpu_specs = current_specs.get("gpu", {}) + new_gpu_specs = new_specs.get("gpu", {}) + + # Extract the count values + current_count = current_gpu_specs.get("count", 0) + new_count = new_gpu_specs.get("count", 0) + + # Initialize names to None by default + current_name = None + new_name = None + + # Retrieve the current name if details are present and non-empty + current_details = current_gpu_specs.get("details", []) + if isinstance(current_details, list) and len(current_details) > 0: + current_name = current_details[0].get("name") + + # Retrieve the new name if details are present and non-empty + new_details = new_gpu_specs.get("details", []) + if isinstance(new_details, list) and len(new_details) > 0: + new_name = new_details[0].get("name") + + # Compare only count and name + if current_count != new_count or current_name != new_name: + axon = None + for uid, axon_info in self._queryable_uids.items(): + if axon_info.hotkey == hotkey: + axon = axon_info + break + + if axon: + bt.logging.info(f"GPU specs changed for allocated hotkey {hotkey}:") + bt.logging.info(f"Old count: {current_count}, Old name: {current_name}") + bt.logging.info(f"New count: {new_count}, New name: {new_name}") + self.deallocate_miner(axon, None) + + # Update the local db with the new data from Wandb + update_miner_details(self.db, list(specs_dict.keys()), list(specs_dict.values())) + + # Log the hotkey and specs + 
bt.logging.info(f"✅ GPU specs per hotkey (Wandb):") + for hotkey, specs in specs_dict.values(): + gpu_info = specs.get("gpu", {}) + gpu_details = gpu_info.get("details", []) + if gpu_details: + gpu_name = gpu_details[0].get("name", "Unknown GPU") + gpu_count = gpu_info.get("count", 1) # Assuming 'count' reflects the number of GPUs + bt.logging.info(f"{hotkey}: {gpu_name} x {gpu_count}") else: - is_ssh_access = True - allocation_status = False - private_key, public_key = rsa.generate_key_pair() - device_requirement = {"cpu": {"count": 1}, "gpu": {}, "hard_disk": {"capacity": 1073741824}, "ram": {"capacity": 1073741824}} - try: - response = dendrite.query(axon, Allocate(timeline=1, device_requirement=device_requirement, checking=False, public_key=public_key), timeout=60) - if response and response["status"] is True: - allocation_status = True - bt.logging.info(f"Debug {Allocate.__name__} - Successfully Allocated - {uid}") - private_key = private_key.encode("utf-8") - decrypted_info_str = rsa.decrypt_data(private_key, base64.b64decode(response["info"])) - info = json.loads(decrypted_info_str) - is_ssh_access = check_ssh_login(axon.ip, port, info['username'], info['password']) - except Exception as e: - bt.logging.error(f"{e}") - while True and allocation_status: - deregister_response = dendrite.query(axon, Allocate(timeline=0, checking=False, public_key=public_key), timeout=60) - if deregister_response and deregister_response["status"] is True: - bt.logging.info(f"Debug {Allocate.__name__} - Deallocated - {uid}") - break - else: - bt.logging.error(f"Debug {Allocate.__name__} - Failed to deallocate - {uid} will retry in 5 seconds") - time.sleep(5) - if axon.hotkey in checklist_hotkeys: - self.penalized_hotkeys_checklist = [item for item in self.penalized_hotkeys_checklist if item['hotkey'] != axon.hotkey] - if not is_ssh_access: - bt.logging.info(f"Debug {Allocate.__name__} - status of Checking allocation - {status} {uid} - SSH access is disabled") - 
self.penalized_hotkeys_checklist.append({"hotkey": axon.hotkey, "status_code": "SSH_ACCESS_DISABLED", "description": "It can not access to the server via ssh"}) - - def execute_specs_request(self): - if len(self.queryable_for_specs) > 0: - return - else: - # Miners to query this block - self.queryable_for_specs = self.queryable.copy() - - bt.logging.info(f"💻 Initialisation of the {Specs.__name__} queries...") - # # Prepare app_data for benchmarking - # # Generate secret key for app - secret_key = Fernet.generate_key() - cipher_suite = Fernet(secret_key) - # # Compile the script and generate an exe. - ag.run(secret_key) + bt.logging.info(f"{hotkey}: No GPU details available") + + self.finalized_specs_once = True + + async def proof_of_gpu(self): + """ + Perform Proof-of-GPU benchmarking on allocated miners without overlapping tests. + Uses asyncio with ThreadPoolExecutor to test miners in parallel. + """ try: - main_dir = os.path.dirname(os.path.abspath(__file__)) - file_name = os.path.join(main_dir, "Validator/dist/script") - # Read the exe file and save it to app_data. 
- with open(file_name, "rb") as file: - # Read the entire content of the EXE file - app_data = file.read() - except Exception as e: - bt.logging.error(f"{e}") - return - - results = {} - while len(self.queryable_for_specs) > 0: - uids = list(self.queryable_for_specs.keys()) - queryable_for_specs_uids = random.sample(uids, self.validator_specs_batch_size) if len(uids) > self.validator_specs_batch_size else uids - queryable_for_specs_uid = [] - queryable_for_specs_axon = [] - queryable_for_specs_hotkey = [] - - for uid, axon in self.queryable_for_specs.items(): - if uid in queryable_for_specs_uids: - queryable_for_specs_uid.append(uid) - queryable_for_specs_axon.append(axon) - queryable_for_specs_hotkey.append(axon.hotkey) - - for uid in queryable_for_specs_uids: - del self.queryable_for_specs[uid] + self._queryable_uids = self.get_queryable() + + # Settings + merkle_proof = self.config_data["merkle_proof"] + retry_limit = merkle_proof.get("pog_retry_limit",30) + retry_interval = merkle_proof.get("pog_retry_interval",75) + num_workers = merkle_proof.get("max_workers",32) + max_delay = merkle_proof.get("max_random_delay",1200) + + # Random delay for PoG + delay = random.uniform(0, max_delay) # Random delay + bt.logging.info(f"💻⏳ Scheduled Proof-of-GPU task to start in {delay:.2f} seconds.") + await asyncio.sleep(delay) + + bt.logging.info(f"💻 Starting Proof-of-GPU benchmarking for uids: {list(self._queryable_uids.keys())}") + # Shared dictionary to store results + self.results = {} + # Dictionary to track retry counts + retry_counts = defaultdict(int) + # Queue of miners to process + queue = asyncio.Queue() + + # Initialize the queue with initial miners + for i in range(0, len(self.uids), self.validator_challenge_batch_size): + for _uid in self.uids[i : i + self.validator_challenge_batch_size]: + try: + axon = self._queryable_uids[_uid] + if axon.hotkey in self.allocated_hotkeys: + bt.logging.info(f"Skipping allocated miner: {axon.hotkey}") + continue # skip this 
miner since it's allocated + await queue.put(axon) + except KeyError: + continue - try: - # Query the miners for benchmarking - bt.logging.info(f"💻 Hardware list of uids queried: {queryable_for_specs_uid}") - responses = self.dendrite.query(queryable_for_specs_axon, Specs(specs_input=repr(app_data)), timeout=specs_timeout) + # Initialize a single Lock for thread-safe updates to results + results_lock = asyncio.Lock() - # Format responses and save them to benchmark_responses - for index, response in enumerate(responses): + async def worker(): + while True: try: - if response: - binary_data = ast.literal_eval(response) # Convert str to binary data - decrypted = cipher_suite.decrypt(binary_data) # Decrypt str to binary data - decoded_data = json.loads(decrypted.decode()) # Convert data to object - results[queryable_for_specs_uid[index]] = (queryable_for_specs_hotkey[index], decoded_data) + axon = await queue.get() + except asyncio.CancelledError: + break + hotkey = axon.hotkey + try: + result = await asyncio.get_event_loop().run_in_executor( + self.executor, self.test_miner_gpu, axon, self.config_data + ) + if result[1] is not None and result[2] > 0: + async with results_lock: + self.results[hotkey] = { + "gpu_name": result[1], + "num_gpus": result[2] + } + update_pog_stats(self.db, hotkey, result[1], result[2]) + else: + raise RuntimeError("GPU test failed") + except Exception as e: + bt.logging.trace(f"Exception in worker for {hotkey}: {e}") + retry_counts[hotkey] += 1 + if retry_counts[hotkey] < retry_limit: + bt.logging.info(f"🔄 {hotkey}: Retrying miner -> (Attempt {retry_counts[hotkey]})") + await asyncio.sleep(retry_interval) + await queue.put(axon) else: - results[queryable_for_specs_uid[index]] = (queryable_for_specs_hotkey[index], {}) - except cryptography.fernet.InvalidToken: - bt.logging.warning(f"{queryable_for_specs_hotkey[index]} - InvalidToken") - results[queryable_for_specs_uid[index]] = (queryable_for_specs_hotkey[index], {}) - except Exception as _: 
- traceback.print_exc() - results[queryable_for_specs_uid[index]] = (queryable_for_specs_hotkey[index], {}) + bt.logging.info(f"❌ {hotkey}: Miner failed after {retry_limit} attempts.") + update_pog_stats(self.db, hotkey, None, None) + finally: + queue.task_done() - except Exception as e: - traceback.print_exc() - update_miner_details(self.db, list(results.keys()), list(results.values())) - bt.logging.info(f"✅ Hardware list responses:") + # Number of concurrent workers + # Determine a safe default number of workers + cpu_cores = os.cpu_count() or 1 + safe_max_workers = min((cpu_cores + 4)*4, num_workers) + + workers = [asyncio.create_task(worker()) for _ in range(safe_max_workers)] + bt.logging.trace(f"Started {safe_max_workers} worker tasks for Proof-of-GPU benchmarking.") - # Hardware list response hotfix 1.3.11 - db = ComputeDb() - hardware_details = get_miner_details(db) - for hotkey, specs in hardware_details.items(): - bt.logging.info(f"{hotkey} - {specs}") + # Wait until the queue is fully processed + await queue.join() + + # Cancel worker tasks + for w in workers: + w.cancel() + # Wait until all worker tasks are cancelled + await asyncio.gather(*workers, return_exceptions=True) + + bt.logging.success(f"✅ Proof-of-GPU benchmarking completed.") + return self.results + except Exception as e: + bt.logging.info(f"❌ Exception in proof_of_gpu: {e}\n{traceback.format_exc()}") + + def on_gpu_task_done(self, task): + try: + results = task.result() + bt.logging.trace(f"Proof-of-GPU Results: {results}") + self.gpu_task = None # Reset the task reference + self.sync_scores() + + except Exception as e: + bt.logging.error(f"Proof-of-GPU task failed: {e}") + self.gpu_task = None + + def test_miner_gpu(self, axon, config_data): """ - for hotkey, specs in results.values(): - bt.logging.info(f"{hotkey} - {specs}") + Allocate, test, and deallocate a single miner. 
+ + :return: Tuple of (miner_hotkey, gpu_name, num_gpus) """ - self.finalized_specs_once = True + allocation_status = False + miner_info = None + host = None # Initialize host variable + hotkey = axon.hotkey + bt.logging.trace(f"{hotkey}: Starting miner test.") - def get_specs_wandb(self): + try: + # Step 0: Init + gpu_data = config_data["gpu_performance"] + gpu_tolerance_pairs = gpu_data.get("gpu_tolerance_pairs", {}) + # Extract Merkle Proof Settings + merkle_proof = config_data["merkle_proof"] + time_tol = merkle_proof.get("time_tolerance",5) + # Extract miner_script path + miner_script_path = merkle_proof["miner_script_path"] + + # Step 1: Allocate Miner + # Generate RSA key pair + private_key, public_key = rsa.generate_key_pair() + allocation_response = self.allocate_miner(axon, private_key, public_key) + if not allocation_response: + bt.logging.info(f"🌀 {hotkey}: Busy or not allocatable.") + return (hotkey, None, 0) + allocation_status = True + miner_info = allocation_response + host = miner_info['host'] + bt.logging.trace(f"{hotkey}: Allocated Miner for testing.") + + # Step 2: Connect via SSH + ssh_client = paramiko.SSHClient() + ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + bt.logging.trace(f"{hotkey}: Connect to Miner via SSH.") + ssh_client.connect(host, port=miner_info.get('port', 22), username=miner_info['username'], password=miner_info['password'], timeout=10) + if not (ssh_client): + ssh_client.close() + bt.logging.info(f"{hotkey}: SSH connection failed.") + return (hotkey, None, -1) + bt.logging.trace(f"{hotkey}: Connected to Miner via SSH.") + + # Step 3: Hash Check + local_hash = compute_script_hash(miner_script_path) + bt.logging.trace(f"{hotkey}: [Step 1] Local script hash computed successfully.") + bt.logging.trace(f"{hotkey}: Local Hash: {local_hash}") + remote_hash = send_script_and_request_hash(ssh_client, miner_script_path) + if local_hash != remote_hash: + bt.logging.info(f"{hotkey}: [Integrity Check] FAILURE: Hash 
mismatch detected.") + raise ValueError(f"{hotkey}: Script integrity verification failed.") + + # Step 4: Get GPU info NVIDIA from the remote miner + bt.logging.trace(f"{hotkey}: [Step 4] Retrieving GPU information (NVIDIA driver) from miner...") + gpu_info = get_remote_gpu_info(ssh_client) + num_gpus_reported = gpu_info["num_gpus"] + gpu_name_reported = gpu_info["gpu_names"][0] if num_gpus_reported > 0 else None + bt.logging.trace(f"{hotkey}: [Step 4] Reported GPU Information:") + if num_gpus_reported > 0: + bt.logging.trace(f"{hotkey}: Number of GPUs: {num_gpus_reported}") + bt.logging.trace(f"{hotkey}: GPU Type: {gpu_name_reported}") + if num_gpus_reported <= 0: + bt.logging.info(f"{hotkey}: No GPUs detected.") + raise ValueError("No GPUs detected.") + + # Step 5: Run the benchmarking mode + bt.logging.info(f"💻 {hotkey}: Executing benchmarking mode.") + bt.logging.trace(f"{hotkey}: [Step 5] Executing benchmarking mode on the miner...") + execution_output = execute_script_on_miner(ssh_client, mode='benchmark') + bt.logging.trace(f"{hotkey}: [Step 5] Benchmarking completed.") + # Parse the execution output + num_gpus, vram, size_fp16, time_fp16, size_fp32, time_fp32 = parse_benchmark_output(execution_output) + bt.logging.trace(f"{hotkey}: [Benchmark Results] Detected {num_gpus} GPU(s) with {vram} GB unfractured VRAM.") + bt.logging.trace(f"{hotkey}: FP16 - Matrix Size: {size_fp16}, Execution Time: {time_fp16} s") + bt.logging.trace(f"{hotkey}: FP32 - Matrix Size: {size_fp32}, Execution Time: {time_fp32} s") + # Calculate performance metrics + fp16_tflops = (2 * size_fp16 ** 3) / time_fp16 / 1e12 + fp32_tflops = (2 * size_fp32 ** 3) / time_fp32 / 1e12 + bt.logging.trace(f"{hotkey}: [Performance Metrics] Calculated TFLOPS:") + bt.logging.trace(f"{hotkey}: FP16: {fp16_tflops:.2f} TFLOPS") + bt.logging.trace(f"{hotkey}: FP32: {fp32_tflops:.2f} TFLOPS") + gpu_name = identify_gpu(fp16_tflops, fp32_tflops, vram, gpu_data, gpu_name_reported, gpu_tolerance_pairs) + 
bt.logging.trace(f"{hotkey}: [GPU Identification] Based on performance: {gpu_name}") + + # Step 6: Run the Merkle proof mode + bt.logging.trace(f"{hotkey}: [Step 6] Initiating Merkle Proof Mode.") + # Step 1: Send seeds and execute compute mode + n = adjust_matrix_size(vram, element_size=4, buffer_factor=0.10) + seeds = get_random_seeds(num_gpus) + send_seeds(ssh_client, seeds, n) + bt.logging.trace(f"{hotkey}: [Step 6] Compute mode executed on miner - Matrix Size: {n}") + start_time = time.time() + execution_output = execute_script_on_miner(ssh_client, mode='compute') + end_time = time.time() + elapsed_time = end_time - start_time + bt.logging.trace(f"{hotkey}: Compute mode execution time: {elapsed_time:.2f} seconds.") + # Parse the execution output + root_hashes_list, gpu_timings_list = parse_merkle_output(execution_output) + bt.logging.trace(f"{hotkey}: [Merkle Proof] Root hashes received from GPUs:") + for gpu_id, root_hash in root_hashes_list: + bt.logging.trace(f"{hotkey}: GPU {gpu_id}: {root_hash}") + + # Calculate total times + total_multiplication_time = 0.0 + total_merkle_tree_time = 0.0 + num_gpus = len(gpu_timings_list) + for _, timing in gpu_timings_list: + total_multiplication_time += timing.get('multiplication_time', 0.0) + total_merkle_tree_time += timing.get('merkle_tree_time', 0.0) + average_multiplication_time = total_multiplication_time / num_gpus if num_gpus > 0 else 0.0 + average_merkle_tree_time = total_merkle_tree_time / num_gpus if num_gpus > 0 else 0.0 + bt.logging.trace(f"{hotkey}: Average Matrix Multiplication Time: {average_multiplication_time:.4f} seconds") + bt.logging.trace(f"{hotkey}: Average Merkle Tree Time: {average_merkle_tree_time:.4f} seconds") + + timing_passed = False + if elapsed_time < time_tol + num_gpus * time_fp32 and average_multiplication_time < time_fp32: + timing_passed = True + + # Step 7: Verify merkle proof + root_hashes = {gpu_id: root_hash for gpu_id, root_hash in root_hashes_list} + gpu_timings = {gpu_id: 
timing for gpu_id, timing in gpu_timings_list} + n = gpu_timings[0]['n'] # Assuming same n for all GPUs + indices = {} + num_indices = 1 + for gpu_id in range(num_gpus): + indices[gpu_id] = [(np.random.randint(0, n), np.random.randint(0, n)) for _ in range(num_indices)] + send_challenge_indices(ssh_client, indices) + execution_output = execute_script_on_miner(ssh_client, mode='proof') + bt.logging.trace(f"{hotkey}: [Merkle Proof] Proof mode executed on miner.") + responses = receive_responses(ssh_client, num_gpus) + bt.logging.trace(f"{hotkey}: [Merkle Proof] Responses received from miner.") + + verification_passed = verify_responses(seeds, root_hashes, responses, indices, n) + if verification_passed and timing_passed: + bt.logging.info(f"✅ {hotkey}: GPU Identification: Detected {num_gpus} x {gpu_name} GPU(s)") + return (hotkey, gpu_name, num_gpus) + else: + bt.logging.info(f"⚠️ {hotkey}: GPU Identification: Aborted due to verification failure") + return (hotkey, None, 0) - bt.logging.info(f"💻 Hardware list of uids queried (Wandb): {list(self._queryable_uids.keys())}") + except Exception as e: + bt.logging.info(f"❌ {hotkey}: Error testing Miner: {e}") + return (hotkey, None, 0) - specs_dict = self.wandb.get_miner_specs(self._queryable_uids) - # Update the local db with the data from wandb - update_miner_details(self.db, list(specs_dict.keys()), list(specs_dict.values())) + finally: + if allocation_status and miner_info: + self.deallocate_miner(axon, public_key) - # Log the hotkey and specs - bt.logging.info(f"✅ Hardware list responses:") - for hotkey, specs in specs_dict.values(): - bt.logging.info(f"{hotkey} - {specs}") + def allocate_miner(self, axon, private_key, public_key): + """ + Allocate a miner by querying the allocator. - self.finalized_specs_once = True + :param uid: Unique identifier for the axon. + :param axon: Axon object containing miner details. + :return: Dictionary with miner details if successful, None otherwise. 
+ """ + try: + dendrite = bt.dendrite(wallet=self.wallet) + + # Define device requirements (customize as needed) + device_requirement = {"cpu": {"count": 1}, "gpu": {}, "hard_disk": {"capacity": 1073741824}, "ram": {"capacity": 1073741824}} + device_requirement["gpu"] = {"count": 1, "capacity": 0, "type": ""} + + docker_requirement = { + "base_image": "pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime", + } + + # Simulate an allocation query with Allocate + check_allocation = dendrite.query( + axon, + Allocate(timeline=1, device_requirement=device_requirement, checking=True), + timeout=30, + ) + if check_allocation and check_allocation["status"] is True: + response = dendrite.query( + axon, + Allocate( + timeline=1, + device_requirement=device_requirement, + checking=False, + public_key=public_key, + docker_requirement=docker_requirement, + ), + timeout=30, + ) + if response and response.get("status") is True: + bt.logging.trace(f"Successfully allocated miner {axon.hotkey}") + decrypted_info_str = rsa.decrypt_data( + private_key.encode("utf-8"), + base64.b64decode(response["info"]), + ) + info = json.loads(decrypted_info_str) + + miner_info = { + 'host': axon.ip, + 'port': info['port'], + 'username': info['username'], + 'password': info['password'], + } + return miner_info + else: + bt.logging.trace(f"{axon.hotkey}: Miner allocation failed or no response received.") + return None + else: + bt.logging.trace(f"{axon.hotkey}: Miner already allocated or no response received.") + return None + + except Exception as e: + bt.logging.trace(f"{axon.hotkey}: Exception during miner allocation: {e}") + return None + + def deallocate_miner(self, axon, public_key): + """ + Deallocate a miner by sending a deregistration query. + + :param axon: Axon object containing miner details. + :param public_key: Public key of the miner; if None, it will be retrieved from the database. 
+ """ + if not public_key: + try: + # Instantiate the connection to the database and retrieve miner details + db = ComputeDb() + cursor = db.get_cursor() + + cursor.execute( + "SELECT details, hotkey FROM allocation WHERE hotkey = ?", + (axon.hotkey,) + ) + row = cursor.fetchone() + + if row: + info = json.loads(row[0]) # Parse JSON string from the 'details' column + public_key = info.get("regkey") + except Exception as e: + bt.logging.trace(f"{axon.hotkey}: Missing public key: {e}") + + try: + dendrite = bt.dendrite(wallet=self.wallet) + retry_count = 0 + max_retries = 3 + allocation_status = True + + while allocation_status and retry_count < max_retries: + try: + # Send deallocation query + deregister_response = dendrite.query( + axon, + Allocate( + timeline=0, + checking=False, + public_key=public_key, + ), + timeout=60, + ) + + if deregister_response and deregister_response.get("status") is True: + allocation_status = False + bt.logging.trace(f"Deallocated miner {axon.hotkey}") + else: + retry_count += 1 + bt.logging.trace( + f"{axon.hotkey}: Failed to deallocate miner. " + f"(attempt {retry_count}/{max_retries})" + ) + if retry_count >= max_retries: + bt.logging.trace(f"{axon.hotkey}: Max retries reached for deallocating miner.") + time.sleep(5) + except Exception as e: + retry_count += 1 + bt.logging.trace( + f"{axon.hotkey}: Error while trying to deallocate miner. " + f"(attempt {retry_count}/{max_retries}): {e}" + ) + if retry_count >= max_retries: + bt.logging.trace(f"{axon.hotkey}: Max retries reached for deallocating miner.") + time.sleep(5) + except Exception as e: + bt.logging.trace(f"{axon.hotkey}: Unexpected error during deallocation: {e}") def set_weights(self): # Remove all negative scores and attribute them 0. 
@@ -754,17 +1017,16 @@ async def start(self): self.loop = asyncio.get_running_loop() # Step 5: Perform queries to miners, scoring, and weight - block_next_challenge = 1 + block_next_pog = 1 block_next_sync_status = 1 block_next_set_weights = self.current_block + weights_rate_limit block_next_hardware_info = 1 block_next_miner_checking = 1 - time_next_challenge = None + time_next_pog = None time_next_sync_status = None time_next_set_weights = None time_next_hardware_info = None - time_next_miner_checking = None bt.logging.info("Starting validator loop.") while True: @@ -774,67 +1036,23 @@ async def start(self): if self.current_block not in self.blocks_done: self.blocks_done.add(self.current_block) - time_next_challenge = self.next_info(not block_next_challenge == 1, block_next_challenge) + time_next_pog = self.next_info(not block_next_pog == 1, block_next_pog) time_next_sync_status = self.next_info(not block_next_sync_status == 1, block_next_sync_status) time_next_set_weights = self.next_info(not block_next_set_weights == 1, block_next_set_weights) time_next_hardware_info = self.next_info( not block_next_hardware_info == 1 and self.validator_perform_hardware_query, block_next_hardware_info ) - time_next_miner_checking = self.next_info(not block_next_miner_checking == 1, block_next_miner_checking) - - # Perform pow queries - if self.current_block % block_next_challenge == 0 or block_next_challenge < self.current_block: - # Next block the validators will challenge again. - block_next_challenge = self.current_block + random.randint(50, 80) # 50,80 -> between ~ 10 and 16 minutes - # Filter axons with stake and ip address. 
- self._queryable_uids = self.get_queryable() + # Perform proof of GPU (pog) queries + if self.current_block % block_next_pog == 0 or block_next_pog < self.current_block: + block_next_pog = self.current_block + 360 - self.pow_requests = {} - self.new_pow_benchmark = {} - - self.threads = [] - for i in range(0, len(self.uids), self.validator_challenge_batch_size): - for _uid in self.uids[i : i + self.validator_challenge_batch_size]: - try: - axon = self._queryable_uids[_uid] - if axon.hotkey in self.allocated_hotkeys: - continue - difficulty = self.calc_difficulty(_uid) - password, _hash, _salt, mode, chars, mask = run_validator_pow(length=difficulty) - self.pow_requests[_uid] = (password, _hash, _salt, mode, chars, mask, difficulty) - self.threads.append( - threading.Thread( - target=self.execute_pow_request, - args=(_uid, axon, _hash, _salt, mode, chars, mask, difficulty), - name=f"th_execute_pow_request-{_uid}", - daemon=True, - ) - ) - except KeyError: - continue - - for thread in self.threads: - thread.start() - - for thread in self.threads: - thread.join() - - self.pow_benchmark = self.new_pow_benchmark - self.pow_benchmark_success = {k: v for k, v in self.pow_benchmark.items() if v["success"] is True and v["elapsed_time"] < pow_timeout} - - # Logs benchmarks for the validators - if len(self.pow_benchmark_success) > 0: - bt.logging.info("✅ Results success benchmarking:") - for uid, benchmark in self.pow_benchmark_success.items(): - bt.logging.info(f"{uid}: {benchmark}") + if self.gpu_task is None or self.gpu_task.done(): + # Schedule proof_of_gpu as a background task + self.gpu_task = asyncio.create_task(self.proof_of_gpu()) + self.gpu_task.add_done_callback(self.on_gpu_task_done) else: - bt.logging.warning("❌ Benchmarking: All miners failed. 
An issue occurred.") - - pow_benchmarks_list = [{**values, "uid": uid} for uid, values in self.pow_benchmark.items()] - update_challenge_details(self.db, pow_benchmarks_list) - - self.sync_scores() + bt.logging.info("Proof-of-GPU task is already running.") # Perform specs queries if (self.current_block % block_next_hardware_info == 0 and self.validator_perform_hardware_query) or ( @@ -851,12 +1069,12 @@ async def start(self): # Perform miner checking if self.current_block % block_next_miner_checking == 0 or block_next_miner_checking < self.current_block: # Next block the validators will do port checking again. - block_next_miner_checking = self.current_block + 50 # 50 -> every 10 minutes + block_next_miner_checking = self.current_block + 50 # 300 -> every 60 minutes # Filter axons with stake and ip address. self._queryable_uids = self.get_queryable() - #self.sync_checklist() + # self.sync_checklist() if self.current_block % block_next_sync_status == 0 or block_next_sync_status < self.current_block: block_next_sync_status = self.current_block + 25 # ~ every 5 minutes @@ -888,14 +1106,13 @@ async def start(self): f"Rank:{self.metagraph.R[self.validator_subnet_uid]} | " f"vTrust:{self.metagraph.validator_trust[self.validator_subnet_uid]} | " f"Emission:{self.metagraph.E[self.validator_subnet_uid]} | " - f"next_challenge: #{block_next_challenge} ~ {time_next_challenge} | " + f"next_pog: #{block_next_pog} ~ {time_next_pog} | " f"sync_status: #{block_next_sync_status} ~ {time_next_sync_status} | " f"set_weights: #{block_next_set_weights} ~ {time_next_set_weights} | " - f"hardware_info: #{block_next_hardware_info} ~ {time_next_hardware_info} |" - f"miner_checking: #{block_next_miner_checking} ~ {time_next_miner_checking}" + f"wandb_info: #{block_next_hardware_info} ~ {time_next_hardware_info} |" ) ) - time.sleep(1) + await asyncio.sleep(1) # If we encounter an unexpected error, log it for debugging. 
except RuntimeError as e: @@ -921,4 +1138,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5598ec5..f6608eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,5 @@ pyfiglet==1.0.2 python-dotenv==1.0.1 requests==2.31.0 paramiko==3.4.1 - +blake3 +ipwhois==1.3.0 diff --git a/scripts/opencompute/.env.example b/scripts/opencompute/.env.example new file mode 100644 index 0000000..0b5d19f --- /dev/null +++ b/scripts/opencompute/.env.example @@ -0,0 +1 @@ +WANDB_API_KEY="your_api_key" diff --git a/scripts/opencompute/icon.ico b/scripts/opencompute/icon.ico new file mode 100644 index 0000000..bdc6279 Binary files /dev/null and b/scripts/opencompute/icon.ico differ diff --git a/scripts/opencompute/main.py b/scripts/opencompute/main.py new file mode 100644 index 0000000..c9334a2 --- /dev/null +++ b/scripts/opencompute/main.py @@ -0,0 +1,116 @@ +# Streamlit main script +import streamlit as st +import os +from dotenv import load_dotenv +import pandas as pd +import requests +import time + +# Configure the page to use wide layout +st.set_page_config(page_title="Opencompute", layout="wide", page_icon="icon.ico") + +# Server details, insert the server IP and port +SERVER_IP = "" +SERVER_PORT = "" +SERVER_URL = f"http://{SERVER_IP}:{SERVER_PORT}" + +def get_data_from_server(endpoint): + response = requests.get(f"{SERVER_URL}/{endpoint}") + if response.status_code == 200: + return response.json() + else: + return {} + +def display_hardware_specs(specs_details, allocated_keys, penalized_keys): + # Compute all necessary data before setting up the tabs + column_headers = ["UID", "Hotkey", "GPU Name", "GPU Capacity (GiB)", "GPU Count", "CPU Count", "RAM (GiB)", "Disk Space (GiB)", "Status", "Conformity"] + table_data = [] + + gpu_instances = {} + total_gpu_counts = {} + + for index in sorted(specs_details.keys()): + hotkey = specs_details[index]['hotkey'] + details = 
specs_details[index]['details'] + if details: + try: + gpu_miner = details['gpu'] + gpu_capacity = "{:.2f}".format(gpu_miner['capacity'] / 1024) # Capacity is in MiB + gpu_name = str(gpu_miner['details'][0]['name']).lower() + gpu_count = gpu_miner['count'] + + cpu_miner = details['cpu'] + cpu_count = cpu_miner['count'] + + ram_miner = details['ram'] + ram = "{:.2f}".format(ram_miner['available'] / 1024.0 ** 3) # Convert bytes to GiB + + hard_disk_miner = details['hard_disk'] + hard_disk = "{:.2f}".format(hard_disk_miner['free'] / 1024.0 ** 3) # Convert bytes to GiB + + status = "Res." if hotkey in allocated_keys else "Avail." + conform = "No" if hotkey in penalized_keys else "Yes" + + row = [str(index), hotkey[:6] + ('...'), gpu_name, gpu_capacity, str(gpu_count), str(cpu_count), ram, hard_disk, status, conform] + + # Update summaries for GPU instances and total counts + if isinstance(gpu_name, str) and isinstance(gpu_count, int): + row = [str(index), hotkey[:6] + ('...'), gpu_name, gpu_capacity, str(gpu_count), str(cpu_count), ram, hard_disk, status, conform] + gpu_key = (gpu_name, gpu_count) + gpu_instances[gpu_key] = gpu_instances.get(gpu_key, 0) + 1 + total_gpu_counts[gpu_name] = total_gpu_counts.get(gpu_name, 0) + gpu_count + else: + row = [str(index), hotkey[:6] + ('...'), "No GPU data"] + ["N/A"] * 7 + + except (KeyError, IndexError, TypeError): + row = [str(index), hotkey[:6] + ('...'), "Invalid details"] + ["N/A"] * 7 + else: + row = [str(index), hotkey[:6] + ('...'), "No details available"] + ["N/A"] * 7 + + table_data.append(row) + + # Display the tabs + tab1, tab2, tab3 = st.tabs(["Hardware Overview", "Instances Summary", "Total GPU Counts"]) + + with tab1: + df = pd.DataFrame(table_data, columns=column_headers) + st.table(df) + + with tab2: + summary_data = [[gpu_name, str(gpu_count), str(instances)] for (gpu_name, gpu_count), instances in gpu_instances.items()] + if summary_data: + st.table(pd.DataFrame(summary_data, columns=["GPU Name", "GPU Count", 
"Instances Count"]))
+
+    with tab3:
+        summary_data = [[name, str(count)] for name, count in total_gpu_counts.items()]
+        if summary_data:
+            st.table(pd.DataFrame(summary_data, columns=["GPU Name", "Total GPU Count"]))
+
+# Streamlit App Layout
+st.title('Compute Subnet - Hardware Specifications')
+
+# Fetching data from external server
+with st.spinner('Fetching data from server...'):
+    try:
+        hotkeys_response = get_data_from_server("keys")
+        hotkeys = hotkeys_response.get("keys", [])
+
+        specs_response = get_data_from_server("specs")
+        specs_details = specs_response.get("specs", {})
+
+        allocated_keys_response = get_data_from_server("allocated_keys")
+        allocated_keys = allocated_keys_response.get("allocated_keys", [])
+
+        penalized_keys_response = get_data_from_server("penalized_keys")
+        penalized_keys = penalized_keys_response.get("penalized_keys", [])
+
+    except Exception as e:  # was a bare except: — hid the real failure behind a fixed message
+        print(f"Error fetching data from server: {e}")
+
+# Display fetched hardware specs
+try:
+    display_hardware_specs(specs_details, allocated_keys, penalized_keys)
+except Exception as e:  # was a bare except: — also catches NameError if the fetch above failed
+    st.write("Unable to connect to the server. Please try again later.")
+    print(f"Error displaying hardware specs: {e}")
+
diff --git a/scripts/opencompute/requirements.txt b/scripts/opencompute/requirements.txt
new file mode 100644
index 0000000..53313bd
--- /dev/null
+++ b/scripts/opencompute/requirements.txt
@@ -0,0 +1,4 @@
+streamlit==1.33.0 # Adjust the version based on what you're currently using or the latest available
+wandb==0.16.6 # Adjust according to the version you wish to use
+python-dotenv==1.0.1 # For loading environment variables
+pandas==2.2.2 # Adjust based on your current or desired version
diff --git a/scripts/opencompute/server.py b/scripts/opencompute/server.py
new file mode 100644
index 0000000..730269e
--- /dev/null
+++ b/scripts/opencompute/server.py
@@ -0,0 +1,175 @@
+from fastapi import FastAPI
+from typing import Dict, List, Any
+import bittensor as bt
+import wandb
+import os
+from dotenv import load_dotenv
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+
+app = FastAPI()
+
+# Load environment variables
+load_dotenv()
+api_key = os.getenv("WANDB_API_KEY")
+
+# Constants for W&B
+PUBLIC_WANDB_NAME = "opencompute"
+PUBLIC_WANDB_ENTITY = "neuralinternet"
+
+# Initialize the Bittensor metagraph with the specified netuid
+metagraph = bt.metagraph(netuid=27)
+
+# Cache to store fetched data
+hardware_specs_cache: Dict[int, Dict[str, Any]] = {}
+allocated_hotkeys_cache: List[str] = []
+penalized_hotkeys_cache: List[str] = []
+
+# Create a ThreadPoolExecutor
+executor = ThreadPoolExecutor(max_workers=4)
+
+# Function to fetch hardware specs from wandb
+def fetch_hardware_specs(api, hotkeys: List[str]) -> Dict[int, Dict[str, Any]]:
+    db_specs_dict: Dict[int, Dict[str, Any]] = {}
+    project_path = f"{PUBLIC_WANDB_ENTITY}/{PUBLIC_WANDB_NAME}"
+    runs = api.runs(project_path)
+    try:
+        for run in runs:
+            run_config = run.config
+            hotkey = run_config.get('hotkey')
+            details = run_config.get('specs')
+            role = run_config.get('role')
+            if hotkey in hotkeys and isinstance(details, dict) and role == 'miner':
+                index = hotkeys.index(hotkey)
+                db_specs_dict[index] = {"hotkey": hotkey, "details": details}
+    except Exception as e:
+        print(f"An error occurred while getting specs from wandb: {e}")
+    return db_specs_dict
+
+# Function to get all allocated hotkeys from all validators
+def get_allocated_hotkeys(api) -> List[str]:
+    api.flush()
+    runs = api.runs(f"{PUBLIC_WANDB_ENTITY}/{PUBLIC_WANDB_NAME}")
+
+    if not runs:
+        print("No validator info found in the project opencompute.")
+        return []
+
+    validator_runs = [run for run in runs if run.config.get('role') == 'validator']
+    allocated_keys_list: List[str] = []
+
+    for run in validator_runs:
+        try:
+            run_config = run.config
+            allocated_keys = run_config.get('allocated_hotkeys')
+            if allocated_keys:
+                allocated_keys_list.extend(allocated_keys)
+        except Exception as e:
+            print(f"Run ID: {run.id}, Name: {run.name}, Error: {e}")
+
+    return allocated_keys_list
+
+# Function to get penalized hotkeys from a specific validator run
+def get_penalized_hotkeys_id(api, run_id: str) -> List[str]:
+    api.flush()
+
+    # Fetch the specific run by its ID
+    run = api.run(run_id)  # NOTE(review): api.run raises on a missing run rather than returning None — confirm
+
+    if not run:
+        print(f"No run info found for ID {run_id}.")
+        return []
+
+    penalized_keys_list: List[str] = []
+
+    try:
+        run_config = run.config
+        # Updated to get the checklist of penalized hotkeys
+        penalized_hotkeys_checklist = run_config.get('penalized_hotkeys_checklist', [])
+        if penalized_hotkeys_checklist:
+            # Extract the 'hotkey' field from dict entries (plain-string entries kept as-is)
+            for entry in penalized_hotkeys_checklist:
+                hotkey = entry.get('hotkey') if isinstance(entry, dict) else entry
+                if hotkey:
+                    penalized_keys_list.append(hotkey)
+    except Exception as e:
+        print(f"Run ID: {run.id}, Name: {run.name}, Error: {e}")
+
+    return penalized_keys_list
+
+# Function to get penalized hotkeys
+def get_penalized_hotkeys(api) -> List[str]:
+    api.flush()
+    runs = api.runs(f"{PUBLIC_WANDB_ENTITY}/{PUBLIC_WANDB_NAME}")
+
+    if not runs:
+        print("No validator info found in the project opencompute.")
+        return []
+
+    validator_runs = [run for run in runs if run.config.get('role') == 'validator']
+    penalized_keys_list: List[str] = []
+
+    for run in validator_runs:
+        try:
+            run_config = run.config
+            # Updated to get the checklist of penalized hotkeys
+            penalized_hotkeys_checklist = run_config.get('penalized_hotkeys_checklist', [])
+            if penalized_hotkeys_checklist:
+                # Loop through the checklist and extract the 'hotkey' field
+                for entry in penalized_hotkeys_checklist:
+                    hotkey = entry.get('hotkey')
+                    if hotkey:
+                        penalized_keys_list.append(hotkey)
+        except Exception as e:
+            print(f"Run ID: {run.id}, Name: {run.name}, Error: {e}")
+
+    return penalized_keys_list
+
+# Background task to sync the metagraph and fetch hardware specs and allocated hotkeys periodically
+async def sync_data_periodically():
+    global hardware_specs_cache, allocated_hotkeys_cache, penalized_hotkeys_cache
+    while True:
+        try:
+            metagraph.sync()
+
+            # Run the blocking W&B API calls in a separate thread
+            loop = asyncio.get_event_loop()
+            wandb.login(key=api_key)
+            api = wandb.Api()
+
+            hotkeys = metagraph.hotkeys
+
+            hardware_specs_cache = await loop.run_in_executor(executor, fetch_hardware_specs, api, hotkeys)
+            allocated_hotkeys_cache = await loop.run_in_executor(executor, get_allocated_hotkeys, api)
+            #penalized_hotkeys_cache = await loop.run_in_executor(executor, get_penalized_hotkeys, api)
+            penalized_hotkeys_cache = await loop.run_in_executor(executor, get_penalized_hotkeys_id, api, "neuralinternet/opencompute/dvgtj3dr")
+
+        except Exception as e:
+            print(f"An error occurred during periodic sync: {e}")
+
+        await asyncio.sleep(600) # Sleep for 10 minutes
+
+@app.on_event("startup")
+async def startup_event():
+    asyncio.create_task(sync_data_periodically())
+
+@app.get("/keys")
+async def get_keys() -> Dict[str, List[str]]:
+    hotkeys = metagraph.hotkeys
+    return {"keys": hotkeys}
+
+@app.get("/specs")
+async def get_specs() -> Dict[str, Dict[int, Dict[str, Any]]]:
+    return {"specs": hardware_specs_cache}
+
+@app.get("/allocated_keys")
+async def get_allocated_keys() -> Dict[str, List[str]]:
+    return {"allocated_keys": allocated_hotkeys_cache}
+
+@app.get("/penalized_keys")
+async def get_penalized_keys() -> Dict[str, List[str]]:
+    return {"penalized_keys": penalized_hotkeys_cache}
+
+# To run the server (example):
+# uvicorn server:app --reload --host 0.0.0.0 --port 8316
+# pm2 start uvicorn --interpreter python3 --name opencompute_server -- --host 0.0.0.0 --port 8000 server:app