# -*- coding: utf-8 -*-
"""Quantize LLMs to GGUF.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1kvvWdDKefT_IIZ5fvDku-B569kS486qh
"""
# Clone llama.cpp and build it with CUDA support (LLAMA_CUBLAS=1 enables the
# cuBLAS backend in the Makefile build), then install the Python dependencies
# needed by its conversion scripts.
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && LLAMA_CUBLAS=1 make && pip install -r requirements.txt
# Download the original Hugging Face checkpoint, then convert it to a
# full-precision (FP16) GGUF file, which serves as input to the quantizer.
from huggingface_hub import snapshot_download

model_name = "Qwen/Qwen1.5-1.8B"
methods = ['q4_k_m']  # quantization methods to produce
base_model = "./original_model/"
quantized_path = "./quantized_model/"

snapshot_download(repo_id=model_name, local_dir=base_model, local_dir_use_symlinks=False)

fp16_model = quantized_path + 'FP16.gguf'
!mkdir -p ./quantized_model/
!python llama.cpp/convert-hf-to-gguf.py ./original_model/ --outtype f16 --outfile ./quantized_model/FP16.gguf
# Quantize the FP16 GGUF into each of the requested formats.
import os

for m in methods:
    qtype = f"{quantized_path}{m.upper()}.gguf"
    os.system(f"./llama.cpp/quantize {fp16_model} {qtype} {m}")
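# Optional sanity check (a minimal sketch, not part of the original notebook):
# list the generated GGUF files with their sizes to confirm the quantized
# model came out substantially smaller than the FP16 original.
for fname in sorted(os.listdir(quantized_path)):
    size_gb = os.path.getsize(os.path.join(quantized_path, fname)) / 1e9
    print(f"{fname}: {size_gb:.2f} GB")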
# Spot-check the quantized model with an interactive chat session
# (-n caps generation at 90 tokens, -r sets the reverse prompt that
# hands control back to the user).
! ./llama.cpp/main -m ./quantized_model/Q4_K_M.gguf -n 90 --repeat_penalty 1.0 --color -i -r "User:" -f llama.cpp/prompts/chat-with-bob.txt
# Authenticate with the Hugging Face Hub, create a public repository, and
# upload the quantized model to it.
from huggingface_hub import notebook_login
notebook_login()

from huggingface_hub import HfApi, create_repo

model_path = "./quantized_model/Q4_K_M.gguf"  # your model's local path
repo_name = "qwen1.5-llm"                     # desired HF Hub repository name
repo_url = create_repo(repo_name, private=False)

api = HfApi()
api.upload_file(
    path_or_fileobj=model_path,
    path_in_repo="Q4_K_M.gguf",
    repo_id=repo_url.repo_id,  # e.g. "skuma307/qwen1.5-llm"
    repo_type="model",
)
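# Usage sketch (an assumption, not part of the original notebook): once the
# upload finishes, anyone can fetch the quantized file from the Hub and point
# llama.cpp at it.
from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(repo_id=repo_url.repo_id, filename="Q4_K_M.gguf")
print(f"Quantized model downloaded to {gguf_path}")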