Commit

Merge pull request #71 from SkywardAI/fix/downloader
fix/auto_downloader add token file
Aisuko authored Apr 19, 2024
2 parents 3f838e8 + 4c115df commit 44bd9d5
Showing 3 changed files with 27 additions and 6 deletions.
.gitignore — 4 changes: 3 additions & 1 deletion
@@ -158,4 +158,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-params/
+params/
+
+model/
pyproject.toml — 2 changes: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "kimchima"
-version = "0.4.7"
+version = "0.4.8"
 description = "The collections of tools for ML model development."
 authors = ["Aisuko <urakiny@gmail.com>"]
 license = "Apache-2.0"
src/kimchima/utils/downloader.py — 27 changes: 23 additions & 4 deletions
@@ -22,6 +22,8 @@
     AutoTokenizer,
     AutoModelForCausalLM,
 )
+import shutil
+import os
 
 logger=logging.get_logger(__name__)
 
@@ -34,7 +36,14 @@ def __init__(self):
"Embeddings is designed to be instantiated "
"using the `Embeddings.from_pretrained(pretrained_model_name_or_path)` method."
)

def _move_files_and_remove_dir(src_folder, dst_folder):
for filename in os.listdir(src_folder):
dst_file = os.path.join(dst_folder, filename)
if os.path.exists(dst_file):
os.remove(dst_file)
shutil.move(os.path.join(src_folder, filename), dst_folder)
shutil.rmtree(src_folder)

@classmethod
def model_downloader(cls, *args, **kwargs):
r"""
@@ -66,9 +75,18 @@ def auto_downloader(cls, *args, **kwargs):
         if model_name is None:
             raise ValueError("model_name is required")
         folder_name=kwargs.pop("folder_name", None)
-
+        if folder_name is None:
+            folder_name = model_name
         model=AutoModel.from_pretrained(model_name)
-        model.save_pretrained(folder_name if folder_name is not None else model_name)
+        # save_pretrained stores the model weights and config, but not the tokenizer files
+        model.save_pretrained(folder_name)
+
+        tokenizer=AutoTokenizer.from_pretrained(model_name)
+        tokenizer.save_pretrained(folder_name + "/tmp1", legacy_format=False)
+        tokenizer.save_pretrained(folder_name + "/tmp2", legacy_format=True)
+
+        for tmp_folder in ["/tmp1", "/tmp2"]:
+            cls._move_files_and_remove_dir(folder_name + tmp_folder, folder_name)
         logger.info(f"Model {model_name} has been downloaded successfully")
 
 
@@ -104,4 +122,5 @@ def auto_token_downloader(cls, *args, **kwargs):

         tokenizer=AutoTokenizer.from_pretrained(model_name)
         tokenizer.save_pretrained(folder_name if folder_name is not None else model_name)
-        logger.info(f"Tokenizer {model_name} has been downloaded successfully")
+        logger.info(f"Tokenizer {model_name} has been downloaded successfully")
+
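
For context, here is a minimal usage sketch of the downloader after this change. It assumes the methods in this diff are classmethods on a class named Downloader exported from kimchima.utils.downloader (the class name is not visible in the diff) and that model_name and folder_name are passed as keyword arguments, as the kwargs.pop call suggests; the model id below is only an example.

# Usage sketch (assumptions: class is named `Downloader`, keyword-argument API).
from kimchima.utils.downloader import Downloader  # assumed import path

# auto_downloader saves the model weights and config into ./model, writes the
# tokenizer in non-legacy and legacy formats to ./model/tmp1 and ./model/tmp2,
# then merges those files into ./model and removes the tmp directories.
Downloader.auto_downloader(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # example model id
    folder_name="model",  # matches the new .gitignore entry
)

# auto_token_downloader saves only the tokenizer files into the same folder.
Downloader.auto_token_downloader(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    folder_name="model",
)

Saving the tokenizer once with legacy_format=False and once with legacy_format=True appears intended to leave both tokenizer.json and the legacy vocabulary files next to the model weights, which matches the PR description "add token file".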