Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Specify UTF-8 encoding for every open function #1283

Merged
merged 3 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion litgpt/data/tinystories.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def val_dataloader(self) -> DataLoader:


def tokenize(filename: str, tokenizer: Tokenizer):
with open(filename, "r") as f:
with open(filename, "r", encoding="utf-8") as f:
data = json.load(f)
global_rank = int(os.environ["DATA_OPTIMIZER_GLOBAL_RANK"])
num_workers = int(os.environ["DATA_OPTIMIZER_NUM_WORKERS"])
Expand Down
2 changes: 1 addition & 1 deletion litgpt/eval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def convert_and_evaluate(
save_filepath = out_dir / Path("results.json") if save_filepath is None else Path(save_filepath)
config_filepath = checkpoint_dir/"model_config.yaml"

with open(config_filepath) as f:
with open(config_filepath, encoding="utf-8") as f:
config_dict = yaml.safe_load(f)
repo_id = f"{config_dict['hf_config']['org']}/{config_dict['hf_config']['name']}"

Expand Down
4 changes: 2 additions & 2 deletions litgpt/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,12 +340,12 @@ def save_prompt_style(style: Union[str, PromptStyle], checkpoint_dir: Path) -> N
cls = type(style)
# Allow saving the full module path for user-defined prompt classes
config = {"class_path": f"{cls.__module__}.{cls.__name__}"}
with open(checkpoint_dir / "prompt_style.yaml", "w") as file:
with open(checkpoint_dir / "prompt_style.yaml", "w", encoding="utf-8") as file:
yaml.dump(config, file)


def load_prompt_style(checkpoint_dir: Path) -> PromptStyle:
with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
config = yaml.safe_load(file)
# Support loading the full module path for user-defined prompt classes
full_module_path, cls_name = config["class_path"].rsplit(".", 1)
Expand Down
2 changes: 1 addition & 1 deletion litgpt/scripts/convert_hf_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ def convert_hf_checkpoint(
# Load the json file containing weight mapping
pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json"
if pytorch_bin_map_json_path.is_file(): # not all checkpoints have this file
with open(pytorch_bin_map_json_path) as json_map:
with open(pytorch_bin_map_json_path, encoding="utf-8") as json_map:
bin_index = json.load(json_map)
bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
else:
Expand Down
2 changes: 1 addition & 1 deletion litgpt/scripts/merge_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def load_lora_metadata(checkpoint_dir: Path) -> Tuple[Dict[str, Any], Path, Opti
f" the `litgpt/finetune/lora.py` script."
)

with open(hparams_file, "r") as file:
with open(hparams_file, "r", encoding="utf-8") as file:
hparams = yaml.safe_load(file)

lora_params = {k: v for k, v in hparams.items() if k.startswith("lora_")}
Expand Down
6 changes: 3 additions & 3 deletions litgpt/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None:
self.backend = "huggingface"

if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file():
with open(special_tokens_path) as fp:
with open(special_tokens_path, encoding="utf-8") as fp:
config = json.load(fp)
bos_token = config.get("bos_token")
self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None
eos_token = config.get("eos_token")
self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None
if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file():
with open(special_tokens_path) as fp:
with open(special_tokens_path, encoding="utf-8") as fp:
config = json.load(fp)
if self.bos_id is None:
self.bos_id = config.get("bos_token_id")
Expand Down Expand Up @@ -71,7 +71,7 @@ def token_to_id(self, token: str) -> int:
def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file():
return False
with open(tokenizer_config_path) as fp:
with open(tokenizer_config_path, encoding="utf-8") as fp:
config = json.load(fp)
if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
return True
Expand Down
2 changes: 1 addition & 1 deletion litgpt/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ def save_hyperparameters(function: callable, checkpoint_dir: Path) -> None:

def save_config(config: "Config", checkpoint_dir: Path) -> None:
config_dict = asdict(config)
with open(checkpoint_dir / "model_config.yaml", "w") as fp:
with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
yaml.dump(config_dict, fp)


Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_tinystories.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_tokenize(tmp_path, monkeypatch):
story1, story2 = "foo bar", " fun "
data = [{"story": story1}, {"story": story2}]
shard_path = tmp_path / "data.json"
with open(shard_path, "w") as f:
with open(shard_path, "w", encoding="utf-8") as f:
json.dump(data, f)

class Tokenizer:
Expand Down
4 changes: 2 additions & 2 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_from_checkpoint(tmp_path):

# 3. If only `model_config.yaml` exists.
config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2}
with open(tmp_path / "model_config.yaml", "w") as file:
with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file:
yaml.dump(config_data, file)
config = Config.from_checkpoint(tmp_path)
assert config.name == "pythia-14m"
Expand All @@ -69,7 +69,7 @@ def test_from_checkpoint(tmp_path):

# 4. Both `model_config.yaml` and a matching config exist, but `model_config.yaml` supersedes matching config
(tmp_path / "pythia-14m").mkdir()
with open(tmp_path / "pythia-14m/model_config.yaml", "w") as file:
with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file:
yaml.dump(config_data, file)
config = Config.from_checkpoint(tmp_path / "pythia-14m")
assert config.name == "pythia-14m"
Expand Down
2 changes: 1 addition & 1 deletion tests/test_convert_lit_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_convert_lit_checkpoint(tmp_path):
checkpoint_path = tmp_path / "lit_model.pth"
config_path = tmp_path / "model_config.yaml"
torch.save(ours_model.state_dict(), checkpoint_path)
with open(config_path, "w") as fp:
with open(config_path, "w", encoding="utf-8") as fp:
yaml.dump(asdict(ours_config), fp)
output_dir = tmp_path / "out_dir"

Expand Down
2 changes: 1 addition & 1 deletion tests/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_evaluate_script(tmp_path, monkeypatch):
checkpoint_path = tmp_path / "lit_model.pth"
torch.save(ours_model.state_dict(), checkpoint_path)
config_path = tmp_path / "model_config.yaml"
with open(config_path, "w") as fp:
with open(config_path, "w", encoding="utf-8") as fp:
yaml.dump(asdict(ours_config), fp)

fn_kwargs = dict(
Expand Down
6 changes: 3 additions & 3 deletions tests/test_merge_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype)

# Create a fake pretrained checkpoint
config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16)
with open(pretrained_checkpoint_dir / "model_config.yaml", "w") as fp:
with open(pretrained_checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
yaml.dump(config, fp)
base_model = GPT.from_name("pythia-14m", **config).to(dtype=pretrained_dtype)
state_dict = base_model.state_dict()
Expand All @@ -45,7 +45,7 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype)
assert len(state_dict) == 6
torch.save(state_dict, lora_checkpoint_dir / "lit_model.pth.lora")
hparams = dict(checkpoint_dir=str(pretrained_checkpoint_dir), **lora_kwargs)
with open(lora_checkpoint_dir / "hyperparameters.yaml", "w") as file:
with open(lora_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file:
yaml.dump(hparams, file)
shutil.copyfile(pretrained_checkpoint_dir / "model_config.yaml", lora_checkpoint_dir / "model_config.yaml")

Expand Down Expand Up @@ -80,7 +80,7 @@ def test_load_lora_metadata(fake_checkpoint_dir):
load_lora_metadata(fake_checkpoint_dir)

hparams = dict(precision="bf16-mixed", checkpoint_dir="checkpoints/meta-llama/Llama-2-7b", lora_r=8, lora_alpha=16)
with open(fake_checkpoint_dir / "hyperparameters.yaml", "w") as file:
with open(fake_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file:
yaml.dump(hparams, file)

lora_args, pretrained_dir, precision = load_lora_metadata(fake_checkpoint_dir)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_save_load_prompt_style(tmp_path):
assert not has_prompt_style(checkpoint_dir)
save_prompt_style("alpaca", checkpoint_dir)
assert has_prompt_style(checkpoint_dir)
with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
contents = yaml.safe_load(file)
assert contents == {"class_path": "litgpt.prompts.Alpaca"}
loaded = load_prompt_style(checkpoint_dir)
Expand All @@ -108,7 +108,7 @@ def test_save_load_prompt_style(tmp_path):
checkpoint_dir = tmp_path / "custom"
checkpoint_dir.mkdir()
save_prompt_style(CustomPromptStyle(), checkpoint_dir)
with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
contents = yaml.safe_load(file)
assert contents == {"class_path": "test_prompts.CustomPromptStyle"}
loaded = load_prompt_style(checkpoint_dir)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def test_save_hyperparameters(tmp_path):
with mock.patch("sys.argv", ["any.py", "--out_dir", str(tmp_path), "--foo", "True"]):
CLI(_test_function)

with open(tmp_path / "hyperparameters.yaml", "r") as file:
with open(tmp_path / "hyperparameters.yaml", "r", encoding="utf-8") as file:
hparams = yaml.full_load(file)

assert hparams["out_dir"] == str(tmp_path)
Expand All @@ -277,7 +277,7 @@ def test_save_hyperparameters_known_commands(command, tmp_path):
with mock.patch("sys.argv", [*command.split(" "), "--out_dir", str(tmp_path), "--foo", "True"]):
save_hyperparameters(_test_function2, tmp_path)

with open(tmp_path / "hyperparameters.yaml", "r") as file:
with open(tmp_path / "hyperparameters.yaml", "r", encoding="utf-8") as file:
hparams = yaml.full_load(file)

assert hparams["out_dir"] == str(tmp_path)
Expand Down
Loading