diff --git a/.flake8 b/.flake8 index 73d61c7e..29658d00 100644 --- a/.flake8 +++ b/.flake8 @@ -2,3 +2,4 @@ exclude = herm/models/openassistant.py herm/models/starling.py +extend-ignore = E203 \ No newline at end of file diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index dbe01a20..07f1c437 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -24,7 +24,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install ".[quality]" + python -m pip install . - name: Code quality run: | make quality \ No newline at end of file diff --git a/herm/dpo.py b/herm/dpo.py index ecfd21be..a09fabe5 100644 --- a/herm/dpo.py +++ b/herm/dpo.py @@ -277,11 +277,14 @@ def get_batch_logps( Args: logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) - labels: Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are ignored. Shape: (batch_size, sequence_length) - average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens. + labels: Labels for which to compute the log probabilities. Label tokens with a value of + label_pad_token_id are ignored. Shape: (batch_size, sequence_length) + average_log_prob: If True, return the average log probability per (non-masked) token. + Otherwise, return the sum of the log probabilities of the (non-masked) tokens. Returns: - A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits. + A tensor of shape (batch_size,) containing the average/sum log probabilities + of the given labels under the given logits. """ if logits.shape[:-1] != labels.shape: raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") @@ -312,7 +315,8 @@ def concatenated_inputs( """Concatenate the chosen and rejected inputs into a single tensor. Args: - batch: A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors of shape (batch_size, sequence_length). + batch: A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', + which are tensors of shape (batch_size, sequence_length). is_encoder_decoder: Whether the model is an encoder-decoder model. label_pad_token_id: The label pad token id. padding_value: The padding value to use for the concatenated inputs_ids. diff --git a/herm/models/openbmb.py b/herm/models/openbmb.py index d04182f4..4c16e691 100644 --- a/herm/models/openbmb.py +++ b/herm/models/openbmb.py @@ -27,7 +27,7 @@ def __init__(self, task, model, tokenizer): self.tokenizer = tokenizer def __call__(self, samples, **kwargs): - batch_size = kwargs.get("batch_size", 1) + _ = kwargs.get("batch_size", 1) truncation = kwargs.get("truncation", True) padding = kwargs.get("padding", True) max_length = kwargs.get("max_length", 2048) diff --git a/herm/models/shp.py b/herm/models/shp.py index 94209a25..a8207d81 100644 --- a/herm/models/shp.py +++ b/herm/models/shp.py @@ -59,7 +59,10 @@ def __call__(self, candidates_A: List[List[Dict]], candidates_B: List[List[Dict] Pass it into the model, decide on winner. From the model readme: - >> input_text = "POST: Instacart gave me 50 pounds of limes instead of 5 pounds... what the hell do I do with 50 pounds of limes? I've already donated a bunch and gave a bunch away. I'm planning on making a bunch of lime-themed cocktails, but... jeez. Ceviche? \n\n RESPONSE A: Lime juice, and zest, then freeze in small quantities.\n\n RESPONSE B: Lime marmalade lol\n\n Which response is better? RESPONSE" + >> input_text = "POST: Instacart gave me 50 pounds of limes instead of 5 pounds... + what the hell do I do with 50 pounds of limes? I've already donated a bunch and gave a bunch away. + I'm planning on making a bunch of lime-themed cocktails, but... jeez. Ceviche? \n\n RESPONSE A: Lime juice, + then freeze in small quantities.\n\n RESPONSE B: Lime marmalade lol\n\n Which response is better? RESPONSE" >> x = tokenizer([input_text], return_tensors='pt').input_ids.to(device) >> y = model.generate(x, max_new_tokens=1) >> tokenizer.batch_decode(y, skip_special_tokens=True) diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py index ff6daa6a..1c826f84 100644 --- a/scripts/run_dpo.py +++ b/scripts/run_dpo.py @@ -18,36 +18,26 @@ import os import sys +import numpy as np import torch import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from datasets import load_dataset from fastchat.conversation import get_conv_template -from huggingface_hub import upload_file +from huggingface_hub import HfApi from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer from trl.trainer.utils import DPODataCollatorWithPadding -from herm import DPOInference, prepare_dialogue, prepare_dialogue_from_tokenizer +from herm import DPOInference, load_eval_dataset + +# get token from HF_TOKEN env variable, but if it doesn't exist pass none +HF_TOKEN = os.getenv("HF_TOKEN", None) +api = HfApi(token=HF_TOKEN) # data repo to upload results EVAL_REPO = "ai2-rlhf-collab/rm-benchmark-results" -EVAL_SUBSETS = [ - "alpacaeval-easy", - "alpacaeval-hard", - "alpacaeval-length", - "llmbar-adver-GPTInst", - "llmbar-adver-GPTOut", - "llmbar-adver-manual", - "llmbar-adver-neighbor", - "llmbar-natural", - "mt-bench-easy", - "mt-bench-hard", - "mt-bench-med", - "refusals-dangerous", - "refusals-offensive", -] +PREFS_REPO = "ai2-rlhf-collab/rm-testset-results" def get_args(): @@ -64,6 +54,9 @@ def get_args(): parser.add_argument("--direct_load", action="store_true", help="directly load model instead of pipeline") parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)") parser.add_argument("--batch_size", type=int, default=64, help="batch size for inference") + parser.add_argument( + "--pref_sets", action="store_true", help="run on common preference sets instead of our custom eval set" + ) args = parser.parse_args() return args @@ -96,36 +89,18 @@ def main(): conv = get_conv_template(chat_template) ############################ - # Load dataset from ai2-rlhf-collab/rm-benchmark-dev, "filtered" split + # Load dataset ############################ logger.info("*** Load dataset ***") - raw_dataset = load_dataset("ai2-rlhf-collab/rm-benchmark-dev", split="filtered") - tokenizer_path = args.tokenizer if args.tokenizer else args.model tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - # if tokenizer.chat_template exists, use that - if hasattr(tokenizer, "chat_template"): - ex = prepare_dialogue_from_tokenizer(raw_dataset[0], tokenizer) - # docs https://huggingface.co/docs/transformers/main/en/chat_templating - # double up to bypass some weid bug - dataset = raw_dataset.map( - prepare_dialogue_from_tokenizer, - fn_kwargs={"tokenizer": tokenizer}, - ) - dataset = dataset.map( - prepare_dialogue_from_tokenizer, - fn_kwargs={"tokenizer": tokenizer}, - ) - # else use FastChat to get chat template - else: - dataset = raw_dataset.map( - prepare_dialogue, - fn_kwargs={"dialogue_template": conv}, - ) - dataset = dataset.map( - prepare_dialogue, - fn_kwargs={"dialogue_template": conv}, - ) + dataset, subsets = load_eval_dataset( + core_set=not args.pref_sets, + conv=conv, + tokenizer=tokenizer, + logger=logger, + keep_columns=["text_chosen", "text_rejected"], + ) ############################ # Load reward model pipeline @@ -192,6 +167,9 @@ def main(): for chosen, rejected in zip(score_chosen, score_rejected) ] + ############################ + # Print & process results + ############################ # add column for results for easy printing out_dataset = dataset.add_column("results", results) @@ -199,7 +177,8 @@ def main(): results["model"] = args.model results["chat_template"] = args.chat_template # print per subset and log into results file - for subset in EVAL_SUBSETS: + present_subsets = np.unique(subsets) + for subset in present_subsets: subset_dataset = out_dataset.filter(lambda example: example["subset"] == subset) num_correct = sum(subset_dataset["results"]) num_total = len(subset_dataset["results"]) @@ -227,10 +206,10 @@ def main(): # Upload results as json if not args.do_not_save: - scores_url = upload_file( + scores_url = api.upload_file( path_or_fileobj=path, path_in_repo=f"data/{args.model}.json", - repo_id=EVAL_REPO, + repo_id=EVAL_REPO if not args.pref_sets else PREFS_REPO, # push to correct results repo repo_type="dataset", commit_message=f"Add reward model scores for model {args.model}", ) diff --git a/scripts/run_rm.py b/scripts/run_rm.py index 71bb4dbb..af86c332 100644 --- a/scripts/run_rm.py +++ b/scripts/run_rm.py @@ -126,7 +126,7 @@ def main(): conv = get_conv_template(chat_template) ############################ - # Load dataset from ai2-rlhf-collab/rm-benchmark-dev, "filtered" split + # Load dataset ############################ logger.info("*** Load dataset ***") tokenizer_path = args.tokenizer if args.tokenizer else args.model @@ -134,6 +134,7 @@ def main(): dataset, subsets = load_eval_dataset( core_set=not args.pref_sets, conv=conv, + custom_dialogue_formatting=custom_dialogue, tokenizer=tokenizer, logger=logger, keep_columns=["text_chosen", "text_rejected"], @@ -192,7 +193,7 @@ def main(): # first, handle custom pipelines that we must batch normally if not args.direct_load or pipeline_builder == pipeline: logger.info("*** Running forward pass via built in pipeline abstraction ***") - # this setup can be optimized slightly with one pipeline call, I just find the logic here more failsafe on correct indexing + # this setup can be optimized slightly with one pipeline call # prepare for inference reward_pipe = accelerator.prepare(reward_pipe) diff --git a/setup.py b/setup.py index 9abacaf3..c6612f59 100644 --- a/setup.py +++ b/setup.py @@ -11,49 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import re from setuptools import find_packages, setup -_deps = [ - "accelerate", - "bitsandbytes", - "black==23.1.0", - "datasets", - "flake8>=6.0", - "fschat[model_worker,webui]", - "huggingface_hub", - "isort>=5.12.0", - "pytest", - "scipy", - "tokenizers", - "transformers", - "trl>=0.7.7", -] -deps = {b: a for a, b in (re.findall(r"^(([^!=<>~ \[\]]+)(?:\[[^\]]+\])?(?:[!=<>~ ].*)?$)", x)[0] for x in _deps)} - - -def deps_list(*pkgs): - return [deps[pkg] for pkg in pkgs] - - -extras = {} -extras["quality"] = deps_list("black", "isort", "flake8") -extras["tests"] = deps_list("pytest") - -install_requires = [ - deps["accelerate"], - deps["bitsandbytes"], - deps["datasets"], - deps["fschat"], - deps["huggingface_hub"], - deps["scipy"], - deps["tokenizers"], - deps["transformers"], - deps["trl"], -] - - setup( name="herm", version="0.1.0.dev", @@ -71,6 +31,19 @@ def deps_list(*pkgs): "Operating System :: OS Independent", ], python_requires=">=3.10", - package_dir={"": "herm"}, - install_requires=install_requires, + install_requires=[ + "accelerate", + "bitsandbytes", + "black==23.1.0", + "datasets", + "flake8>=6.0", + "fschat[model_worker,webui]", + "huggingface_hub", + "isort>=5.12.0", + "pytest", + "scipy", + "tokenizers", + "transformers", + "trl>=0.7.7", + ], ) diff --git a/tests/test_data.py b/tests/test_data.py index f0fda6ee..1335cb5b 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -17,7 +17,7 @@ from fastchat.conversation import get_conv_template from transformers import AutoTokenizer -from herm import load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer +from herm import prepare_dialogue, prepare_dialogue_from_tokenizer class PrepareDialoguesTest(unittest.TestCase):