
quality
natolambert committed Jan 26, 2024
1 parent 4c24120 commit 9c849cc
Showing 9 changed files with 60 additions and 99 deletions.
1 change: 1 addition & 0 deletions .flake8
@@ -2,3 +2,4 @@
exclude =
herm/models/openassistant.py
herm/models/starling.py
extend-ignore = E203
2 changes: 1 addition & 1 deletion .github/workflows/quality.yml
@@ -24,7 +24,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install ".[quality]"
python -m pip install .
- name: Code quality
run: |
make quality
12 changes: 8 additions & 4 deletions herm/dpo.py
@@ -277,11 +277,14 @@ def get_batch_logps(
Args:
logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
labels: Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are ignored. Shape: (batch_size, sequence_length)
average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
labels: Labels for which to compute the log probabilities. Label tokens with a value of
label_pad_token_id are ignored. Shape: (batch_size, sequence_length)
average_log_prob: If True, return the average log probability per (non-masked) token.
Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
Returns:
A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
A tensor of shape (batch_size,) containing the average/sum log probabilities
of the given labels under the given logits.
"""
if logits.shape[:-1] != labels.shape:
raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.")
@@ -312,7 +315,8 @@ def concatenated_inputs(
"""Concatenate the chosen and rejected inputs into a single tensor.
Args:
batch: A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors of shape (batch_size, sequence_length).
batch: A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids',
which are tensors of shape (batch_size, sequence_length).
is_encoder_decoder: Whether the model is an encoder-decoder model.
label_pad_token_id: The label pad token id.
padding_value: The padding value to use for the concatenated inputs_ids.
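For reference, the concatenation described here could be sketched roughly as follows (assuming decoder-only inputs and right padding; an illustration, not the exact code in herm/dpo.py):

    import torch
    import torch.nn.functional as F

    def concatenated_inputs_sketch(batch, padding_value=0):
        # Pad chosen and rejected input ids to a common length, then stack them
        # along the batch dimension, giving shape (2 * batch_size, max_len).
        chosen = batch["chosen_input_ids"]
        rejected = batch["rejected_input_ids"]
        max_len = max(chosen.shape[1], rejected.shape[1])
        chosen = F.pad(chosen, (0, max_len - chosen.shape[1]), value=padding_value)
        rejected = F.pad(rejected, (0, max_len - rejected.shape[1]), value=padding_value)
        return {"concatenated_input_ids": torch.cat([chosen, rejected], dim=0)}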
2 changes: 1 addition & 1 deletion herm/models/openbmb.py
@@ -27,7 +27,7 @@ def __init__(self, task, model, tokenizer):
self.tokenizer = tokenizer

def __call__(self, samples, **kwargs):
batch_size = kwargs.get("batch_size", 1)
_ = kwargs.get("batch_size", 1)
truncation = kwargs.get("truncation", True)
padding = kwargs.get("padding", True)
max_length = kwargs.get("max_length", 2048)
5 changes: 4 additions & 1 deletion herm/models/shp.py
@@ -59,7 +59,10 @@ def __call__(self, candidates_A: List[List[Dict]], candidates_B: List[List[Dict]
Pass it into the model, decide on winner.
From the model readme:
>> input_text = "POST: Instacart gave me 50 pounds of limes instead of 5 pounds... what the hell do I do with 50 pounds of limes? I've already donated a bunch and gave a bunch away. I'm planning on making a bunch of lime-themed cocktails, but... jeez. Ceviche? \n\n RESPONSE A: Lime juice, and zest, then freeze in small quantities.\n\n RESPONSE B: Lime marmalade lol\n\n Which response is better? RESPONSE"
>> input_text = "POST: Instacart gave me 50 pounds of limes instead of 5 pounds...
what the hell do I do with 50 pounds of limes? I've already donated a bunch and gave a bunch away.
I'm planning on making a bunch of lime-themed cocktails, but... jeez. Ceviche? \n\n RESPONSE A: Lime juice,
then freeze in small quantities.\n\n RESPONSE B: Lime marmalade lol\n\n Which response is better? RESPONSE"
>> x = tokenizer([input_text], return_tensors='pt').input_ids.to(device)
>> y = model.generate(x, max_new_tokens=1)
>> tokenizer.batch_decode(y, skip_special_tokens=True)
73 changes: 26 additions & 47 deletions scripts/run_dpo.py
@@ -18,36 +18,26 @@
import os
import sys

import numpy as np
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from datasets import load_dataset
from fastchat.conversation import get_conv_template
from huggingface_hub import upload_file
from huggingface_hub import HfApi
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl.trainer.utils import DPODataCollatorWithPadding

from herm import DPOInference, prepare_dialogue, prepare_dialogue_from_tokenizer
from herm import DPOInference, load_eval_dataset

# get token from HF_TOKEN env variable, but if it doesn't exist pass none
HF_TOKEN = os.getenv("HF_TOKEN", None)
api = HfApi(token=HF_TOKEN)

# data repo to upload results
EVAL_REPO = "ai2-rlhf-collab/rm-benchmark-results"
EVAL_SUBSETS = [
"alpacaeval-easy",
"alpacaeval-hard",
"alpacaeval-length",
"llmbar-adver-GPTInst",
"llmbar-adver-GPTOut",
"llmbar-adver-manual",
"llmbar-adver-neighbor",
"llmbar-natural",
"mt-bench-easy",
"mt-bench-hard",
"mt-bench-med",
"refusals-dangerous",
"refusals-offensive",
]
PREFS_REPO = "ai2-rlhf-collab/rm-testset-results"


def get_args():
@@ -64,6 +54,9 @@ def get_args():
parser.add_argument("--direct_load", action="store_true", help="directly load model instead of pipeline")
parser.add_argument("--do_not_save", action="store_true", help="do not save results to hub (for debugging)")
parser.add_argument("--batch_size", type=int, default=64, help="batch size for inference")
parser.add_argument(
"--pref_sets", action="store_true", help="run on common preference sets instead of our custom eval set"
)
args = parser.parse_args()
return args

@@ -96,36 +89,18 @@ def main():
conv = get_conv_template(chat_template)

############################
# Load dataset from ai2-rlhf-collab/rm-benchmark-dev, "filtered" split
# Load dataset
############################
logger.info("*** Load dataset ***")
raw_dataset = load_dataset("ai2-rlhf-collab/rm-benchmark-dev", split="filtered")

tokenizer_path = args.tokenizer if args.tokenizer else args.model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# if tokenizer.chat_template exists, use that
if hasattr(tokenizer, "chat_template"):
ex = prepare_dialogue_from_tokenizer(raw_dataset[0], tokenizer)
# docs https://huggingface.co/docs/transformers/main/en/chat_templating
# double up to bypass some weid bug
dataset = raw_dataset.map(
prepare_dialogue_from_tokenizer,
fn_kwargs={"tokenizer": tokenizer},
)
dataset = dataset.map(
prepare_dialogue_from_tokenizer,
fn_kwargs={"tokenizer": tokenizer},
)
# else use FastChat to get chat template
else:
dataset = raw_dataset.map(
prepare_dialogue,
fn_kwargs={"dialogue_template": conv},
)
dataset = dataset.map(
prepare_dialogue,
fn_kwargs={"dialogue_template": conv},
)
dataset, subsets = load_eval_dataset(
core_set=not args.pref_sets,
conv=conv,
tokenizer=tokenizer,
logger=logger,
keep_columns=["text_chosen", "text_rejected"],
)

############################
# Load reward model pipeline
@@ -192,14 +167,18 @@ def main():
for chosen, rejected in zip(score_chosen, score_rejected)
]

############################
# Print & process results
############################
# add column for results for easy printing
out_dataset = dataset.add_column("results", results)

results = {}
results["model"] = args.model
results["chat_template"] = args.chat_template
# print per subset and log into results file
for subset in EVAL_SUBSETS:
present_subsets = np.unique(subsets)
for subset in present_subsets:
subset_dataset = out_dataset.filter(lambda example: example["subset"] == subset)
num_correct = sum(subset_dataset["results"])
num_total = len(subset_dataset["results"])
@@ -227,10 +206,10 @@ def main():

# Upload results as json
if not args.do_not_save:
scores_url = upload_file(
scores_url = api.upload_file(
path_or_fileobj=path,
path_in_repo=f"data/{args.model}.json",
repo_id=EVAL_REPO,
repo_id=EVAL_REPO if not args.pref_sets else PREFS_REPO, # push to correct results repo
repo_type="dataset",
commit_message=f"Add reward model scores for model {args.model}",
)
5 changes: 3 additions & 2 deletions scripts/run_rm.py
@@ -126,14 +126,15 @@ def main():
conv = get_conv_template(chat_template)

############################
# Load dataset from ai2-rlhf-collab/rm-benchmark-dev, "filtered" split
# Load dataset
############################
logger.info("*** Load dataset ***")
tokenizer_path = args.tokenizer if args.tokenizer else args.model
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
dataset, subsets = load_eval_dataset(
core_set=not args.pref_sets,
conv=conv,
custom_dialogue_formatting=custom_dialogue,
tokenizer=tokenizer,
logger=logger,
keep_columns=["text_chosen", "text_rejected"],
@@ -192,7 +193,7 @@ def main():
# first, handle custom pipelines that we must batch normally
if not args.direct_load or pipeline_builder == pipeline:
logger.info("*** Running forward pass via built in pipeline abstraction ***")
# this setup can be optimized slightly with one pipeline call, I just find the logic here more failsafe on correct indexing
# this setup can be optimized slightly with one pipeline call
# prepare for inference
reward_pipe = accelerator.prepare(reward_pipe)

57 changes: 15 additions & 42 deletions setup.py
@@ -11,49 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

from setuptools import find_packages, setup

_deps = [
"accelerate",
"bitsandbytes",
"black==23.1.0",
"datasets",
"flake8>=6.0",
"fschat[model_worker,webui]",
"huggingface_hub",
"isort>=5.12.0",
"pytest",
"scipy",
"tokenizers",
"transformers",
"trl>=0.7.7",
]
deps = {b: a for a, b in (re.findall(r"^(([^!=<>~ \[\]]+)(?:\[[^\]]+\])?(?:[!=<>~ ].*)?$)", x)[0] for x in _deps)}


def deps_list(*pkgs):
return [deps[pkg] for pkg in pkgs]


extras = {}
extras["quality"] = deps_list("black", "isort", "flake8")
extras["tests"] = deps_list("pytest")

install_requires = [
deps["accelerate"],
deps["bitsandbytes"],
deps["datasets"],
deps["fschat"],
deps["huggingface_hub"],
deps["scipy"],
deps["tokenizers"],
deps["transformers"],
deps["trl"],
]


setup(
name="herm",
version="0.1.0.dev",
@@ -71,6 +31,19 @@ def deps_list(*pkgs):
"Operating System :: OS Independent",
],
python_requires=">=3.10",
package_dir={"": "herm"},
install_requires=install_requires,
install_requires=[
"accelerate",
"bitsandbytes",
"black==23.1.0",
"datasets",
"flake8>=6.0",
"fschat[model_worker,webui]",
"huggingface_hub",
"isort>=5.12.0",
"pytest",
"scipy",
"tokenizers",
"transformers",
"trl>=0.7.7",
],
)
2 changes: 1 addition & 1 deletion tests/test_data.py
@@ -17,7 +17,7 @@
from fastchat.conversation import get_conv_template
from transformers import AutoTokenizer

from herm import load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer
from herm import prepare_dialogue, prepare_dialogue_from_tokenizer


class PrepareDialoguesTest(unittest.TestCase):
