
Commit cd714f7

some reorganization, to ready for encoder / decoder
1 parent 126dbc7 commit cd714f7

File tree

8 files changed (+300 -269 lines)


README.md

+7 -3

@@ -8,15 +8,19 @@ Implementation of RLHF (Reinforcement Learning with Human Feedback) on top of th
 
 If you are interested in replicating something like ChatGPT out in the open, please consider joining <a href="https://discord.gg/xBPBXfcFHd">Laion <img alt="Join us on Discord" src="https://img.shields.io/discord/823813159592001537?color=5865F2&logo=discord&logoColor=white"></a>
 
-This repository has gone viral without my permission. Next time, if you are promoting my unfinished repositories (notice the work in progress flag) for twitter engagement or eyeballs, at least (1) do your research or (2) be totally transparent with your readers about the capacity of the repository without resorting to clickbait. (1) I was not the first, CarperAI had been working on RLHF months before, link below. (2) There is no trained model. This is just the ship and overall map. We still need millions of dollars of compute + data to sail to the correct point in high dimensional parameter space. Even then, you need professional sailors (like Robin Rombach of Stable Diffusion fame) to actually guide the ship through turbulent times to that point.
+## FAQ
+
+- Does this contain a model for inference?
+
+There is no trained model. This is just the ship and overall map. We still need millions of dollars of compute + data to sail to the correct point in high dimensional parameter space. Even then, you need professional sailors (like Robin Rombach of Stable Diffusion fame) to actually guide the ship through turbulent times to that point.
 
 ## Community
 
-<a href="https://carper.ai/">CarperAI</a> had been working on <a href="https://github.com/CarperAI/trlx">an RLHF framework</a> for large language models
+<a href="https://carper.ai/">CarperAI</a> had been working on <a href="https://github.com/CarperAI/trlx">an RLHF framework</a> for large language models for many months prior to the release of ChatGPT.
 
 <a href="https://www.youtube.com/watch?v=sswA4j_IUxg">Yannic Kilcher</a> is also working on an <a href="https://github.com/LAION-AI/Open-Assistant">open sourced implementation</a>
 
-<a href="https://www.youtube.com/watch?v=SWwQ3k-DWyo">AI Coffeebreak w/ Letitia</a> | <a href="https://www.youtube.com/watch?v=NpmnWgQgcsA">Code Emporium</a>
+<a href="https://www.youtube.com/watch?v=SWwQ3k-DWyo">AI Coffeebreak w/ Letitia</a> | <a href="https://www.youtube.com/watch?v=NpmnWgQgcsA">Code Emporium</a> | <a href="https://www.youtube.com/watch?v=_MPJ3CyDokU">Code Emporium Part 2</a>
 
 ## Appreciation

palm_rlhf_pytorch/__init__.py

+3 -2

@@ -1,2 +1,3 @@
-from palm_rlhf_pytorch.palm_rlhf_pytorch import PaLM, RewardModel, ActorCritic
-from palm_rlhf_pytorch.ppo import RLHFTrainer
+from palm_rlhf_pytorch.palm import PaLM
+from palm_rlhf_pytorch.reward import RewardModel
+from palm_rlhf_pytorch.ppo import RLHFTrainer, ActorCritic

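The package root keeps re-exporting the same public names after the split; only the internal module paths change. A minimal sketch of the resulting import surface (illustrative only, not part of the diff):

```python
# sketch: downstream code can keep importing from the package root,
# since __init__.py re-exports the classes from their new module locations
from palm_rlhf_pytorch import PaLM, RewardModel, ActorCritic, RLHFTrainer

# or, equivalently, straight from the modules touched by this commit:
# from palm_rlhf_pytorch.palm import PaLM
# from palm_rlhf_pytorch.reward import RewardModel
# from palm_rlhf_pytorch.ppo import RLHFTrainer, ActorCritic
```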
palm_rlhf_pytorch/palm_rlhf_pytorch.py → palm_rlhf_pytorch/palm.py (renamed)

+1 -257

@@ -15,7 +15,7 @@
 from einops import rearrange, repeat, reduce, pack, unpack
 from einops.layers.torch import Rearrange, Reduce
 
-from palm_rlhf_pytorch.utils import top_p, top_k, masked_mean, gumbel_sample
+from palm_rlhf_pytorch.utils import top_p, top_k, masked_mean, gumbel_sample, eval_decorator
 from palm_rlhf_pytorch.lora import LoRA
 
 # functions and decorators
@@ -29,15 +29,6 @@ def default(val, d):
 def identity(t, *args, **kwargs):
     return t
 
-def eval_decorator(fn):
-    def inner(self, *args, **kwargs):
-        was_training = self.training
-        self.eval()
-        out = fn(self, *args, **kwargs)
-        self.train(was_training)
-        return out
-    return inner
-
 # normalization
 # they use layernorm without bias, something that pytorch does not offer
 
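The eval_decorator removed in the hunk above is not dropped from the project: per the updated import, it now comes from palm_rlhf_pytorch.utils. A minimal sketch of how the decorator is meant to be used (the Sampler module here is hypothetical, for illustration only):

```python
import torch
import torch.nn as nn
from palm_rlhf_pytorch.utils import eval_decorator  # new home of the helper after this commit

class Sampler(nn.Module):  # hypothetical module, not part of the repository
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)
        self.dropout = nn.Dropout(0.5)

    @torch.no_grad()
    @eval_decorator  # runs the method in eval mode, then restores the previous train/eval state
    def generate(self, n):
        x = torch.randn(n, 8)
        return self.dropout(self.proj(x))  # dropout is inactive here because of eval_decorator

sampler = Sampler()
sampler.train()
out = sampler.generate(4)
assert sampler.training  # original training state is restored after generation
```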
@@ -520,250 +511,3 @@ def forward(
 
         logits = rearrange(logits, 'b n c -> b c n')
         return F.cross_entropy(logits, labels, ignore_index = self.cross_entropy_ignore_index)
-
-# Reward Model - PaLM with a scalar head
-
-@beartype
-class RewardModel(nn.Module):
-    def __init__(
-        self,
-        palm: PaLM,
-        dropout = 0.1,
-        num_binned_output = 0.,
-        use_lora = True,
-        lora_r = 8,
-        reward_lora_scope = 'reward',
-    ):
-        super().__init__()
-
-        self.palm = copy.deepcopy(palm)
-        self.palm.set_dropout(dropout)
-
-        self.reward_lora_scope = reward_lora_scope if use_lora else None
-
-        if exists(self.reward_lora_scope):
-            self.palm.add_finetune_params(reward_lora_scope, lora_r = lora_r)
-
-        dim = palm.dim
-
-        self.binned_output = num_binned_output > 1
-
-        self.prompt_embed = nn.Parameter(torch.zeros(1, 1, dim))
-        self.response_embed = nn.Parameter(torch.zeros(1, 1, dim))
-
-        if self.binned_output:
-            self.to_pred = nn.Linear(dim, num_binned_output)
-        else:
-            self.to_pred = nn.Sequential(
-                nn.Linear(dim, 1, bias = False),
-                Rearrange('... 1 -> ...')
-            )
-
-    def load(self, path):
-        path = Path(path)
-        assert path.exists()
-        self.load_state_dict(torch.load(str(path)))
-
-    def finetune_parameters(self):
-        return [
-            *self.to_pred.parameters(),
-            *(self.palm.finetune_parameters(self.reward_lora_scope) if exists(self.reward_lora_scope) else self.palm.parameters())
-        ]
-
-    def forward(
-        self,
-        x,
-        mask = None,
-        prompt_mask = None,
-        labels = None,
-        sample = False,
-        sample_temperature = 1.,
-        disable_lora = False
-    ):
-        # reward model should have an understanding of which section is prompt, and which section is response
-
-        extra_embed = None
-
-        if exists(prompt_mask):
-            extra_embed = torch.where(
-                rearrange(prompt_mask, 'b n -> b n 1'),
-                self.prompt_embed,
-                self.response_embed
-            )
-
-        # get embeddings from palm
-
-        embeds = self.palm(
-            x,
-            extra_embed = extra_embed,
-            return_only_embedding = True,
-            disable_lora = disable_lora,
-            finetune_scope = self.reward_lora_scope
-        )
-
-        pooled = masked_mean(embeds, mask, dim = 1)
-        pred = self.to_pred(pooled)
-
-        if sample and self.binned_output:
-            assert not exists(labels)
-            pred = gumbel_sample(pred, temperature = sample_temperature, dim = -1)
-
-        if not exists(labels):
-            return pred
-
-        if not self.binned_output:
-            return F.mse_loss(pred, labels)
-
-        return F.cross_entropy(pred, labels)
-
-# PaLM with actor and critic heads
-
-PPOActionCriticReturn = namedtuple('PPOActionCriticReturn', [
-    'actions',
-    'sequence',
-    'mask',
-    'prompt_mask',
-    'action_logits',
-    'values'
-])
-
-@beartype
-class ActorCritic(nn.Module):
-    def __init__(
-        self,
-        palm: PaLM,
-        critic_palm: Optional[PaLM] = None,
-        pooled_values = False,
-        actor_lora = True,
-        critic_lora = True,
-        actor_lora_r = 8,
-        critic_lora_r = 8,
-        actor_lora_scope = 'actor',
-        critic_lora_scope = 'critic',
-        actor_dropout = 0.,
-        critic_dropout = 0.
-    ):
-        super().__init__()
-        self.actor_palm = palm
-
-        self.critic_palm = critic_palm
-
-        if not exists(self.critic_palm):
-            self.critic_palm = copy.deepcopy(palm)
-
-        self.actor_palm.set_dropout(actor_dropout)
-        self.critic_palm.set_dropout(critic_dropout)
-
-        self.actor_lora = actor_lora
-        self.critic_lora = critic_lora
-
-        self.actor_lora_scope = actor_lora_scope if actor_lora else None
-        self.critic_lora_scope = critic_lora_scope if critic_lora else None
-
-        if self.actor_lora:
-            self.actor_palm.add_finetune_params(actor_lora_scope, lora_r = actor_lora_r)
-
-        if self.critic_lora:
-            self.critic_palm.add_finetune_params(critic_lora_scope, lora_r = critic_lora_r)
-
-        self.pooled_values = pooled_values
-        self.value_head = nn.Sequential(
-            nn.Linear(palm.dim, 1),
-            Rearrange('... 1 -> ...')
-        )
-
-        nn.init.zeros_(self.value_head[0].bias)
-        nn.init.orthogonal_(self.value_head[0].weight, gain = math.sqrt(2))
-
-    def actor_parameters(self):
-        if not self.actor_lora:
-            return self.actor_palm.parameters()
-
-        return [
-            *self.actor_palm.finetune_parameters(self.actor_lora_scope)
-        ]
-
-    def critic_parameters(self):
-        if not self.actor_lora:
-            return [*self.critic_palm.parameters(), *self.value_head.parameters()]
-
-        return [
-            *self.critic_palm.finetune_parameters(self.critic_lora_scope),
-            *self.value_head.parameters()
-        ]
-
-    @torch.no_grad()
-    @eval_decorator
-    def generate(
-        self,
-        state,
-        max_seq_len,
-        eos_token = None,
-        return_values = False,
-        **kwargs
-    ):
-        actions = self.actor_palm.generate(
-            max_seq_len,
-            prompt = state,
-            eos_token = eos_token,
-            finetune_scope = self.actor_lora_scope,
-            use_tqdm = True,
-            **kwargs
-        )
-
-        sequence = torch.cat((state, actions), dim = -1)
-        action_len = actions.shape[-1]
-        state_len = state.shape[-1]
-
-        prompt_mask = torch.arange(sequence.shape[-1], device = state.device) < state_len
-        prompt_mask = repeat(prompt_mask, 'n -> b n', b = sequence.shape[0])
-
-        action_mask = ~prompt_mask
-
-        mask = None
-        if exists(eos_token):
-            mask = ((sequence == eos_token).cumsum(dim = -1) == 0)
-            mask = F.pad(mask, (1, -1), value = True) # include eos token
-            action_mask &= mask
-
-        action_logits, value = self.forward(
-            sequence,
-            mask = action_mask,
-            return_values = return_values
-        )
-
-        return PPOActionCriticReturn(
-            actions,
-            sequence,
-            mask,
-            prompt_mask,
-            action_logits,
-            value
-        )
-
-    def forward(
-        self,
-        x,
-        mask = None,
-        return_values = True
-    ):
-        action_logits = self.actor_palm(
-            x,
-            finetune_scope = self.actor_lora_scope
-        )
-
-        if not return_values:
-            return action_logits, None
-
-        critic_embeds = self.critic_palm(
-            x,
-            return_only_embedding = True,
-            finetune_scope = self.critic_lora_scope
-        )
-
-        if self.pooled_values:
-            critic_embeds = masked_mean(critic_embeds, mask, dim = 1)
-
-        values = self.value_head(critic_embeds)
-
-        return action_logits, values
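The RewardModel and ActorCritic classes removed above are relocated rather than deleted: the new __init__.py exposes them from palm_rlhf_pytorch/reward.py and palm_rlhf_pytorch/ppo.py (among the files changed in this commit). A minimal usage sketch against the signatures shown in the removed code; the token counts, sequence lengths and PaLM hyperparameters are made up for illustration:

```python
import torch
from palm_rlhf_pytorch import PaLM, RewardModel, ActorCritic

palm = PaLM(num_tokens = 20000, dim = 512, depth = 12)  # toy hyperparameters, assumed

# reward model: PaLM body with a scalar head, told which tokens are prompt vs response
reward_model = RewardModel(palm)
seq         = torch.randint(0, 20000, (1, 1024))
prompt_mask = torch.zeros(1, 1024).bool()
prompt_mask[:, :512] = True                            # first half prompt, second half response
reward = reward_model(seq, prompt_mask = prompt_mask)  # one scalar per sequence

# actor-critic: PaLM actor plus a (possibly separate) PaLM critic with a value head
actor_critic = ActorCritic(palm = palm)
prompt = torch.randint(0, 20000, (1, 512))
actions, sequence, mask, prompt_mask, action_logits, values = actor_critic.generate(
    prompt,
    max_seq_len = 1024,    # as in the removed ActorCritic.generate signature
    return_values = True
)
```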
