Commit 9b3caf5

remove encoder / decoder, add prompt lengths

1 parent cd714f7 commit 9b3caf5

File tree

3 files changed: +16 −4 lines


README.md  (+4 −3)
@@ -152,6 +152,7 @@ answer = trainer.generate(2048, prompt = prompts[0], num_samples = 10) # (<= 204
 - [ ] incorporate some learning points from Sparrow, given Letitia's video
 - [ ] simple web interface with django + htmx for collecting human feedback
 - [ ] equip with <a href="https://github.com/hazyResearch/flash-attention">the best attention</a>
+- [ ] consider <a href="https://www.anthropic.com/constitutional.pdf">RLAIF</a>

 ## Citations

@@ -185,8 +186,8 @@ answer = trainer.generate(2048, prompt = prompts[0], num_samples = 10) # (<= 204

 ```bibtex
 @inproceedings{Sun2022ALT,
-    title = {A Length-Extrapolatable Transformer},
-    author = {Yutao Sun and Li Dong and Barun Patra and Shuming Ma and Shaohan Huang and Alon Benhaim and Vishrav Chaudhary and Xia Song and Furu Wei},
-    year = {2022}
+    title = {A Length-Extrapolatable Transformer},
+    author = {Yutao Sun and Li Dong and Barun Patra and Shuming Ma and Shaohan Huang and Alon Benhaim and Vishrav Chaudhary and Xia Song and Furu Wei},
+    year = {2022}
 }
 ```

palm_rlhf_pytorch/reward.py  (+11 −0)
@@ -74,11 +74,22 @@ def forward(
         x,
         mask = None,
         prompt_mask = None,
+        prompt_lengths = None,
         labels = None,
         sample = False,
         sample_temperature = 1.,
         disable_lora = False
     ):
+
+        assert not (exists(prompt_mask) and exists(prompt_lengths))
+
+        # derive prompt mask from prompt lengths
+
+        if exists(prompt_lengths):
+            batch, seq_len = x.shape
+            arange = torch.arange(seq_len, device = x.device)
+            prompt_mask = repeat(arange, 'n -> b n', b = batch) < rearrange(prompt_lengths, 'b -> b 1')
+
         # reward model should have an understanding of which section is prompt, and which section is response

         extra_embed = None

setup.py  (+1 −1)

@@ -3,7 +3,7 @@
 setup(
   name = 'PaLM-rlhf-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.52',
+  version = '0.0.61',
   license='MIT',
   description = 'PaLM + Reinforcement Learning with Human Feedback - Pytorch',
   author = 'Phil Wang',
