Skip to content

Commit

Permalink
Upgrade Transformers to v4.47.x (#776)
Browse files Browse the repository at this point in the history
Co-authored-by: Leon Engländer <leon.englaender@gmail.com>
  • Loading branch information
calpt and lenglaender authored Jan 8, 2025
1 parent d6054cb commit 7c2357f
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/adapter_docs_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
fetch-depth: 0
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: "3.10"
- name: Install
run: |
pip install setuptools==57.4.0
Expand Down
16 changes: 8 additions & 8 deletions .github/workflows/tests_torch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ jobs:
submodules: true
- uses: actions/setup-python@v2
with:
python-version: 3.8
- uses: actions/cache@v2
python-version: "3.10"
- uses: actions/cache@v4
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
Expand All @@ -53,8 +53,8 @@ jobs:
submodules: true
- uses: actions/setup-python@v2
with:
python-version: 3.8
- uses: actions/cache@v2
python-version: "3.10"
- uses: actions/cache@v4
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
Expand All @@ -76,8 +76,8 @@ jobs:
submodules: true
- uses: actions/setup-python@v2
with:
python-version: 3.8
- uses: actions/cache@v2
python-version: "3.10"
- uses: actions/cache@v4
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
Expand All @@ -99,8 +99,8 @@ jobs:
submodules: true
- uses: actions/setup-python@v2
with:
python-version: 3.8
- uses: actions/cache@v2
python-version: "3.10"
- uses: actions/cache@v4
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
Expand Down
2 changes: 1 addition & 1 deletion hf_transformers
Submodule hf_transformers updated 679 files
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
"timeout-decorator",
"torch",
"torchvision",
"transformers~=4.46.3",
"transformers~=4.47.1",
]


Expand Down
63 changes: 38 additions & 25 deletions src/adapters/models/deberta/modeling_deberta.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.models.deberta.modeling_deberta import (
DebertaOutput,
DebertaSelfOutput,
DisentangledSelfAttention,
XSoftmax,
scaled_size_sqrt,
)

from ...composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel
Expand Down Expand Up @@ -95,71 +96,83 @@ def forward(
"""
# >>> START AH Changes <<<
attention_mask = prefix_attention_mask(attention_mask, dim=3, prefix_value=1) # type: ignore
attention_mask = prefix_attention_mask(attention_mask, dim=2, prefix_value=1) # type: ignore
# >>> END AH Changes <<<

if query_states is None:
qp = self.in_proj(hidden_states) # .split(self.all_head_size, dim=-1)
query_layer, key_layer, value_layer = self.transpose_for_scores(qp).chunk(3, dim=-1)
else:

def linear(w, b, x):
if b is not None:
return torch.matmul(x, w.t()) + b.t()
else:
return torch.matmul(x, w.t()) # + b.t()

ws = self.in_proj.weight.chunk(self.num_attention_heads * 3, dim=0)
qkvw = [torch.cat([ws[i * 3 + k] for i in range(self.num_attention_heads)], dim=0) for k in range(3)]
qkvb = [None] * 3

q = linear(qkvw[0], qkvb[0], query_states.to(dtype=qkvw[0].dtype))
k, v = [linear(qkvw[i], qkvb[i], hidden_states.to(dtype=qkvw[i].dtype)) for i in range(1, 3)]
q = torch.matmul(qkvw[0], query_states.t().to(dtype=qkvw[0].dtype))
k = torch.matmul(qkvw[1], hidden_states.t().to(dtype=qkvw[1].dtype))
v = torch.matmul(qkvw[2], hidden_states.t().to(dtype=qkvw[2].dtype))
query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q, k, v]]

# >>> START AH Changes <<<
query_layer, key_layer, value_layer = match_attn_matrices_for_parallel(query_layer, key_layer, value_layer)
(attention_mask,) = adjust_tensors_for_parallel(query_layer, attention_mask)
# >>> END AH Changes <<<

query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :])
value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :])

# >>> START AH Changes <<<
orig_key_layer = key_layer # save this for relative attention
key_layer, value_layer, attention_mask = self.prefix_tuning(
key_layer, value_layer, hidden_states, attention_mask, False
)
(query_layer, orig_key_layer) = adjust_tensors_for_parallel(key_layer, query_layer, orig_key_layer)
# >>> END AH Changes <<<

rel_att = None
rel_att: int = 0
# Take the dot product between "query" and "key" to get the raw attention scores.
scale_factor = 1 + len(self.pos_att_type)
scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
scale = scaled_size_sqrt(query_layer, scale_factor)
query_layer = query_layer / scale.to(dtype=query_layer.dtype)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if self.relative_attention:

if self.relative_attention and rel_embeddings is not None and relative_pos is not None:
rel_embeddings = self.pos_dropout(rel_embeddings)
# >>> START AH Changes <<<
rel_att = self.disentangled_att_bias(
query_layer, orig_key_layer, relative_pos, rel_embeddings, scale_factor
)
# >>> END AH Changes <<<

if rel_att is not None:
rel_att_padded = torch.zeros_like(attention_scores)
rel_att_padded[:, :, :, -rel_att.size(-1) :] = rel_att
attention_scores = attention_scores + rel_att_padded
# >>> START AH Changes <<<
# The upstream Hugging Face code initialises rel_att to the integer 0 rather than None, so an `is not None` check would always pass.
# We therefore check whether rel_att is a tensor: if so, zero-pad it to the shape of attention_scores before adding it; otherwise add the scalar default directly.
if isinstance(rel_att, torch.Tensor):
rel_att_padded = torch.zeros_like(attention_scores)
rel_att_padded[:, :, :, -rel_att.size(-1) :] = rel_att
attention_scores = attention_scores + rel_att_padded
else:
attention_scores = attention_scores + rel_att
# >>> END AH Changes <<<

# bxhxlxd
if self.talking_head:
if self.head_logits_proj is not None:
attention_scores = self.head_logits_proj(attention_scores.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
attention_mask = attention_mask.bool()
attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min)
# bsz x height x length x dimension
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs.masked_fill(attention_mask, 0)

attention_probs = self.dropout(attention_probs)
if self.talking_head:
if self.head_weights_proj is not None:
attention_probs = self.head_weights_proj(attention_probs.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (-1,)
context_layer = context_layer.view(new_context_layer_shape)
if output_attentions:
return (context_layer, attention_probs)
else:
return context_layer
if not output_attentions:
return (context_layer, None)
return (context_layer, attention_probs)
41 changes: 30 additions & 11 deletions src/adapters/models/deberta_v2/modeling_deberta_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.models.deberta_v2.modeling_deberta_v2 import (
DebertaV2Output,
DebertaV2SelfOutput,
DisentangledSelfAttention,
XSoftmax,
scaled_size_sqrt,
)

from ...composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel
Expand Down Expand Up @@ -90,11 +91,15 @@ def forward(
The embedding of relative distances. It's a tensor of shape [\\(2 \\times
\\text{max_relative_positions}\\), *hidden_size*].
"""
# >>> START AH Changes <<<
attention_mask = prefix_attention_mask(attention_mask, dim=3, prefix_value=1) # type: ignore
attention_mask = prefix_attention_mask(attention_mask, dim=2, prefix_value=1) # type: ignore
# >>> END AH Changes <<<

if query_states is None:
query_states = hidden_states

# >>> START AH Changes <<<
query_layer = self.transpose_for_scores_extended(self.query_proj(query_states), self.num_attention_heads)
key_layer = self.transpose_for_scores_extended(self.key_proj(hidden_states), self.num_attention_heads)
value_layer = self.transpose_for_scores_extended(self.value_proj(hidden_states), self.num_attention_heads)
Expand All @@ -112,6 +117,7 @@ def forward(
key_layer = key_layer.contiguous().view(-1, key_layer.size(2), key_layer.size(-1))
value_layer = value_layer.contiguous().view(-1, value_layer.size(2), value_layer.size(-1))
orig_key_layer = orig_key_layer.contiguous().view(-1, orig_key_layer.size(2), orig_key_layer.size(-1))
# >>> END AH Changes <<<

rel_att = None
# Take the dot product between "query" and "key" to get the raw attention scores.
Expand All @@ -120,25 +126,39 @@ def forward(
scale_factor += 1
if "p2c" in self.pos_att_type:
scale_factor += 1
scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)
attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale.to(dtype=query_layer.dtype)
scale = scaled_size_sqrt(query_layer, scale_factor)
attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2) / scale.to(dtype=query_layer.dtype))
if self.relative_attention:
rel_embeddings = self.pos_dropout(rel_embeddings)
# >>> START AH Changes <<<
rel_att = self.disentangled_attention_bias(
query_layer, orig_key_layer, relative_pos, rel_embeddings, scale_factor
)
# >>> END AH Changes <<<

if rel_att is not None:
rel_att_padded = torch.zeros_like(attention_scores)
rel_att_padded[:, :, -rel_att.size(2) :] = rel_att
attention_scores = attention_scores + rel_att_padded
# >>> START AH Changes <<<
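# NOTE(review): in this method rel_att is initialised to None (not 0) and only becomes a tensor when self.relative_attention is set.
# We check whether rel_att is a tensor and, if so, zero-pad it to the shape of attention_scores before adding it.
# TODO confirm the else branch below is never reached with rel_att still None, since adding None to a tensor raises a TypeError.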
if isinstance(rel_att, torch.Tensor):
rel_att_padded = torch.zeros_like(attention_scores)
rel_att_padded[:, :, -rel_att.size(2) :] = rel_att
attention_scores = attention_scores + rel_att_padded
else:
attention_scores = attention_scores + rel_att
# >>> END AH Changes <<<

attention_scores = attention_scores
attention_scores = attention_scores.view(
-1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1)
)

attention_mask = attention_mask.bool()
attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min)
# bsz x num_attention_heads x query_length x key_length
attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs.masked_fill(attention_mask, 0)

attention_probs = self.dropout(attention_probs)
context_layer = torch.bmm(
attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer
Expand All @@ -150,7 +170,6 @@ def forward(
)
new_context_layer_shape = context_layer.size()[:-2] + (-1,)
context_layer = context_layer.view(new_context_layer_shape)
if output_attentions:
return (context_layer, attention_probs)
else:
return context_layer
if not output_attentions:
return (context_layer, None)
return (context_layer, attention_probs)

0 comments on commit 7c2357f

Please sign in to comment.