forked from jsksxs360/How-to-use-Transformers
Showing 10 changed files with 1,337,208 additions and 0 deletions.
data/china-people-daily-ner-corpus/example.dev: 112,188 additions & 0 deletions (large diff not rendered)
data/china-people-daily-ner-corpus/example.test: 223,833 additions & 0 deletions (large diff not rendered)
data/china-people-daily-ner-corpus/example.train: 1,000,044 additions & 0 deletions (large diff not rendered)
@@ -0,0 +1,53 @@
import argparse

def parse_args():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )
    parser.add_argument("--train_file", default=None, type=str, required=True, help="The input training file.")
    parser.add_argument("--dev_file", default=None, type=str, required=True, help="The input evaluation file.")
    parser.add_argument("--test_file", default=None, type=str, required=True, help="The input testing file.")

    parser.add_argument("--model_type",
        default="bert", type=str, required=True
    )
    parser.add_argument("--model_checkpoint",
        default="bert-large-cased/", type=str, required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument("--max_seq_length", default=512, type=int, required=True)

    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_test", action="store_true", help="Whether to run eval on the test set.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to save predicted labels.")

    # Other parameters
    parser.add_argument("--use_ffnn_layer", action="store_true", help="Whether to add an FFNN before the classifier.")
    parser.add_argument("--ffnn_size", default=-1, type=int, help="The size of the FFNN layer (-1 means use the hidden size).")

    parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--crf_learning_rate", default=5e-5, type=float, help="The initial learning rate for the CRF.")
    parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.")
    parser.add_argument("--batch_size", default=4, type=int)
    parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization.")

    parser.add_argument("--adam_beta1", default=0.9, type=float,
        help="Beta1 for Adam optimizer."
    )
    parser.add_argument("--adam_beta2", default=0.98, type=float,
        help="Beta2 for Adam optimizer."
    )
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
        help="Epsilon for Adam optimizer."
    )
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training."
    )
    parser.add_argument("--weight_decay", default=0.01, type=float,
        help="Weight decay if we apply some."
    )
    args = parser.parse_args()
    return args
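
For reference, a minimal sketch of how these flags might be exercised (not part of this commit; it assumes the parser above is saved as arg.py, and the entry-point name and file paths are illustrative):

import sys
from arg import parse_args  # assumption: the parser above lives in arg.py

sys.argv = [
    "run_ner.py",  # hypothetical script name, only used by argparse for usage text
    "--output_dir", "./output",
    "--train_file", "data/china-people-daily-ner-corpus/example.train",
    "--dev_file", "data/china-people-daily-ner-corpus/example.dev",
    "--test_file", "data/china-people-daily-ner-corpus/example.test",
    "--model_type", "bert",
    "--model_checkpoint", "bert-base-chinese",
    "--max_seq_length", "512",
    "--do_train",
]
args = parse_args()
print(args.learning_rate, args.batch_size)  # unset options fall back to the defaults: 1e-05 4
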
@@ -0,0 +1,67 @@
from torch.utils.data import Dataset, DataLoader
import numpy as np

CATEGORIES = ['LOC', 'ORG', 'PER']

class PeopleDaily(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        Data = {}
        with open(data_file, 'rt', encoding='utf-8') as f:
            for idx, line in enumerate(f.read().split('\n\n')):
                if not line:
                    break
                sentence, labels = '', []
                for i, item in enumerate(line.split('\n')):
                    char, tag = item.split(' ')
                    sentence += char
                    if tag.startswith('B'):
                        labels.append([i, i, char, tag[2:]])  # Remove the B- or I-
                    elif tag.startswith('I'):
                        labels[-1][1] = i
                        labels[-1][2] += char
                Data[idx] = {
                    'sentence': sentence,
                    'labels': labels
                }
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def get_dataLoader(args, dataset, tokenizer, batch_size=None, shuffle=False):

    def collate_fn(batch_samples):
        batch_sentence, batch_labels = [], []
        for sample in batch_samples:
            batch_sentence.append(sample['sentence'])
            batch_labels.append(sample['labels'])
        batch_inputs = tokenizer(
            batch_sentence,
            max_length=args.max_seq_length,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        batch_label = np.zeros(batch_inputs['input_ids'].shape, dtype=int)
        for s_idx, sentence in enumerate(batch_sentence):
            encoding = tokenizer(sentence, max_length=args.max_seq_length, truncation=True)
            for char_start, char_end, _, tag in batch_labels[s_idx]:
                token_start = encoding.char_to_token(char_start)
                token_end = encoding.char_to_token(char_end)
                if token_start is None or token_end is None:  # entity lost to truncation
                    continue
                batch_label[s_idx][token_start] = args.label2id[f"B-{tag}"]
                batch_label[s_idx][token_start+1:token_end+1] = args.label2id[f"I-{tag}"]
        return {
            'batch_inputs': batch_inputs,
            'labels': batch_label
        }

    return DataLoader(dataset, batch_size=(batch_size if batch_size else args.batch_size), shuffle=shuffle,
                      collate_fn=collate_fn)
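
The collate function above reads args.label2id, which is not defined in this file. A minimal sketch of how it could be wired up (an assumption, not part of this commit; the checkpoint name and args fields are illustrative), using the O / B- / I- scheme implied by CATEGORIES:

from argparse import Namespace
from transformers import AutoTokenizer

# Build id2label / label2id from CATEGORIES with an O / B-xxx / I-xxx scheme.
id2label = {0: 'O'}
for c in CATEGORIES:
    id2label[len(id2label)] = f"B-{c}"
    id2label[len(id2label)] = f"I-{c}"
label2id = {v: k for k, v in id2label.items()}

# Hypothetical minimal args object; in the full pipeline these fields come from parse_args().
args = Namespace(max_seq_length=512, batch_size=4, label2id=label2id)

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")  # assumed checkpoint; a fast tokenizer is needed for char_to_token
train_data = PeopleDaily("data/china-people-daily-ner-corpus/example.train")
train_dataloader = get_dataLoader(args, train_data, tokenizer, shuffle=True)
batch = next(iter(train_dataloader))
print(batch['batch_inputs']['input_ids'].shape, batch['labels'].shape)
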
@@ -0,0 +1,70 @@
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import BertPreTrainedModel, BertModel
from ..tools import FullyConnectedLayer, CRF

class BertForNER(BertPreTrainedModel):
    def __init__(self, config, args):
        super().__init__(config)
        self.num_labels = args.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.use_ffnn_layer = args.use_ffnn_layer
        if self.use_ffnn_layer:
            self.ffnn_size = args.ffnn_size if args.ffnn_size != -1 else config.hidden_size
            self.mlp = FullyConnectedLayer(config, config.hidden_size, self.ffnn_size, config.hidden_dropout_prob)
        else:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.ffnn_size if args.use_ffnn_layer else config.hidden_size, self.num_labels)
        self.post_init()

    def forward(self, batch_inputs, labels=None):
        bert_output = self.bert(**batch_inputs)
        sequence_output = bert_output.last_hidden_state
        if self.use_ffnn_layer:
            sequence_output = self.mlp(sequence_output)
        else:
            sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            attention_mask = batch_inputs.get('attention_mask')
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return loss, logits

class BertCrfForNER(BertPreTrainedModel):
    def __init__(self, config, args):
        super().__init__(config)
        self.num_labels = args.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.use_ffnn_layer = args.use_ffnn_layer
        if self.use_ffnn_layer:
            self.ffnn_size = args.ffnn_size if args.ffnn_size != -1 else config.hidden_size
            self.mlp = FullyConnectedLayer(config, config.hidden_size, self.ffnn_size, config.hidden_dropout_prob)
        else:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.ffnn_size if args.use_ffnn_layer else config.hidden_size, self.num_labels)
        self.crf = CRF(num_tags=self.num_labels, batch_first=True)
        self.post_init()

    def forward(self, batch_inputs, labels=None):
        bert_output = self.bert(**batch_inputs)
        sequence_output = bert_output.last_hidden_state
        if self.use_ffnn_layer:
            sequence_output = self.mlp(sequence_output)
        else:
            sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = -1 * self.crf(emissions=logits, tags=labels, mask=batch_inputs.get('attention_mask'))
        return loss, logits
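
A minimal sketch of loading and calling BertForNER (an assumption, not part of this commit; it reuses label2id and batch from the data-loading sketch above, assumes this file is importable alongside the tools module that provides FullyConnectedLayer and CRF, and uses bert-base-chinese as an illustrative checkpoint):

import torch
from argparse import Namespace
from transformers import AutoConfig

args = Namespace(num_labels=len(label2id), use_ffnn_layer=False, ffnn_size=-1)
checkpoint = "bert-base-chinese"  # assumed checkpoint

config = AutoConfig.from_pretrained(checkpoint)
# from_pretrained forwards the extra args keyword to BertForNER.__init__(config, args);
# the classifier head is newly initialized, so a warning about unused weights is expected.
model = BertForNER.from_pretrained(checkpoint, config=config, args=args)
model.eval()

with torch.no_grad():
    loss, logits = model(batch['batch_inputs'], labels=torch.tensor(batch['labels'], dtype=torch.long))
print(loss.item(), logits.shape)  # logits: (batch_size, seq_len, num_labels)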