Commit cb731b6
add ner model
jsksxs360 committed Sep 29, 2022
1 parent ea73b5e commit cb731b6
Showing 10 changed files with 1,337,208 additions and 0 deletions.
112,188 changes: 112,188 additions & 0 deletions data/china-people-daily-ner-corpus/example.dev

Large diffs are not rendered by default.

223,833 changes: 223,833 additions & 0 deletions data/china-people-daily-ner-corpus/example.test

Large diffs are not rendered by default.

1,000,044 changes: 1,000,044 additions & 0 deletions data/china-people-daily-ner-corpus/example.train

Large diffs are not rendered by default.
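The raw data diffs are not shown, but the loader in src/sequence_labeling_ner_cpd/data.py below expects the usual character-level BIO format: one "character tag" pair per line, separated by a single space, with a blank line between sentences and LOC/ORG/PER as the entity types. An illustrative (hypothetical, not copied from the corpus) snippet:

我 O
爱 O
北 B-LOC
京 I-LOC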

53 changes: 53 additions & 0 deletions src/sequence_labeling_ner_cpd/arg.py
@@ -0,0 +1,53 @@
import argparse

def parse_args():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )
    parser.add_argument("--train_file", default=None, type=str, required=True, help="The input training file.")
    parser.add_argument("--dev_file", default=None, type=str, required=True, help="The input evaluation file.")
    parser.add_argument("--test_file", default=None, type=str, required=True, help="The input testing file.")

    parser.add_argument("--model_type",
        default="bert", type=str, required=True
    )
    parser.add_argument("--model_checkpoint",
        default="bert-large-cased/", type=str, required=True,
        help="Path to a pretrained model or a model identifier from huggingface.co/models",
    )
    parser.add_argument("--max_seq_length", default=512, type=int, required=True)

    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_test", action="store_true", help="Whether to run eval on the test set.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to save predicted labels.")

    # Other parameters
    parser.add_argument("--use_ffnn_layer", action="store_true", help="Whether to add an FFNN before the classifier.")
    parser.add_argument("--ffnn_size", default=-1, type=int, help="The size of the FFNN layer (-1 means the model hidden size).")

    parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--crf_learning_rate", default=5e-5, type=float, help="The initial learning rate for the CRF.")
    parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.")
    parser.add_argument("--batch_size", default=4, type=int)
    parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization.")

    parser.add_argument("--adam_beta1", default=0.9, type=float,
        help="Beta1 for the Adam optimizer."
    )
    parser.add_argument("--adam_beta2", default=0.98, type=float,
        help="Beta2 for the Adam optimizer."
    )
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
        help="Epsilon for the Adam optimizer."
    )
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training."
    )
    parser.add_argument("--weight_decay", default=0.01, type=float,
        help="Weight decay if we apply some."
    )
    args = parser.parse_args()
    return args
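The runner script that consumes these flags is among the changed files not rendered in this view, so the following is only a minimal sketch that exercises parse_args() directly; the script name, import path, and flag values (e.g., bert-base-chinese for a Chinese corpus) are assumptions, not taken from this commit.

import sys
from arg import parse_args  # hypothetical import; the module lives at src/sequence_labeling_ner_cpd/arg.py

sys.argv = [
    "run_ner.py",  # hypothetical runner name
    "--output_dir", "output/",
    "--train_file", "data/china-people-daily-ner-corpus/example.train",
    "--dev_file", "data/china-people-daily-ner-corpus/example.dev",
    "--test_file", "data/china-people-daily-ner-corpus/example.test",
    "--model_type", "bert",
    "--model_checkpoint", "bert-base-chinese",  # assumption; the declared default is bert-large-cased/
    "--max_seq_length", "512",
    "--do_train",
]
args = parse_args()
print(args.learning_rate, args.batch_size)  # 1e-05 4 (defaults)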
67 changes: 67 additions & 0 deletions src/sequence_labeling_ner_cpd/data.py
@@ -0,0 +1,67 @@
from torch.utils.data import Dataset, DataLoader
import numpy as np

CATEGORIES = ['LOC', 'ORG', 'PER']

class PeopleDaily(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        Data = {}
        with open(data_file, 'rt', encoding='utf-8') as f:
            for idx, line in enumerate(f.read().split('\n\n')):
                if not line:
                    break
                sentence, labels = '', []
                for i, item in enumerate(line.split('\n')):
                    char, tag = item.split(' ')
                    sentence += char
                    if tag.startswith('B'):
                        labels.append([i, i, char, tag[2:]])  # Remove the B- or I- prefix
                    elif tag.startswith('I'):
                        labels[-1][1] = i
                        labels[-1][2] += char
                Data[idx] = {
                    'sentence': sentence,
                    'labels': labels
                }
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def get_dataLoader(args, dataset, tokenizer, batch_size=None, shuffle=False):

    def collate_fn(batch_samples):
        batch_sentence, batch_labels = [], []
        for sample in batch_samples:
            batch_sentence.append(sample['sentence'])
            batch_labels.append(sample['labels'])
        batch_inputs = tokenizer(
            batch_sentence,
            max_length=args.max_seq_length,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        batch_label = np.zeros(batch_inputs['input_ids'].shape, dtype=int)
        for s_idx, sentence in enumerate(batch_sentence):
            encoding = tokenizer(sentence, max_length=args.max_seq_length, truncation=True)
            for char_start, char_end, _, tag in batch_labels[s_idx]:
                token_start = encoding.char_to_token(char_start)
                token_end = encoding.char_to_token(char_end)
                if token_start is None or token_end is None:  # entity truncated away
                    continue
                batch_label[s_idx][token_start] = args.label2id[f"B-{tag}"]
                batch_label[s_idx][token_start+1:token_end+1] = args.label2id[f"I-{tag}"]
        return {
            'batch_inputs': batch_inputs,
            'labels': batch_label
        }

    return DataLoader(dataset, batch_size=(batch_size if batch_size else args.batch_size), shuffle=shuffle,
                      collate_fn=collate_fn)
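A sketch of wiring PeopleDaily into get_dataLoader. The collate function reads args.label2id, which is not built anywhere in the rendered files; the mapping below (with 'O' as id 0, matching the np.zeros default above) is therefore an assumption derived from CATEGORIES:

from argparse import Namespace
from transformers import AutoTokenizer

id2label = {0: 'O'}
for c in CATEGORIES:  # ['LOC', 'ORG', 'PER']
    id2label[len(id2label)] = f"B-{c}"
    id2label[len(id2label)] = f"I-{c}"
label2id = {v: k for k, v in id2label.items()}  # 7 labels in total

args = Namespace(max_seq_length=512, batch_size=4, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")  # assumed checkpoint
train_data = PeopleDaily("data/china-people-daily-ner-corpus/example.train")
train_loader = get_dataLoader(args, train_data, tokenizer, shuffle=True)
batch = next(iter(train_loader))
print(batch['batch_inputs']['input_ids'].shape, batch['labels'].shape)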
70 changes: 70 additions & 0 deletions src/sequence_labeling_ner_cpd/modeling.py
@@ -0,0 +1,70 @@
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import BertPreTrainedModel, BertModel
from ..tools import FullyConnectedLayer, CRF

class BertForNER(BertPreTrainedModel):
    def __init__(self, config, args):
        super().__init__(config)
        self.num_labels = args.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.use_ffnn_layer = args.use_ffnn_layer
        if self.use_ffnn_layer:
            self.ffnn_size = args.ffnn_size if args.ffnn_size != -1 else config.hidden_size
            self.mlp = FullyConnectedLayer(config, config.hidden_size, self.ffnn_size, config.hidden_dropout_prob)
        else:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.ffnn_size if args.use_ffnn_layer else config.hidden_size, self.num_labels)
        self.post_init()

    def forward(self, batch_inputs, labels=None):
        bert_output = self.bert(**batch_inputs)
        sequence_output = bert_output.last_hidden_state
        if self.use_ffnn_layer:
            sequence_output = self.mlp(sequence_output)
        else:
            sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            attention_mask = batch_inputs.get('attention_mask')
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return loss, logits

class BertCrfForNER(BertPreTrainedModel):
    def __init__(self, config, args):
        super().__init__(config)
        self.num_labels = args.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.use_ffnn_layer = args.use_ffnn_layer
        if self.use_ffnn_layer:
            self.ffnn_size = args.ffnn_size if args.ffnn_size != -1 else config.hidden_size
            self.mlp = FullyConnectedLayer(config, config.hidden_size, self.ffnn_size, config.hidden_dropout_prob)
        else:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.ffnn_size if args.use_ffnn_layer else config.hidden_size, self.num_labels)
        self.crf = CRF(num_tags=self.num_labels, batch_first=True)
        self.post_init()

    def forward(self, batch_inputs, labels=None):
        bert_output = self.bert(**batch_inputs)
        sequence_output = bert_output.last_hidden_state
        if self.use_ffnn_layer:
            sequence_output = self.mlp(sequence_output)
        else:
            sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = -1 * self.crf(emissions=logits, tags=labels, mask=batch_inputs.get('attention_mask'))
        return loss, logits
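A minimal sketch of building BertForNER and running one forward pass. num_labels = 7 (B-/I- for each of LOC, ORG, PER, plus O) follows from CATEGORIES in data.py, while the checkpoint name and the bare Namespace standing in for the parse_args() output are assumptions:

import torch
from argparse import Namespace
from transformers import AutoConfig, AutoTokenizer

args = Namespace(num_labels=7, use_ffnn_layer=False, ffnn_size=-1)
config = AutoConfig.from_pretrained("bert-base-chinese")  # assumed checkpoint
model = BertForNER.from_pretrained("bert-base-chinese", config=config, args=args)

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
batch_inputs = tokenizer(["我爱北京。"], return_tensors="pt")
labels = torch.zeros_like(batch_inputs["input_ids"])  # all-'O' dummy labels, just to get a loss
loss, logits = model(batch_inputs, labels=labels)
print(loss.item(), logits.shape)  # logits: (batch_size, seq_len, 7)

Note that the classifier head is freshly initialized when loading a plain BERT checkpoint, so the printed loss comes from untrained weights.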