# =============================================================================
# Before you start changing anything here, read the comments.
# All of them can be found below in the "DEFAULT" section.
[DEFAULT]
# The directory that contains extracted files of everything you've downloaded.
data_dir = "data"
# Train, dev and test jsonlines
train_data = "data/english_train_head.jsonlines"
dev_data = "data/english_development_head.jsonlines"
test_data = "data/english_test_head.jsonlines"
# The device where everything is to be placed. "cuda:N"/"cpu" are supported.
device = "cuda:0"
# Bert settings ======================
# Base bert model architecture and tokenizer
bert_model = "bert-large-cased"
# Controls the max length of sequences passed through bert to obtain its
# contextual embeddings.
# Must not exceed the model's maximum input length (512 for bert-large-cased;
# the [longformer] section below overrides it).
bert_window_size = 512
# General model settings =============
# Controls the dimensionality of feature embeddings
embedding_size = 20
# Controls the dimensionality of distance embeddings used by SpanPredictor
sp_embedding_size = 64
# Controls the number of spans for which anaphoricity can be scored in one
# batch. Only affects final scoring; mention extraction and rough scoring
# are less memory intensive, so they are always done in just one batch.
a_scoring_batch_size = 512
# AnaphoricityScorer FFNN parameters
hidden_size = 1024
n_hidden_layers = 1
# Mention extraction settings ========
# The mention extractor will consider spans of up to max_span_len words.
# The default value is chosen to be large enough to hold any span in the dev data.
max_span_len = 64
# Pruning settings ===================
# Controls how many pairs should be preserved per mention
# after applying rough scoring.
rough_k = 50
# Training settings ==================
# Controls whether to fine-tune bert_model
bert_finetune = true
# Controls the dropout rate throughout all models
dropout_rate = 0.3
# Bert learning rate (only used if bert_finetune is set)
bert_learning_rate = 1e-5
# Task learning rate
learning_rate = 3e-4
# The number of epochs to train for
train_epochs = 20
# Controls the weight of the binary cross-entropy loss added to the NLML
# (negative log marginal likelihood) loss
bce_loss_weight = 0.5
# The directory that will contain conll prediction files
conll_log_dir = "data/conll_logs"
# =============================================================================
# Extra keyword arguments to be passed to bert tokenizers of specified models
[DEFAULT.tokenizer_kwargs]
[DEFAULT.tokenizer_kwargs.roberta-large]
"add_prefix_space" = true
[DEFAULT.tokenizer_kwargs.spanbert-large-cased]
"do_lower_case" = false
[DEFAULT.tokenizer_kwargs.bert-large-cased]
"do_lower_case" = false
# =============================================================================
# The sections listed here do not need to set all config variables.
# If a variable is omitted, its DEFAULT value will be used instead.
[roberta]
bert_model = "roberta-large"
[roberta_no_finetune]
bert_model = "roberta-large"
bert_finetune = false
[roberta_no_bce]
bert_model = "roberta-large"
bce_loss_weight = 0.0
[spanbert]
bert_model = "SpanBERT/spanbert-large-cased"
[spanbert_no_bce]
bert_model = "SpanBERT/spanbert-large-cased"
bce_loss_weight = 0.0
[bert]
bert_model = "bert-large-cased"
[longformer]
bert_model = "allenai/longformer-large-4096"
bert_window_size = 2048
[debug]
bert_window_size = 384
bert_finetune = false
device = "cpu"
[debug_gpu]
bert_window_size = 384
bert_finetune = false
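# =============================================================================
# A rough sketch (kept inside comments so this file stays valid TOML) of how a
# section here could be resolved in Python with the `toml` package; this only
# illustrates the DEFAULT-fallback behaviour described above and is not the
# repository's actual loader:
#
#   import toml
#
#   cfg = toml.load("config.toml")
#   # Variables missing from the chosen section fall back to [DEFAULT].
#   section = {**cfg["DEFAULT"], **cfg.get("debug_gpu", {})}
#   print(section["bert_model"], section["bert_window_size"])  # bert-large-cased 384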