Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/kermitt2/delft
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Sep 12, 2020
2 parents 8adac42 + b9dff46 commit 709eddf
Show file tree
Hide file tree
Showing 12 changed files with 311 additions and 20 deletions.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ data/models/sequenceLabelling/*scibert*
data/models/sequenceLabelling/*biobert*
data/models/sequenceLabelling/*-bert-*-en/
data/models/textClassification/citations*

data/models/sequenceLabelling/*0
data/models/sequenceLabelling/*1
data/models/sequenceLabelling/*2
Expand All @@ -53,3 +54,14 @@ data/models/sequenceLabelling/*6
data/models/sequenceLabelling/*7
data/models/sequenceLabelling/*8
data/models/sequenceLabelling/*9

data/models/textClassification/*0
data/models/textClassification/*1
data/models/textClassification/*2
data/models/textClassification/*3
data/models/textClassification/*4
data/models/textClassification/*5
data/models/textClassification/*6
data/models/textClassification/*7
data/models/textClassification/*8
data/models/textClassification/*9
2 changes: 1 addition & 1 deletion Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ Note that all our annotation data for sequence labelling follows the [IOB2](http

We have reimplemented in DeLFT the main neural architectures for NER of the last four years and performed a reproducibility analysis of these systems with comparable evaluation criteria. Unfortunately, in publications, systems are usually compared directly with reported results obtained in different settings, which can bias scores by more than 1.0 point and completely invalidate both comparison and interpretation of results.

You can read more about our reproducibility study of neural NER in this [blog article](http://science-miner.com/a-reproducibility-study-on-neural-ner/). This effort is very similar to the work of [(Yang and Zhang, 2018)](https://arxiv.org/pdf/1806.04470.pdf) (see also [NCRFpp](https://github.com/jiesutd/NCRFpp)) for a fair comparison of RNN for sequence labeling, but has also been extended to BERT.
You can read more about our reproducibility study of neural NER in this [blog article](http://science-miner.com/a-reproducibility-study-on-neural-ner/). This effort is similar to the work of [(Yang and Zhang, 2018)](https://arxiv.org/pdf/1806.04470.pdf) (see also [NCRFpp](https://github.com/jiesutd/NCRFpp)) but has also been extended to BERT for a fair comparison of RNN for sequence labeling, and can also be related to the motivations of [(Pressel et al., 2018)](http://aclweb.org/anthology/W18-2506) [MEAD](https://github.com/dpressel/mead-baseline).

All reported scores below are __f-score__ for the CoNLL-2003 NER dataset. We report first the f-score averaged over 10 training runs, and second the best f-score over these 10 training runs. All the DeLFT trained models are included in this repository.

Expand Down
19 changes: 19 additions & 0 deletions data/models/textClassification/software_use-with_ELMo/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"model_name": "software_use-with_ELMo",
"model_type": "gru",
"embeddings_name": "glove-840B",
"use_ELMo": true,
"use_BERT": false,
"char_embedding_size": 25,
"word_embedding_size": 1324,
"dropout": 0.5,
"recurrent_dropout": 0.25,
"maxlen": 300,
"use_char_feature": false,
"list_classes": [
"not_used",
"used"
],
"fold_number": 1,
"batch_size": 20
}
Binary file not shown.
19 changes: 19 additions & 0 deletions data/models/textClassification/software_use/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"model_name": "software_use",
"model_type": "gru",
"embeddings_name": "glove-840B",
"use_ELMo": false,
"use_BERT": false,
"char_embedding_size": 25,
"word_embedding_size": 300,
"dropout": 0.5,
"recurrent_dropout": 0.25,
"maxlen": 300,
"use_char_feature": false,
"list_classes": [
"not_used",
"used"
],
"fold_number": 1,
"batch_size": 256
}
Binary file not shown.
2 changes: 1 addition & 1 deletion delft/textClassification/data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def on_epoch_end(self):

# shuffle dataset at each epoch
if self.shuffle:
self.x, self.y = shuffle_triple_with_view(self.x, self.y)
self.x, self.y, _ = shuffle_triple_with_view(self.x, self.y)

def __data_generation(self, index):
'Generates data containing batch_size samples'
Expand Down
20 changes: 18 additions & 2 deletions delft/textClassification/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,19 @@ def getModel(model_config, training_config):
return model


def train_model(model, list_classes, batch_size, max_epoch, use_roc_auc, class_weights, training_generator, validation_generator, val_y, use_ELMo=False, use_BERT=False, multiprocessing=True, callbacks=None):
def train_model(model,
list_classes,
batch_size,
max_epoch,
use_roc_auc,
class_weights,
training_generator,
validation_generator,
val_y,
use_ELMo=False,
use_BERT=False,
multiprocessing=True,
callbacks=None):
best_loss = -1
best_roc_auc = -1
best_weights = None
Expand All @@ -709,6 +721,7 @@ def train_model(model, list_classes, batch_size, max_epoch, use_roc_auc, class_w
# worker at 0 means the training will be executed in the main thread
nb_workers = 0
multiprocessing = False

model.fit_generator(
generator=training_generator,
use_multiprocessing=multiprocessing,
Expand Down Expand Up @@ -823,7 +836,10 @@ def train_folds(X, y, model_config, training_config, embeddings, callbacks=None)

foldModel, best_score = train_model(getModel(model_config, training_config),
model_config.list_classes, training_config.batch_size, max_epoch, use_roc_auc,
class_weights, training_generator, validation_generator, val_y, multiprocessing=training_config.multiprocessing, callbacks=callbacks)
class_weights, training_generator, validation_generator, val_y,
model_config.use_ELMo,
model_config.use_BERT,
multiprocessing=training_config.multiprocessing, callbacks=callbacks)
models.append(foldModel)

#model_path = os.path.join("../data/models/textClassification/",model_name, model_type+".model{0}_weights.hdf5".format(fold_id))
Expand Down
2 changes: 0 additions & 2 deletions delft/textClassification/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,6 @@ def create_examples(self, x_s, y_s=None):
if the_class not in self.list_classes:
#the_class = 'other'
continue
#if the_class not in self.list_classes:
# continue
label = tokenization.convert_to_unicode(the_class)
examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
valid_classes[accumul] = y
Expand Down
47 changes: 46 additions & 1 deletion delft/textClassification/reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import numpy as np
import xml
import gzip
import json
from xml.sax import make_parser, handler
import pandas as pd
from delft.utilities.numpy import shuffle_triple_with_view


def load_texts_and_classes(filepath):
Expand Down Expand Up @@ -144,7 +147,6 @@ def load_citation_sentiment_corpus(filepath):
return np.asarray(texts), np.asarray(polarities)



def load_dataseer_corpus_csv(filepath):
"""
Load texts from the Dataseer dataset type corpus in csv format:
Expand Down Expand Up @@ -220,6 +222,49 @@ def map_boolean(x):
return np.asarray(texts_list), datatypes_final, datasubtypes_final, leafdatatypes_final, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), list_classes_leafdatatypes.tolist()


def load_software_use_corpus_json(json_gz_file_path):
    """
    Load texts and classes from the corresponding Softcite corpus export in gzipped json format

    Classification of the software usage is binary. One example is produced for
    every entity span whose "type" is "software"; the example text is the "text"
    of the segment holding the span, so a segment with several software spans
    contributes the same text several times.

    Args:
        json_gz_file_path: path to the gzipped, UTF-8 encoded JSON corpus file

    Returns:
        tuple(numpy array, numpy array):
            texts, binary class (used/not_used)
        (None, None) is returned when the file has no "documents" entry.
    """

    # decode and parse in one go, and release the file handle before processing
    with gzip.open(json_gz_file_path, 'rt', encoding='utf-8') as fin:
        data = json.load(fin)

    if "documents" not in data:
        print("There is no usable classified text in the corpus file", json_gz_file_path)
        return None, None

    texts_list = []
    classes_list = []

    for document in data["documents"]:
        # a document without text segments is skipped rather than aborting the load
        for segment in document.get("texts") or []:
            if "text" not in segment:
                continue
            text = segment["text"]
            for entity_span in segment.get("entity_spans") or []:
                # spans of another type (or without a type) are not examples
                if entity_span.get("type") != "software":
                    continue
                texts_list.append(text)
                # a missing or falsy "used" attribute means the software is not used
                classes_list.append("used" if entity_span.get("used") else "not_used")

    # np.unique returns the sorted distinct labels actually present in the corpus,
    # so the class list has a single entry if only one label occurs
    list_possible_classes = np.unique(classes_list)
    classes_list_final = normalize_classes(classes_list, list_possible_classes)

    texts_list_final = np.asarray(texts_list)

    # texts and classes are passed together so that they stay aligned
    texts_list_final, classes_list_final, _ = shuffle_triple_with_view(texts_list_final, classes_list_final)

    return texts_list_final, classes_list_final


def normalize_classes(y, list_classes):
'''
Replace string values of classes by their index in the list of classes
Expand Down
32 changes: 19 additions & 13 deletions delft/textClassification/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from tensorflow import set_random_seed
set_random_seed(7)

# ask tensorflow to be quiet and not print hundred lines of logs
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import datetime

from delft.textClassification.config import ModelConfig, TrainingConfig
Expand All @@ -25,7 +28,6 @@

from keras.utils import plot_model


class Classifier(object):

config_file = 'config.json'
Expand Down Expand Up @@ -148,22 +150,25 @@ def predict(self, texts, output_format='json', use_main_thread_only=False):
result = predict(self.model, predict_generator, use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
else:
raise (OSError('Could not find a model.'))
else:
if self.models is not None:
# bert model?
if self.model_config.model_type.find("bert") != -1:
# we don't support n classifiers for BERT (would be too large)
# be sure the input processor is instanciated
self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
result = self.models[0].predict(texts)
else:
else:
# bert model?
if self.model_config.model_type.find("bert") != -1:
# we don't support n classifiers for BERT for prediction currently
# (it would be too large and too slow if loaded 10 times from file for each batch)
# (however it is done for eval, models are loaded 1 time for the complete dataset, not each time per batch, and we should do the same here)
# be sure the input processor is instanciated
self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
#result = self.models[0].predict(texts)
result = self.model.predict(texts)
else:
if self.models is not None:
predict_generator = DataGenerator(texts, None, batch_size=self.model_config.batch_size,
maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
embeddings=self.embeddings, shuffle=False)

result = predict_folds(self.models, predict_generator, use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
else:
raise (OSError('Could not find nfolds models.'))
else:
raise (OSError('Could not find nfolds models.'))
if output_format is 'json':
res = {
"software": "DeLFT",
Expand All @@ -188,7 +193,7 @@ def predict(self, texts, output_format='json', use_main_thread_only=False):
return result

def eval(self, x_test, y_test, use_main_thread_only=False):
if self.model_config.fold_number is 1:
if self.model_config.fold_number == 1:
if self.model is not None:
# bert model?
if self.model_config.model_type.find("bert") != -1:
Expand Down Expand Up @@ -378,6 +383,7 @@ def load(self, dir_path='data/models/textClassification/'):
if self.model_config.model_type.find("bert") != -1:
self.model = getModel(self.model_config, self.training_config)
self.model.load()
return

# load embeddings
# Do not use cache in 'production' mode
Expand Down
Loading

0 comments on commit 709eddf

Please sign in to comment.