Merge branch 'master' of https://github.com/kermitt2/delft

kermitt2 · Sep 12, 2020 · 709eddf · 709eddf
2 parents 8adac42 + b9dff46
commit 709eddf
Show file tree

Hide file tree

Showing 12 changed files with 311 additions and 20 deletions.
diff --git a/.gitignore b/.gitignore
@@ -43,6 +43,7 @@ data/models/sequenceLabelling/*scibert*
 data/models/sequenceLabelling/*biobert*
 data/models/sequenceLabelling/*-bert-*-en/
 data/models/textClassification/citations*
+
 data/models/sequenceLabelling/*0
 data/models/sequenceLabelling/*1
 data/models/sequenceLabelling/*2
@@ -53,3 +54,14 @@ data/models/sequenceLabelling/*6
 data/models/sequenceLabelling/*7
 data/models/sequenceLabelling/*8
 data/models/sequenceLabelling/*9
+
+data/models/textClassification/*0
+data/models/textClassification/*1
+data/models/textClassification/*2
+data/models/textClassification/*3
+data/models/textClassification/*4
+data/models/textClassification/*5
+data/models/textClassification/*6
+data/models/textClassification/*7
+data/models/textClassification/*8
+data/models/textClassification/*9
diff --git a/Readme.md b/Readme.md
@@ -165,7 +165,7 @@ Note that all our annotation data for sequence labelling follows the [IOB2](http
 
 We have reimplemented in DeLFT the main neural architectures for NER of the last four years and performed a reproducibility analysis of these systems with comparable evaluation criteria. Unfortunately, in publications, systems are usually compared directly with reported results obtained in different settings, which can bias scores by more than 1.0 point and completely invalidate both comparison and interpretation of results.  
 
-You can read more about our reproducibility study of neural NER in this [blog article](http://science-miner.com/a-reproducibility-study-on-neural-ner/). This effort is very similar to the work of [(Yang and Zhang, 2018)](https://arxiv.org/pdf/1806.04470.pdf) (see also [NCRFpp](https://github.com/jiesutd/NCRFpp)) for a fair comparison of RNN for sequence labeling, but has also been extended to BERT. 
+You can read more about our reproducibility study of neural NER in this [blog article](http://science-miner.com/a-reproducibility-study-on-neural-ner/). This effort is similar to the work of [(Yang and Zhang, 2018)](https://arxiv.org/pdf/1806.04470.pdf) (see also [NCRFpp](https://github.com/jiesutd/NCRFpp)) but has also been extended to BERT for a fair comparison of RNN for sequence labeling, and can also be related to the motivations of [(Pressel et al., 2018)](http://aclweb.org/anthology/W18-2506) [MEAD](https://github.com/dpressel/mead-baseline). 
 
 All reported scores below are __f-score__ for the CoNLL-2003 NER dataset. We report first the f-score averaged over 10 training runs, and second the best f-score over these 10 training runs. All the DeLFT trained models are included in this repository. 
 

diff --git a/data/models/textClassification/software_use-with_ELMo/config.json b/data/models/textClassification/software_use-with_ELMo/config.json
@@ -0,0 +1,19 @@
+{
+    "model_name": "software_use-with_ELMo",
+    "model_type": "gru",
+    "embeddings_name": "glove-840B",
+    "use_ELMo": true,
+    "use_BERT": false,
+    "char_embedding_size": 25,
+    "word_embedding_size": 1324,
+    "dropout": 0.5,
+    "recurrent_dropout": 0.25,
+    "maxlen": 300,
+    "use_char_feature": false,
+    "list_classes": [
+        "not_used",
+        "used"
+    ],
+    "fold_number": 1,
+    "batch_size": 20
+}
diff --git a/data/models/textClassification/software_use-with_ELMo/gru.model_weights.hdf5 b/data/models/textClassification/software_use-with_ELMo/gru.model_weights.hdf5
diff --git a/data/models/textClassification/software_use/config.json b/data/models/textClassification/software_use/config.json
@@ -0,0 +1,19 @@
+{
+    "model_name": "software_use",
+    "model_type": "gru",
+    "embeddings_name": "glove-840B",
+    "use_ELMo": false,
+    "use_BERT": false,
+    "char_embedding_size": 25,
+    "word_embedding_size": 300,
+    "dropout": 0.5,
+    "recurrent_dropout": 0.25,
+    "maxlen": 300,
+    "use_char_feature": false,
+    "list_classes": [
+        "not_used",
+        "used"
+    ],
+    "fold_number": 1,
+    "batch_size": 256
+}
diff --git a/data/models/textClassification/software_use/gru.model_weights.hdf5 b/data/models/textClassification/software_use/gru.model_weights.hdf5
diff --git a/delft/textClassification/data_generator.py b/delft/textClassification/data_generator.py
@@ -41,7 +41,7 @@ def on_epoch_end(self):
 
         # shuffle dataset at each epoch
         if self.shuffle:
-            self.x, self.y = shuffle_triple_with_view(self.x, self.y)
+            self.x, self.y, _ = shuffle_triple_with_view(self.x, self.y)
 
     def __data_generation(self, index):
         'Generates data containing batch_size samples' 

diff --git a/delft/textClassification/models.py b/delft/textClassification/models.py
@@ -694,7 +694,19 @@ def getModel(model_config, training_config):
     return model
 
 
-def train_model(model, list_classes, batch_size, max_epoch, use_roc_auc, class_weights, training_generator, validation_generator, val_y, use_ELMo=False, use_BERT=False, multiprocessing=True, callbacks=None):
+def train_model(model, 
+                list_classes, 
+                batch_size, 
+                max_epoch, 
+                use_roc_auc, 
+                class_weights, 
+                training_generator, 
+                validation_generator, 
+                val_y, 
+                use_ELMo=False, 
+                use_BERT=False, 
+                multiprocessing=True, 
+                callbacks=None):
     best_loss = -1
     best_roc_auc = -1
     best_weights = None
@@ -709,6 +721,7 @@ def train_model(model, list_classes, batch_size, max_epoch, use_roc_auc, class_w
             # worker at 0 means the training will be executed in the main thread
             nb_workers = 0 
             multiprocessing = False
+
         model.fit_generator(
             generator=training_generator,
             use_multiprocessing=multiprocessing,
@@ -823,7 +836,10 @@ def train_folds(X, y, model_config, training_config, embeddings, callbacks=None)
 
         foldModel, best_score = train_model(getModel(model_config, training_config),
                 model_config.list_classes, training_config.batch_size, max_epoch, use_roc_auc, 
-                class_weights, training_generator, validation_generator, val_y, multiprocessing=training_config.multiprocessing, callbacks=callbacks)
+                class_weights, training_generator, validation_generator, val_y, 
+                model_config.use_ELMo, 
+                model_config.use_BERT,
+                multiprocessing=training_config.multiprocessing, callbacks=callbacks)
         models.append(foldModel)
 
         #model_path = os.path.join("../data/models/textClassification/",model_name, model_type+".model{0}_weights.hdf5".format(fold_id))

diff --git a/delft/textClassification/preprocess.py b/delft/textClassification/preprocess.py
@@ -178,8 +178,6 @@ def create_examples(self, x_s, y_s=None):
             if the_class not in self.list_classes:
                 #the_class = 'other'
                 continue
-            #if the_class not in self.list_classes:
-            #    continue
             label = tokenization.convert_to_unicode(the_class)
             examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
             valid_classes[accumul] = y

diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py
@@ -1,7 +1,10 @@
 import numpy as np
 import xml
+import gzip
+import json
 from xml.sax import make_parser, handler
 import pandas as pd
+from delft.utilities.numpy import shuffle_triple_with_view
 
 
 def load_texts_and_classes(filepath):
@@ -144,7 +147,6 @@ def load_citation_sentiment_corpus(filepath):
     return np.asarray(texts), np.asarray(polarities)
 
 
-
 def load_dataseer_corpus_csv(filepath):
     """
     Load texts from the Dataseer dataset type corpus in csv format:
@@ -220,6 +222,49 @@ def map_boolean(x):
         return np.asarray(texts_list), datatypes_final, datasubtypes_final, leafdatatypes_final, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), list_classes_leafdatatypes.tolist()
 
 
+def load_software_use_corpus_json(json_gz_file_path):
+    """
+    Load texts and classes from the corresponding Softcite corpus export in gzipped json format
+
+    Classification of the software usage is binary
+
+    Returns:
+        tuple(numpy array, numpy array): 
+            texts, binary class (used/not_used)
+
+    """
+
+    texts_list = []
+    classes_list = []
+
+    with gzip.GzipFile(json_gz_file_path, 'r') as fin:
+        data = json.loads(fin.read().decode('utf-8'))
+        if not "documents" in data:
+            print("There is no usable classified text in the corpus file", json_gz_file_path)
+            return None, None 
+        for document in data["documents"]:
+            for segment in document["texts"]:
+                if "entity_spans" in segment:
+                    if not "text" in segment:
+                        continue
+                    text = segment["text"]
+                    for entity_span in segment["entity_spans"]:
+                        if entity_span["type"] == "software":
+                            texts_list.append(text)
+                            if "used" in entity_span and entity_span["used"]:
+                                classes_list.append("used")
+                            else:
+                                classes_list.append("not_used")
+    list_possible_classes = np.unique(classes_list)
+    classes_list_final = normalize_classes(classes_list, list_possible_classes)
+
+    texts_list_final = np.asarray(texts_list)
+
+    texts_list_final, classes_list_final, _ = shuffle_triple_with_view(texts_list_final, classes_list_final)
+
+    return texts_list_final, classes_list_final
+
+
 def normalize_classes(y, list_classes):
     '''
         Replace string values of classes by their index in the list of classes

diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py
@@ -6,6 +6,9 @@
 from tensorflow import set_random_seed
 set_random_seed(7)
 
+# ask tensorflow to be quiet and not print hundreds of lines of logs
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
+
 import datetime
 
 from delft.textClassification.config import ModelConfig, TrainingConfig
@@ -25,7 +28,6 @@
 
 from keras.utils import plot_model
 
-
 class Classifier(object):
 
     config_file = 'config.json'
@@ -148,22 +150,25 @@ def predict(self, texts, output_format='json', use_main_thread_only=False):
                     result = predict(self.model, predict_generator, use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
             else:
                 raise (OSError('Could not find a model.'))
-        else:
-            if self.models is not None:
-                # bert model?
-                if self.model_config.model_type.find("bert") != -1:
-                    # we don't support n classifiers for BERT (would be too large)
-                    # be sure the input processor is instanciated
-                    self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
-                    result = self.models[0].predict(texts)
-                else:    
+        else:            
+            # bert model?
+            if self.model_config.model_type.find("bert") != -1:
+                # we currently don't support n classifiers for BERT at prediction time
+                # (it would be too large and too slow if the models were loaded 10 times from file for each batch)
+                # (eval does support it: models are loaded once for the complete dataset rather than once per batch, and we should do the same here)
+                # make sure the input processor is instantiated
+                self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
+                #result = self.models[0].predict(texts)
+                result = self.model.predict(texts)
+            else:
+                if self.models is not None: 
                     predict_generator = DataGenerator(texts, None, batch_size=self.model_config.batch_size, 
                         maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, 
                         embeddings=self.embeddings, shuffle=False)
 
                     result = predict_folds(self.models, predict_generator, use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
-            else:
-                raise (OSError('Could not find nfolds models.'))
+                else:
+                    raise (OSError('Could not find nfolds models.'))
         if output_format is 'json':
             res = {
                 "software": "DeLFT",
@@ -188,7 +193,7 @@ def predict(self, texts, output_format='json', use_main_thread_only=False):
             return result
 
     def eval(self, x_test, y_test, use_main_thread_only=False):
-        if self.model_config.fold_number is 1:
+        if self.model_config.fold_number == 1:
             if self.model is not None:
                 # bert model?
                 if self.model_config.model_type.find("bert") != -1:
@@ -378,6 +383,7 @@ def load(self, dir_path='data/models/textClassification/'):
         if self.model_config.model_type.find("bert") != -1:
              self.model = getModel(self.model_config, self.training_config)
              self.model.load()
+             return
 
         # load embeddings
         # Do not use cache in 'production' mode