improve logging
aflueckiger committed Jun 1, 2020
1 parent 1728210 commit 8fad114
Showing 2 changed files with 32 additions and 13 deletions.
ner_evaluation/ner_eval.py: 29 additions & 10 deletions
@@ -18,7 +18,6 @@
     get_all_tags,
     column_selector,
     check_tag_selection,
-    check_spurious_tags,
 )


@@ -44,6 +43,9 @@ def __init__(self, f_true, f_pred, glueing_cols=None):
         :return: Evaluator object.
         """
 
+        logging.info(f"Reading system response file '{f_pred}' and gold standard '{f_true}'.")
+
         self.f_true = f_true
         self.f_pred = f_pred

@@ -105,7 +107,8 @@ def __init__(self, f_true, f_pred, glueing_cols=None):
         }
 
     def check_segment_mismatch(self):
-        """Assert the alignment between gold standard and the system response.
+        """
+        Assert the alignment between gold standard and the system response.
         """
 
         logging.info("Datasets imported (Gold/Predictions).")
@@ -180,11 +183,10 @@ def evaluate(
         if isinstance(columns, str):
             columns = [columns]
 
-        tags = self.set_evaluation_tags(columns, tags, eval_type)
+        logging.info(f"Evaluating column {columns} in system response file '{self.f_pred}'")
 
-        logging.info(
-            f"Evaluating system response '{self.f_pred}' on {columns} for the following tags: {tags}"
-        )
+        tags = self.set_evaluation_tags(columns, tags, eval_type)
+        logging.info(f"Evaluation on the following tags: {tags}")
 
         # Create an accumulator to store overall results
         results = deepcopy(self.metric_schema)
@@ -338,7 +340,6 @@ def compute_metrics(self, true_named_entities: list, pred_named_entities: list,
 
         # only allow alternatives in prediction file, not in gold standard
         true_named_entities = [ent[0] for ent in true_named_entities if ent[0].e_type in tags]
-        # pred_named_entities = [ent for ent in pred_named_entities if [ent[0]].e_type in tags]
         pred_named_entities = [
             ent for ent in pred_named_entities if any([e.e_type in tags for e in ent])
         ]
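The comprehension above keeps a prediction as soon as any of its alternative readings carries a selected tag, while gold entities are reduced to their single reading. A minimal sketch of that filtering with invented data; Entity is a hypothetical stand-in for the scorer's entity representation:

from collections import namedtuple

# hypothetical stand-in for the scorer's entity representation
Entity = namedtuple("Entity", ["e_type", "start_offset", "end_offset"])

tags = {"PER", "LOC"}

# gold entities carry exactly one reading; predictions may carry alternatives
true_named_entities = [(Entity("PER", 0, 1),), (Entity("ORG", 5, 6),)]
pred_named_entities = [
    (Entity("ORG", 5, 6), Entity("LOC", 5, 6)),  # kept: one alternative is in tags
    (Entity("ORG", 8, 9),),  # dropped: no alternative matches
]

true_filtered = [ent[0] for ent in true_named_entities if ent[0].e_type in tags]
pred_filtered = [ent for ent in pred_named_entities if any(e.e_type in tags for e in ent)]

print(len(true_filtered), len(pred_filtered))  # 1 1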
@@ -542,7 +543,7 @@ def set_evaluation_tags(self, columns, tags, eval_type):
             for col in columns:
                 y_pred += [column_selector(doc, col) for doc in self.pred]
         except AttributeError:
-            msg = f"The provided annotation columns {columns} are not available in both the gold standard and the system response '{self.f_pred}'."
+            msg = f"Missing columns {columns} in the system response file '{self.f_pred}' or the gold standard."
             logging.error(msg)
             raise AssertionError(msg)

@@ -555,10 +556,10 @@ def set_evaluation_tags(self, columns, tags, eval_type):
         elif eval_type == "nerc":
             # For NERC, only tags which are covered by the gold standard are considered
             tags = true_tags
-            check_spurious_tags(y_true, y_pred)
+            self.check_spurious_tags(y_true, y_pred, columns)
 
             if not pred_tags:
-                msg = f"There are no tags in the system response file '{self.f_pred}' for the column: {columns}"
+                msg = f"No tags in the column '{columns}' of the system response file: '{self.f_pred}'"
                 logging.error(msg)
 
         elif eval_type == "nel":
@@ -567,6 +568,24 @@ def set_evaluation_tags(self, columns, tags, eval_type):
 
         return tags
 
+    def check_spurious_tags(self, y_true: list, y_pred: list, columns: list):
+        """Log any tags of the system response which are not in the gold standard.
+
+        :param list y_true: a nested list of gold labels with the structure "[docs [sents [tokens]]]".
+        :param list y_pred: a nested list of system labels with the structure "[docs [sents [tokens]]]".
+        :return: None.
+        :rtype: None
+        """
+
+        tags_true = get_all_tags(y_true)
+        tags_pred = get_all_tags(y_pred)
+
+        for pred in tags_pred:
+            if pred not in tags_true:
+                msg = f"Spurious entity label '{pred}' in column {columns} of system response file: '{self.f_pred}'. As the tag is not part of the gold standard, it is ignored in the evaluation."
+                logging.error(msg)
+
+
 def find_overlap(true_range, pred_range):
     """Find the overlap between two ranges
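The body of find_overlap is cut off by the diff context. In the ner_eval lineage this scorer builds on, span overlap is computed as a plain set intersection over the token offsets of the two spans; the sketch below assumes the same behaviour and uses invented example spans.

def find_overlap(true_range, pred_range):
    """Find the overlap between two ranges; returns the shared token
    offsets, or an empty set if the spans are disjoint."""
    return set(true_range) & set(pred_range)

print(find_overlap(range(3, 6), range(5, 8)))  # {5}
print(find_overlap(range(0, 2), range(4, 6)))  # set()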
ner_evaluation/utils.py: 3 additions & 3 deletions
@@ -23,7 +23,7 @@ def __init__(self, properties: dict):
             try:
                 v = v.upper()
             except AttributeError:
-                msg = f"There are empty values in column '{k}'. They get replaced by an underscore."
+                msg = f"Empty values in column '{k}'. They get replaced by an underscore."
                 logging.warning(msg)
                 v = "_"

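For illustration, a minimal sketch of the normalization this warning belongs to: values are upper-cased and empty cells fall back to an underscore. The sample dict and column names are invented.

import logging

properties = {"NE-COARSE-LIT": "b-pers", "NE-NESTED": None}  # None models an empty cell

normalized = {}
for k, v in properties.items():
    try:
        v = v.upper()
    except AttributeError:
        # empty values get replaced by an underscore, as warned above
        logging.warning(f"Empty values in column '{k}'. They get replaced by an underscore.")
        v = "_"
    normalized[k] = v

print(normalized)  # {'NE-COARSE-LIT': 'B-PERS', 'NE-NESTED': '_'}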
@@ -88,7 +88,7 @@ def check_tag_selection(y_cand: list, tags_ref: list):
     return clean_tags
 
 
-def check_spurious_tags(y_true: list, y_pred: list):
+def check_spurious_tags(y_true: list, y_pred: list, columns: list):
     """Log any tags of the system response which are not in the gold standard.
 
     :param list y_true: a nested list of gold labels with the structure "[docs [sents [tokens]]]".
@@ -103,7 +103,7 @@ def check_spurious_tags(y_true: list, y_pred: list):
 
     for pred in tags_pred:
         if pred not in tags_true:
-            msg = f"Spurious entity label '{pred}' in predictions. Tag is not part of the gold standard and ignored in the evaluation."
+            msg = f"Spurious entity label '{pred}' in column {columns} of the system response, which is not part of the gold standard. The tag is ignored in the evaluation."
             logging.error(msg)
 
