From 0037308174a5a7e8eeb117e7beb4f7eaa875e3c9 Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sat, 22 Jan 2022 13:29:17 +0530 Subject: [PATCH 01/17] add KPT code --- dlkp/models/ke/crf/crf.py | 283 ++++++++++++++ dlkp/models/ke/crf/crf_trainer.py | 189 ++++++++++ dlkp/models/ke/crf/crf_utils.py | 346 ++++++++++++++++++ dlkp/models/ke/extraction_utils.py | 0 dlkp/models/ke/kpe.py | 0 dlkp/models/ke/transformer/crf_models.py | 0 .../token_classification_models.py | 0 7 files changed, 818 insertions(+) create mode 100644 dlkp/models/ke/crf/crf.py create mode 100644 dlkp/models/ke/crf/crf_trainer.py create mode 100644 dlkp/models/ke/crf/crf_utils.py create mode 100644 dlkp/models/ke/extraction_utils.py create mode 100644 dlkp/models/ke/kpe.py create mode 100644 dlkp/models/ke/transformer/crf_models.py create mode 100644 dlkp/models/ke/transformer/token_classification_models.py diff --git a/dlkp/models/ke/crf/crf.py b/dlkp/models/ke/crf/crf.py new file mode 100644 index 0000000..8d5bd30 --- /dev/null +++ b/dlkp/models/ke/crf/crf.py @@ -0,0 +1,283 @@ +# add models having crf classification layer with option of bilstm layers + +from crf_utils import * +from typing import List, Tuple, Dict, Union + +import torch + +VITERBI_DECODING = Tuple[List[int], float] + +class ConditionalRandomField(torch.nn.Module): + """ + This module uses the "forward-backward" algorithm to compute + the log-likelihood of its inputs assuming a conditional random field model. + See, e.g. http://www.cs.columbia.edu/~mcollins/fb.pdf + # Parameters + num_tags : `int`, required + The number of tags. + constraints : `List[Tuple[int, int]]`, optional (default = `None`) + An optional list of allowed transitions (from_tag_id, to_tag_id). + These are applied to `viterbi_tags()` but do not affect `forward()`. + These should be derived from `allowed_transitions` so that the + start and end transitions are handled correctly for your tag type. + include_start_end_transitions : `bool`, optional (default = `True`) + Whether to include the start and end transition parameters. + """ + + + def __init__( + self, + num_tags: int, + label_encoding, + idx2tag, + include_start_end_transitions: bool = True, + ) -> None: + super().__init__() + self.num_tags = num_tags + constraints = allowed_transitions(label_encoding, idx2tag) + # transitions[i, j] is the logit for transitioning from state i to state j. + self.transitions = torch.nn.Parameter(torch.Tensor(num_tags, num_tags)) + + # _constraint_mask indicates valid transitions (based on supplied constraints). + # Include special start of sequence (num_tags + 1) and end of sequence tags (num_tags + 2) + if constraints is None: + # All transitions are valid. + constraint_mask = torch.Tensor(num_tags + 2, num_tags + 2).fill_(1.0) + else: + constraint_mask = torch.Tensor(num_tags + 2, num_tags + 2).fill_(0.0) + for i, j in constraints: + constraint_mask[i, j] = 1.0 + + self._constraint_mask = torch.nn.Parameter(constraint_mask, requires_grad=False) + + # Also need logits for transitioning from "start" state and to "end" state. 
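+        # Note (clarifying comment): throughout this class the augmented tag space
+        # indexes the synthetic START tag as num_tags and the synthetic END tag as
+        # num_tags + 1, which is why _constraint_mask above is sized
+        # (num_tags + 2, num_tags + 2).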
+        self.include_start_end_transitions = include_start_end_transitions
+        if include_start_end_transitions:
+            self.start_transitions = torch.nn.Parameter(torch.Tensor(num_tags))
+            self.end_transitions = torch.nn.Parameter(torch.Tensor(num_tags))
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.xavier_normal_(self.transitions)
+        if self.include_start_end_transitions:
+            torch.nn.init.normal_(self.start_transitions)
+            torch.nn.init.normal_(self.end_transitions)
+
+    def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
+        """
+        Computes the (batch_size,) denominator term for the log-likelihood, which is the
+        sum of the likelihoods across all possible state sequences.
+        """
+        batch_size, sequence_length, num_tags = logits.size()
+
+        # Transpose batch size and sequence dimensions
+        mask = mask.transpose(0, 1).contiguous()
+        logits = logits.transpose(0, 1).contiguous()
+
+        # Initial alpha is the (batch_size, num_tags) tensor of likelihoods combining the
+        # transitions to the initial states and the logits for the first timestep.
+        if self.include_start_end_transitions:
+            alpha = self.start_transitions.view(1, num_tags) + logits[0]
+        else:
+            alpha = logits[0]
+
+        # For each i we compute logits for the transitions from timestep i-1 to timestep i.
+        # We do so in a (batch_size, num_tags, num_tags) tensor where the axes are
+        # (instance, current_tag, next_tag)
+        for i in range(1, sequence_length):
+            # The emit scores are for time i ("next_tag") so we broadcast along the current_tag axis.
+            emit_scores = logits[i].view(batch_size, 1, num_tags)
+            # Transition scores are (current_tag, next_tag) so we broadcast along the instance axis.
+            transition_scores = self.transitions.view(1, num_tags, num_tags)
+            # Alpha is for the current_tag, so we broadcast along the next_tag axis.
+            broadcast_alpha = alpha.view(batch_size, num_tags, 1)
+
+            # Add all the scores together and logsumexp over the current_tag axis.
+            inner = broadcast_alpha + emit_scores + transition_scores
+
+            # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension
+            # of `inner`. Otherwise (mask == False) we want to retain the previous alpha.
+            alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (
+                ~mask[i]
+            ).view(batch_size, 1)
+
+        # Every sequence needs to end with a transition to the stop_tag.
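+        # (At this point alpha[b, t] is the log-sum-exp of the scores of all paths
+        # over the unmasked positions of sequence b that end in tag t; adding the
+        # end transitions and reducing over t yields the denominator term.)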
+ if self.include_start_end_transitions: + stops = alpha + self.end_transitions.view(1, num_tags) + else: + stops = alpha + + # Finally we log_sum_exp along the num_tags dim, result is (batch_size,) + return logsumexp(stops) + + def _joint_likelihood( + self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: + """ + Computes the numerator term for the log-likelihood, which is just score(inputs, tags) + """ + batch_size, sequence_length, _ = logits.data.shape + + # Transpose batch size and sequence dimensions: + logits = logits.transpose(0, 1).contiguous() + mask = mask.transpose(0, 1).contiguous() + tags = tags.transpose(0, 1).contiguous() + + # Start with the transition scores from start_tag to the first tag in each input + if self.include_start_end_transitions: + score = self.start_transitions.index_select(0, tags[0]) + else: + score = 0.0 + + # Add up the scores for the observed transitions and all the inputs but the last + # print(mask.shape, tags.shape, logits.shape, sequence_length) + for i in range(sequence_length - 1): + # Each is shape (batch_size,) + current_tag, next_tag = tags[i], tags[i + 1] + # print(current_tag, next_tag) + # print("tags printiiinggggg") + # print(current_tag, next_tag) + # The scores for transitioning from current_tag to next_tag + transition_score = self.transitions[current_tag.view(-1), next_tag.view(-1)] + + # The score for using current_tag + emit_score = logits[i].gather(1, current_tag.view(batch_size, 1)).squeeze(1) + # emit_score= 0 + # Include transition score if next element is unmasked, + # input_score if this element is unmasked. + score = score + transition_score * mask[i + 1] + emit_score * mask[i] + + # Transition from last state to "stop" state. To start with, we need to find the last tag + # for each instance. + last_tag_index = mask.sum(0).long() - 1 + last_tags = tags.gather(0, last_tag_index.view(1, batch_size)).squeeze(0) + + # Compute score of transitioning to `stop_tag` from each "last tag". + if self.include_start_end_transitions: + last_transition_score = self.end_transitions.index_select(0, last_tags) + else: + last_transition_score = 0.0 + + # Add the last input if it's not masked. + last_inputs = logits[-1] # (batch_size, num_tags) + last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1) + last_input_score = last_input_score.squeeze() # (batch_size,) + + score = score + last_transition_score + last_input_score * mask[-1] + + return score + + def forward( + self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None + ) -> torch.Tensor: + """ + Computes the log likelihood. + """ + # mask[tags==-100]=0 + if mask is None: + mask = torch.ones(*tags.size(), dtype=torch.bool) + else: + # The code below fails in weird ways if this isn't a bool tensor, so we make sure. + mask = mask.to(torch.bool) + # print("forward",inputs.shape, tags.shape, mask.shape) + + log_denominator = self._input_likelihood(inputs, mask) + # temp_tags= tags + # tags[tags==-100]=2 + # print(tags[0]) + log_numerator = self._joint_likelihood(inputs, tags, mask) + # tags[mask==0]=-100 + return torch.sum(log_numerator - log_denominator) + + def viterbi_tags( + self, logits: torch.Tensor, mask: torch.BoolTensor = None, top_k: int = None + ) -> Union[List[VITERBI_DECODING], List[List[VITERBI_DECODING]]]: + """ + Uses viterbi algorithm to find most likely tags for the given inputs. + If constraints are applied, disallows all other transitions. 
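+        (Illustrative call, names assumed: for `logits` of shape
+        (batch_size, sequence_length, num_tags) and a boolean `mask`,
+        `self.viterbi_tags(logits, mask)` gives one (tag_ids, score)
+        tuple per batch element when `top_k` is None.)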
+ Returns a list of results, of the same size as the batch (one result per batch member) + Each result is a List of length top_k, containing the top K viterbi decodings + Each decoding is a tuple (tag_sequence, viterbi_score) + For backwards compatibility, if top_k is None, then instead returns a flat list of + tag sequences (the top tag sequence for each batch item). + """ + if mask is None: + mask = torch.ones(*logits.shape[:2], dtype=torch.bool, device=logits.device) + + if top_k is None: + top_k = 1 + flatten_output = True + else: + flatten_output = False + + _, max_seq_length, num_tags = logits.size() + + # Get the tensors out of the variables + logits, mask = logits.data, mask.data + + # Augment transitions matrix with start and end transitions + start_tag = num_tags + end_tag = num_tags + 1 + transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.0) + + # Apply transition constraints + constrained_transitions = self.transitions * self._constraint_mask[ + :num_tags, :num_tags + ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags]) + transitions[:num_tags, :num_tags] = constrained_transitions.data + + if self.include_start_end_transitions: + transitions[ + start_tag, :num_tags + ] = self.start_transitions.detach() * self._constraint_mask[ + start_tag, :num_tags + ].data + -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[ + :num_tags, end_tag + ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) + else: + transitions[start_tag, :num_tags] = -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[:num_tags, end_tag] = -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) + + best_paths = [] + # Pad the max sequence length by 2 to account for start_tag + end_tag. + tag_sequence = torch.Tensor(max_seq_length + 2, num_tags + 2) + + for prediction, prediction_mask in zip(logits, mask): + mask_indices = prediction_mask.nonzero(as_tuple=False).squeeze() + masked_prediction = torch.index_select(prediction, 0, mask_indices) + sequence_length = masked_prediction.shape[0] + + # Start with everything totally unlikely + tag_sequence.fill_(-10000.0) + # At timestep 0 we must have the START_TAG + tag_sequence[0, start_tag] = 0.0 + # At steps 1, ..., sequence_length we just use the incoming prediction + tag_sequence[1 : (sequence_length + 1), :num_tags] = masked_prediction + # And at the last timestep we must have the END_TAG + tag_sequence[sequence_length + 1, end_tag] = 0.0 + + # We pass the tags and the transitions to `viterbi_decode`. + viterbi_paths, viterbi_scores = viterbi_decode( + tag_sequence=tag_sequence[: (sequence_length + 2)], + transition_matrix=transitions, + top_k=top_k, + ) + top_k_paths = [] + for viterbi_path, viterbi_score in zip(viterbi_paths, viterbi_scores): + # Get rid of START and END sentinels and append. 
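+                # (positions 0 and -1 of each decoded path hold the synthetic
+                # start_tag/end_tag added above, so they are sliced off next)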
+ viterbi_path = viterbi_path[1:-1] + top_k_paths.append((viterbi_path, viterbi_score.item())) + best_paths.append(top_k_paths) + + if flatten_output: + return [top_k_paths[0] for top_k_paths in best_paths] + + return best_paths \ No newline at end of file diff --git a/dlkp/models/ke/crf/crf_trainer.py b/dlkp/models/ke/crf/crf_trainer.py new file mode 100644 index 0000000..d9b22df --- /dev/null +++ b/dlkp/models/ke/crf/crf_trainer.py @@ -0,0 +1,189 @@ +from transformers import ( + + Trainer, + set_seed, + +) +from transformers.trainer import * +from transformers.trainer_utils import PredictionOutput +from torch import nn +from torch.utils.data.dataloader import DataLoader +# from torch.utils.data.dataset import Dataset +# from typing import Any, Callable, Dict, List, Optional, Tuple, Union +class CRF_Trainer(Trainer): + def prediction_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> PredictionOutput: + """ + Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. + + Works both with or without labels. + """ + if not isinstance(dataloader.dataset, collections.abc.Sized): + raise ValueError("dataset must implement __len__") + prediction_loss_only = ( + prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only + ) + + if self.args.deepspeed and not self.args.do_train: + # no harm, but flagging to the user that deepspeed config is ignored for eval + # flagging only for when --do_train wasn't passed as only then it's redundant + logger.info("Detected the deepspeed argument but it will not be used for evaluation") + + model = self._wrap_model(self.model, training=False) + + # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while + # ``train`` is running, half it first and then put on device + if not self.is_in_train and self.args.fp16_full_eval: + model = model.half().to(self.args.device) + + batch_size = dataloader.batch_size + num_examples = self.num_examples(dataloader) + logger.info(f"***** Running {description} *****") + logger.info(f" Num examples = {num_examples}") + logger.info(f" Batch size = {batch_size}") + losses_host: torch.Tensor = None + preds_host: Union[torch.Tensor, List[torch.Tensor]] = None + labels_host: Union[torch.Tensor, List[torch.Tensor]] = None + + world_size = max(1, self.args.world_size) + + eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) + if not prediction_loss_only: + # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass + # a batch size to the sampler) + make_multiple_of = None + if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): + make_multiple_of = dataloader.sampler.batch_size + preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + if self.args.past_index >= 0: + self._past = None + model.eval() + + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) + + + + self.callback_handler.eval_dataloader = dataloader + + for step, inputs in enumerate(dataloader): + + loss, logits, labels = self.prediction_step(model, inputs, 
prediction_loss_only, ignore_keys=ignore_keys) + + best_path= self.eval_step(model, logits, inputs['attention_mask']) + # best_path= self.eval_step(model, logits) + # print(len(best_path), best_path[0]) + # logits= torch.zeros() + + best_path= [x for x,_ in best_path] + # print(best_path) + # seq_len= labels.shape[1] + logits*=0 + for i,path in enumerate(best_path): + # print(inputs['attention_mask'][i,0], labels[i,0], inputs['attention_mask'][i,-1], labels[i,-1]) + # print(len(x)) + for j, tag in enumerate(path): + logits[i,j,int(tag)]=1 + # print(inputs['attention_mask'][i,j], labels[i,j]) + + # logits= torch.tensor(data=best_path, dtype= labels.dtype, device= labels.device) + # if(logits.shape!=labels.shape): + # print(logits.shape,labels.shape) + # assert logits.shape==labels.shape + if loss is not None: + losses = loss.repeat(batch_size) + losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) + if logits is not None: + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + if labels is not None: + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + if not prediction_loss_only: + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + + # Set back to None to begin a new accumulation + losses_host, preds_host, labels_host = None, None, None + + if self.args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + if not prediction_loss_only: + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + + eval_loss = eval_losses_gatherer.finalize() + preds = preds_gatherer.finalize() if not prediction_loss_only else None + label_ids = labels_gatherer.finalize() if not prediction_loss_only else None + + if self.compute_metrics is not None and preds is not None and label_ids is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if eval_loss is not None: + metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) + def eval_step(self, + model: nn.Module, + logits, + mask= None, + top_k= None + + ): + with torch.no_grad(): + output= model.crf.viterbi_tags(logits, mask, top_k) + + return output + + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss 
is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. + """ + # if self.label_smoother is not None and "labels" in inputs: + # labels = inputs.pop("labels") + # else: + labels = None + # print(model) + # assert "labels" in inputs + # print(type(inputs),inputs) + outputs = model(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + loss = self.label_smoother(outputs, labels) + else: + # We don't use .loss here since the model may return tuples instead of ModelOutput. + # print(outputs) + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + # print("loss is ", loss) + return (loss, outputs) if return_outputs else loss diff --git a/dlkp/models/ke/crf/crf_utils.py b/dlkp/models/ke/crf/crf_utils.py new file mode 100644 index 0000000..295aeef --- /dev/null +++ b/dlkp/models/ke/crf/crf_utils.py @@ -0,0 +1,346 @@ +""" +Conditional random field utilis file +""" +from typing import List, Tuple, Dict, Union + +import torch + +# from allennlp.common.checks import ConfigurationError +# import allennlp.nn.util as util + +VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score + + +def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]: + """ + Given labels and a constraint type, returns the allowed transitions. It will + additionally include transitions for the start and end states, which are used + by the conditional random field. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + labels : `Dict[int, str]`, required + A mapping {label_id -> label}. Most commonly this would be the value from + Vocabulary.get_index_to_token_vocabulary() + # Returns + `List[Tuple[int, int]]` + The allowed transitions (from_label_id, to_label_id). + """ + num_labels = len(labels) + start_tag = num_labels + end_tag = num_labels + 1 + labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")] + + allowed = [] + for from_label_index, from_label in labels_with_boundaries: + if from_label in ("START", "END"): + from_tag = from_label + from_entity = "" + else: + from_tag = from_label[0] + from_entity = from_label[1:] + for to_label_index, to_label in labels_with_boundaries: + if to_label in ("START", "END"): + to_tag = to_label + to_entity = "" + else: + to_tag = to_label[0] + to_entity = to_label[1:] + if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity): + allowed.append((from_label_index, to_label_index)) + return allowed + + +def is_transition_allowed( + constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str +): + """ + Given a constraint type and strings `from_tag` and `to_tag` that + represent the origin and destination of the transition, return whether + the transition is allowed under the given constraint type. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + from_tag : `str`, required + The tag that the transition originates from. For example, if the + label is `I-PER`, the `from_tag` is `I`. + from_entity : `str`, required + The entity corresponding to the `from_tag`. 
For example, if the + label is `I-PER`, the `from_entity` is `PER`. + to_tag : `str`, required + The tag that the transition leads to. For example, if the + label is `I-PER`, the `to_tag` is `I`. + to_entity : `str`, required + The entity corresponding to the `to_tag`. For example, if the + label is `I-PER`, the `to_entity` is `PER`. + # Returns + `bool` + Whether the transition is allowed under the given `constraint_type`. + """ + + if to_tag == "START" or from_tag == "END": + # Cannot transition into START or from END + return False + + if constraint_type == "BIOUL": + if from_tag == "START": + return to_tag in ("O", "B", "U") + if to_tag == "END": + return from_tag in ("O", "L", "U") + return any( + [ + # O can transition to O, B-* or U-* + # L-x can transition to O, B-*, or U-* + # U-x can transition to O, B-*, or U-* + from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"), + # B-x can only transition to I-x or L-x + # I-x can only transition to I-x or L-x + from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity, + ] + ) + elif constraint_type == "BIO": + if from_tag == "START": + return to_tag in ("O", "B") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or B-x + to_tag in ("O", "B"), + # Can only transition to I-x from B-x or I-x + to_tag == "I" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "IOB1": + if from_tag == "START": + return to_tag in ("O", "I") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or I-x + to_tag in ("O", "I"), + # Can only transition to B-x from B-x or I-x, where + # x is the same tag. + to_tag == "B" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "BMES": + if from_tag == "START": + return to_tag in ("B", "S") + if to_tag == "END": + return from_tag in ("E", "S") + return any( + [ + # Can only transition to B or S from E or S. + to_tag in ("B", "S") and from_tag in ("E", "S"), + # Can only transition to M-x from B-x, where + # x is the same tag. + to_tag == "M" and from_tag in ("B", "M") and from_entity == to_entity, + # Can only transition to E-x from B-x or M-x, where + # x is the same tag. + to_tag == "E" and from_tag in ("B", "M") and from_entity == to_entity, + ] + ) + else: + print("error in constrint type") + + +def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> torch.Tensor: + """ + A numerically stable computation of logsumexp. This is mathematically equivalent to + `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log + probabilities. + # Parameters + tensor : `torch.FloatTensor`, required. + A tensor of arbitrary size. + dim : `int`, optional (default = `-1`) + The dimension of the tensor to apply the logsumexp to. + keepdim: `bool`, optional (default = `False`) + Whether to retain a dimension of size one at the dimension we reduce over. 
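+    # Example
+        (Illustrative.) `logsumexp(torch.tensor([0.0, 0.0]))` equals
+        log(exp(0) + exp(0)) = log(2) ≈ 0.6931, matching `torch.logsumexp`.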
+ """ + max_score, _ = tensor.max(dim, keepdim=keepdim) + if keepdim: + stable_vec = tensor - max_score + else: + stable_vec = tensor - max_score.unsqueeze(dim) + return max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log() + + + + +def viterbi_decode( + tag_sequence: torch.Tensor, + transition_matrix: torch.Tensor, + tag_observations: Optional[List[int]] = None, + allowed_start_transitions: torch.Tensor = None, + allowed_end_transitions: torch.Tensor = None, + top_k: int = None, +): + """ + Perform Viterbi decoding in log space over a sequence given a transition matrix + specifying pairwise (transition) potentials between tags and a matrix of shape + (sequence_length, num_tags) specifying unary potentials for possible tags per + timestep. + # Parameters + tag_sequence : `torch.Tensor`, required. + A tensor of shape (sequence_length, num_tags) representing scores for + a set of tags over a given sequence. + transition_matrix : `torch.Tensor`, required. + A tensor of shape (num_tags, num_tags) representing the binary potentials + for transitioning between a given pair of tags. + tag_observations : `Optional[List[int]]`, optional, (default = `None`) + A list of length `sequence_length` containing the class ids of observed + elements in the sequence, with unobserved elements being set to -1. Note that + it is possible to provide evidence which results in degenerate labelings if + the sequences of tags you provide as evidence cannot transition between each + other, or those transitions are extremely unlikely. In this situation we log a + warning, but the responsibility for providing self-consistent evidence ultimately + lies with the user. + allowed_start_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags the START token + may transition *to*. If provided, additional transition constraints will be used for + determining the start element of the sequence. + allowed_end_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags may transition *to* the + end tag. If provided, additional transition constraints will be used for determining + the end element of the sequence. + top_k : `int`, optional, (default = `None`) + Optional integer specifying how many of the top paths to return. For top_k>=1, returns + a tuple of two lists: top_k_paths, top_k_scores, For top_k==None, returns a flattened + tuple with just the top path and its score (not in lists, for backwards compatibility). + # Returns + viterbi_path : `List[int]` + The tag indices of the maximum likelihood tag sequence. + viterbi_score : `torch.Tensor` + The score of the viterbi path. + """ + if top_k is None: + top_k = 1 + flatten_output = True + elif top_k >= 1: + flatten_output = False + else: + raise ValueError(f"top_k must be either None or an integer >=1. 
Instead received {top_k}") + + sequence_length, num_tags = list(tag_sequence.size()) + + has_start_end_restrictions = ( + allowed_end_transitions is not None or allowed_start_transitions is not None + ) + + if has_start_end_restrictions: + + if allowed_end_transitions is None: + allowed_end_transitions = torch.zeros(num_tags) + if allowed_start_transitions is None: + allowed_start_transitions = torch.zeros(num_tags) + + num_tags = num_tags + 2 + new_transition_matrix = torch.zeros(num_tags, num_tags) + new_transition_matrix[:-2, :-2] = transition_matrix + + # Start and end transitions are fully defined, but cannot transition between each other. + + allowed_start_transitions = torch.cat( + [allowed_start_transitions, torch.tensor([-math.inf, -math.inf])] + ) + allowed_end_transitions = torch.cat( + [allowed_end_transitions, torch.tensor([-math.inf, -math.inf])] + ) + + # First define how we may transition FROM the start and end tags. + new_transition_matrix[-2, :] = allowed_start_transitions + # We cannot transition from the end tag to any tag. + new_transition_matrix[-1, :] = -math.inf + + new_transition_matrix[:, -1] = allowed_end_transitions + # We cannot transition to the start tag from any tag. + new_transition_matrix[:, -2] = -math.inf + + transition_matrix = new_transition_matrix + + if tag_observations: + if len(tag_observations) != sequence_length: + raise ConfigurationError( + "Observations were provided, but they were not the same length " + "as the sequence. Found sequence of length: {} and evidence: {}".format( + sequence_length, tag_observations + ) + ) + else: + tag_observations = [-1 for _ in range(sequence_length)] + + if has_start_end_restrictions: + tag_observations = [num_tags - 2] + tag_observations + [num_tags - 1] + zero_sentinel = torch.zeros(1, num_tags) + extra_tags_sentinel = torch.ones(sequence_length, 2) * -math.inf + tag_sequence = torch.cat([tag_sequence, extra_tags_sentinel], -1) + tag_sequence = torch.cat([zero_sentinel, tag_sequence, zero_sentinel], 0) + sequence_length = tag_sequence.size(0) + + path_scores = [] + path_indices = [] + + if tag_observations[0] != -1: + one_hot = torch.zeros(num_tags) + one_hot[tag_observations[0]] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[0, :].unsqueeze(0)) + + # Evaluate the scores for all possible paths. + for timestep in range(1, sequence_length): + # Add pairwise potentials to current scores. + summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix + summed_potentials = summed_potentials.view(-1, num_tags) + + # Best pairwise potential path score from the previous timestep. + max_k = min(summed_potentials.size()[0], top_k) + scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) + + # If we have an observation for this timestep, use it + # instead of the distribution over tags. + observation = tag_observations[timestep] + # Warn the user if they have passed + # invalid/extremely unlikely evidence. + if tag_observations[timestep - 1] != -1 and observation != -1: + if transition_matrix[tag_observations[timestep - 1], observation] < -10000: + logger.warning( + "The pairwise potential between tags you have passed as " + "observations is extremely unlikely. Double check your evidence " + "or transition potentials!" 
+ ) + if observation != -1: + one_hot = torch.zeros(num_tags) + one_hot[observation] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[timestep, :] + scores) + path_indices.append(paths.squeeze()) + + # Construct the most likely sequence backwards. + path_scores_v = path_scores[-1].view(-1) + max_k = min(path_scores_v.size()[0], top_k) + viterbi_scores, best_paths = torch.topk(path_scores_v, k=max_k, dim=0) + viterbi_paths = [] + for i in range(max_k): + viterbi_path = [best_paths[i]] + for backward_timestep in reversed(path_indices): + viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) + # Reverse the backward path. + viterbi_path.reverse() + + if has_start_end_restrictions: + viterbi_path = viterbi_path[1:-1] + + # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. + viterbi_path = [j % num_tags for j in viterbi_path] + viterbi_paths.append(viterbi_path) + + if flatten_output: + return viterbi_paths[0], viterbi_scores[0] + + return viterbi_paths, viterbi_scores \ No newline at end of file diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/transformer/token_classification_models.py b/dlkp/models/ke/transformer/token_classification_models.py new file mode 100644 index 0000000..e69de29 From 0b9918a58d0f3c8e2b96c6090dc78db2867e74ef Mon Sep 17 00:00:00 2001 From: Amardeep Date: Sat, 22 Jan 2022 20:20:51 +0530 Subject: [PATCH 02/17] add notebooks from KPT --- notebooks/klm_preprocess.ipynb | 702 +++++++++++++++++++++++++++++ notebooks/tranKP.ipynb | 801 +++++++++++++++++++++++++++++++++ 2 files changed, 1503 insertions(+) create mode 100644 notebooks/klm_preprocess.ipynb create mode 100644 notebooks/tranKP.ipynb diff --git a/notebooks/klm_preprocess.ipynb b/notebooks/klm_preprocess.ipynb new file mode 100644 index 0000000..a8fa5e1 --- /dev/null +++ b/notebooks/klm_preprocess.ipynb @@ -0,0 +1,702 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "klm_preprocess.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LEHkPwSWLBBD", + "outputId": "2adf486e-ff7b-4750-ec88-b8d0933f951d" + }, + "source": [ + "!pip install transformers\n", + "!pip install datasets\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (4.1.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", + "Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.43)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) 
(20.8)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: tokenizers==0.9.4 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.9.4)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.19.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.12.5)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.0.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n", + "Requirement already satisfied: datasets in /usr/local/lib/python3.6/dist-packages (1.2.0)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from datasets) (0.3.3)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.6/dist-packages (from datasets) (1.19.4)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from datasets) (0.8)\n", + "Requirement already satisfied: pyarrow>=0.17.1 in /usr/local/lib/python3.6/dist-packages (from datasets) (2.0.0)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.6/dist-packages (from datasets) (0.70.11.1)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from datasets) (1.1.5)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.6/dist-packages (from datasets) (2.0.0)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.6/dist-packages (from datasets) (2.23.0)\n", + "Requirement already satisfied: tqdm<4.50.0,>=4.27 in /usr/local/lib/python3.6/dist-packages (from datasets) (4.41.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2018.9)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2.8.1)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in 
/usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2020.12.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pSvgoduMLGEp" + }, + "source": [ + "from transformers import RobertaForMaskedLM\n", + "from transformers import RobertaTokenizer, PreTrainedTokenizer\n", + "from transformers import RobertaConfig" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YoVheMQrnEmb" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "C3oHJencLGMb" + }, + "source": [ + "#config\n", + "config = RobertaConfig(\n", + " vocab_size=52_000,\n", + " max_position_embeddings=514,\n", + " num_attention_heads=12,\n", + " num_hidden_layers=6,\n", + " type_vocab_size=1,\n", + ")\n", + "\n", + "#model roberta\n", + "model = RobertaForMaskedLM(config=config)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7yxtwm9xMYN3" + }, + "source": [ + "**Code requirement**\n", + "\n", + "\n", + "1. Dataset class :\n", + "\n", + " load and tokenize dataset->> input ids\n", + "\n", + " *look if nlp dataset library could be used here easily*\n", + "\n", + " tokenize key phrase as well as text and mask key phrase in data collator\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "2. Data collator for masked LM\n", + "\n", + " takes a list of samples from a Dataset and collate them into a batch for (also masking and stuffs)\n", + "\n", + " Refrence class: DataCollatorForWholeWordMask\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7kKlX2aRM0k2" + }, + "source": [ + "**Dataset class**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9vE_U_IuLGP9" + }, + "source": [ + "from torch.utils.data.dataset import Dataset\n", + "import json, os" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "g2CG9HfpNmLs" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jWl2QW0TNmRE" + }, + "source": [ + "class KLMDataset(Dataset):\n", + "\n", + " def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):\n", + " assert os.path.isfile(file_path)\n", + "\n", + " # logger.info(\"Creating features from dataset file at %s\", file_path)\n", + " self.abst= []\n", + " self.kps= []\n", + " with open(file_path, encoding=\"utf-8\") as f:\n", + " for line in f:\n", + " d=json.loads(line)\n", + " self.abst.append(d['text'])\n", + " self.kps.append(d['kp'])\n", + "\n", + " for (i,kp) in enumerate(self.kps):\n", + " self.kps[i]= tokenizer(kp,add_special_tokens= False, truncation= False)['input_ids']\n", + " \n", + "\n", + " self.abst = tokenizer(self.abst, add_special_tokens=True, truncation=True, max_length=block_size)[\"input_ids\"]\n", + " \n", + "\n", + " def __len__(self):\n", + " return len(self.abst)\n", + "\n", + " def __getitem__(self, i):\n", + " # print(\"called {} and results{}\\n\".format(i,{'input_ids': self.abst[i], 'kp': self.kps[i]}))\n", + " return {'input_ids': self.abst[i], 'kp': self.kps[i]}\n", + "\n", + "# super daset from HF\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xs-wm7RVNmUP" + }, + 
"source": [ + "tok= RobertaTokenizer.from_pretrained(\"roberta-base\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "991fGSu8hW-n" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-gLTyA_0hvMU" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YIXBbF_ZYqCF" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "f5NOBJhfNmXQ" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x_GyWRwqM5KI" + }, + "source": [ + "**Data collator**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "W5FcKIbkLGS2" + }, + "source": [ + "from transformers import DataCollatorForLanguageModeling\n", + "from dataclasses import dataclass\n", + "from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union\n", + "import torch\n", + "from transformers.data.data_collator import _collate_batch, tolist\n", + "import random" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xZMjWP8mLGVz" + }, + "source": [ + "@dataclass\n", + "class DataCollatorForKLM(DataCollatorForLanguageModeling):\n", + " def __init__(self, \n", + " tokenizer: PreTrainedTokenizer,\n", + " mlm_probability= 0.15,\n", + " kp_mask_percentage = 0.8):\n", + " self.tokenizer= tokenizer\n", + " self.mlm_probability= mlm_probability\n", + " self.kp_mask_percentage = kp_mask_percentage\n", + "\n", + " def __call__(\n", + " self, examples\n", + " ) -> Dict[str, torch.Tensor]:\n", + " print(\"collator \",examples)\n", + " if isinstance(examples[0], dict):\n", + " print(examples[0])\n", + " input_ids = [e[\"input_ids\"] for e in examples]\n", + " key_phrases= [e[\"labels\"] for e in examples]\n", + " else:\n", + " print(\"proper inputr fromat is not found for kp input ids\")\n", + " \n", + "\n", + " batch_input = _collate_batch(input_ids, self.tokenizer)\n", + "\n", + " mask_labels = []\n", + " kp_mask_labels= []\n", + " for e in examples:\n", + " ref_tokens = []\n", + " kp_tokens_list= []\n", + " for id in tolist(e[\"input_ids\"]):\n", + " token = self.tokenizer._convert_id_to_token(id)\n", + " ref_tokens.append(token)\n", + " for kp in tolist(e[\"labels\"]):\n", + " curr_kp= []\n", + " for kp_id in kp:\n", + " tok= self.tokenizer._convert_id_to_token(kp_id)\n", + " curr_kp.append(tok)\n", + " if len(curr_kp) >0:\n", + " kp_tokens_list.append(curr_kp)\n", + " mask_res= self.kp_and_whole_word_mask(ref_tokens, kp_tokens_list) #[[\"KP1-T1\", \"KP1-T2\"], [\"KP2-T1\", \"KP2-T2\", \"KP2-T3\"]] \n", + " mask_labels.append(mask_res[0])\n", + " kp_mask_labels.append(mask_res[1])\n", + " #collate\n", + " batch_mask = _collate_batch(mask_labels, self.tokenizer)\n", + " kp_batch_mask= _collate_batch(kp_mask_labels, self.tokenizer)\n", + " #mask\n", + " inputs, labels = self.mask_tokens_and_kp(batch_input, batch_mask, kp_batch_mask)\n", + "\n", + " return {\"input_ids\": inputs, \"labels\": labels}\n", + "\n", + " def kp_and_whole_word_mask(self, input_tokens, kp_tokens_list, max_predictions=512):\n", + " \"\"\"\n", + " Get 0/1 labels for masked tokens with whole word mask proxy\n", + " \"\"\"\n", + "\n", + " cand_indexes = []\n", + " kp_indexes= []\n", + " for (i, token) in enumerate(input_tokens):\n", + " if token == \"[CLS]\" 
or token == \"[SEP]\":\n", + " continue\n", + " kp_flag = False\n", + " for kp in kp_tokens_list: # kp = [\"KP1-T1\", \"KP1-T2\"]\n", + " j= i + len(kp)\n", + " if j < len(input_tokens):\n", + " if input_tokens[i:j]== kp: # input_tokens = [\"KP1-T1\", \"KP1-T2\"]\n", + " kp_indexes.append([x for x in range(i,j)]) # kp_indexes = [\"index of KP1-T1\", \"index of KP1-T2\"]\n", + " i=j-1\n", + " kp_flag= True\n", + " break\n", + " if kp_flag: #if token is included in kp mask then don't include in random token mask\n", + " continue\n", + " if len(cand_indexes) >= 1 and token.startswith(\"##\"):\n", + " cand_indexes[-1].append(i)\n", + " else:\n", + " cand_indexes.append([i])\n", + " \n", + " tok_to_predict= min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))\n", + " kp_to_predict= min(max_predictions, max(1, int(round(len(kp_tokens_list) * self.kp_mask_percentage))))\n", + "\n", + " tok_mask_labels= self.get_mask_labels(cand_indexes=cand_indexes, len_input_tokens=len(input_tokens), num_to_predict=tok_to_predict)\n", + " kp_mask_labels= self.get_mask_labels(cand_indexes=kp_indexes, len_input_tokens=len(input_tokens), num_to_predict=kp_to_predict)\n", + " return tok_mask_labels, kp_mask_labels\n", + "\n", + "\n", + " def get_mask_labels(self, cand_indexes, len_input_tokens, num_to_predict):\n", + " random.shuffle(cand_indexes)\n", + " masked_lms = []\n", + " covered_indexes = set()\n", + " for index_set in cand_indexes:\n", + " if len(masked_lms) >= num_to_predict:\n", + " break\n", + " # If adding a whole-word mask would exceed the maximum number of\n", + " # predictions, then just skip this candidate.\n", + " if len(masked_lms) + len(index_set) > num_to_predict:\n", + " continue\n", + " is_any_index_covered = False\n", + " for index in index_set:\n", + " if index in covered_indexes:\n", + " is_any_index_covered = True\n", + " break\n", + " if is_any_index_covered:\n", + " continue\n", + " for index in index_set:\n", + " covered_indexes.add(index)\n", + " masked_lms.append(index)\n", + "\n", + " assert len(covered_indexes) == len(masked_lms)\n", + " mask_labels = [1 if i in covered_indexes else 0 for i in range(len_input_tokens)]\n", + " return mask_labels\n", + "\n", + " def mask_tokens_and_kp(self, inputs, mask_labels, kp_mask_labels): \n", + " \"\"\"\n", + " Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set\n", + " 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.\n", + " \"\"\"\n", + "\n", + " if self.tokenizer.mask_token is None:\n", + " raise ValueError(\n", + " \"This tokenizer does not have a mask token which is necessary for masked language modeling. 
Remove the --mlm flag if you want to use this tokenizer.\"\n", + " )\n", + " labels = inputs.clone()\n", + " # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)\n", + "\n", + " probability_matrix = mask_labels\n", + " kp_probability_matrix = kp_mask_labels\n", + "\n", + " special_tokens_mask = [\n", + " self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()\n", + " ]\n", + " # do zero for special tokens\n", + " probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)\n", + " kp_probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)\n", + "\n", + " # assert kp_probability_matrix & probability_matrix == 0\n", + " # do zero for padded points\n", + " if self.tokenizer._pad_token is not None:\n", + " padding_mask = labels.eq(self.tokenizer.pad_token_id)\n", + " probability_matrix.masked_fill_(padding_mask, value=0.0)\n", + " kp_probability_matrix.masked_fill_(padding_mask, value=0.0)\n", + "\n", + " masked_indices = probability_matrix.bool()\n", + " kp_masked_indices = kp_probability_matrix.bool()\n", + " # get the gold lables\n", + " labels[~(masked_indices | kp_masked_indices)] = -100 # We only compute loss on random masked tokens and kp masked token else is set to -100\n", + "\n", + " # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])\n", + " indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices\n", + " inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)\n", + " # 80 % masking for key phrases\n", + " kp_indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & kp_masked_indices\n", + " inputs[kp_indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)\n", + " # generate random tokens\n", + " random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)\n", + " # 10% of the time, we replace masked input tokens with random word\n", + " indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced\n", + " inputs[indices_random] = random_words[indices_random]\n", + "\n", + " # replace 10 # kp tokens with random idices\n", + " kp_indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & kp_masked_indices & ~kp_indices_replaced\n", + " inputs[kp_indices_random] = random_words[kp_indices_random]\n", + " # The rest of the time (10% of the time) we keep the masked input tokens unchanged\n", + " # print(\"inside mask tok functiom \\n\",inputs,\"\\n\", labels,\"\\n\")\n", + "\n", + " # generation - t1, t2, t3 (actual) - [MASK], t4 [MASK], t5, t6\n", + " # replacement - t1, t2, t3 (actual) - [MASK], t4 [MASK], t5, t6 (replace) t9\n", + " \n", + " return inputs, labels\n", + "\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "9azrJU6UuPFt" + }, + "source": [ + "from datasets import load_dataset\n", + "def load_klm_dataset(tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):\n", + " \n", + " def pre_process(d):\n", + " kp_pro= tokenizer(d['kp'],add_special_tokens= False, truncation= False)[\"input_ids\"]\n", + " d['input_ids']= tokenizer(d['text'], add_special_tokens=True, truncation=True, max_length=block_size)[\"input_ids\"]\n", + " d['labels'] = kp_pro\n", + " # print(\"inn 
inn\",d['kp'])\n", + " return d\n", + "\n", + "\n", + " dataset = load_dataset('json', data_files= file_path, split='train' )\n", + " dataset= dataset.map(pre_process)\n", + " # print(\"inn \", dataset)\n", + " dataset.set_format(columns=[ 'labels', 'input_ids'])\n", + "\n", + " return dataset\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "9d2Zn2PJvQhl" + }, + "source": [ + "# tok(['iam mam'],add_special_tokens= False, truncation= False)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qa4ArAI0kbs_", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "bcc7ec82-6a79-416b-83c4-a9bf904e2536" + }, + "source": [ + "\n", + "# data_set= KLMDataset(tokenizer=tok, file_path=\"/content/dummy.txt\", block_size= 200)\n", + "data_set = load_klm_dataset(tokenizer= tok, file_path= \"/content/train.json\", block_size= 124)\n", + "# data_set.set_format(columns=[ 'kp', 'input_ids'])\n", + "dc= DataCollatorForKLM(tokenizer= tok)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Using custom data configuration default\n", + "Reusing dataset json (/content/json/default-16dd99a81353c724/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)\n", + "Loading cached processed dataset at /content/json/default-16dd99a81353c724/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-6d89745262ae7664.arrow\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mEo-CRzkLGYQ" + }, + "source": [ + "from transformers import Trainer, TrainingArguments\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=\"/content\",\n", + " overwrite_output_dir=True,\n", + " num_train_epochs=1,\n", + " per_gpu_train_batch_size=64,\n", + " save_steps=10_000,\n", + " save_total_limit=2, # need to save all the models\n", + ")\n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "3rvp2Yk5LGa7" + }, + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=dc,\n", + " train_dataset= data_set\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Cv-vnqziLGda", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 201 + }, + "outputId": "21fccf71-22e9-4403-b8e0-2bd1f86d60fa" + }, + "source": [ + "trainer.train()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.\n", + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.\n", + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. 
Using `--per_device_train_batch_size` is preferred.\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "collator [{'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 
66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}]\n", + "{'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " \n", + " [1/1 00:00, Epoch 1/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss

" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=1, training_loss=10.954537391662598)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 159 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "DDsc-Ld-LGgM" + }, + "source": [ + "# Data format - {\"text\": ....., \"keyphrases\": [{\"surface_form\": ..., \"start\": ..., \"end\": ...}]}\n", + "# format - jsonl one json per line\n", + "# dir - 1.jsonl, 2.jsonl, 3.jsonl" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ncOg07xPLGkN" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tC3T7Y35LGnQ" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/tranKP.ipynb b/notebooks/tranKP.ipynb new file mode 100644 index 0000000..7835b24 --- /dev/null +++ b/notebooks/tranKP.ipynb @@ -0,0 +1,801 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "tranKP.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HXDqBqrdaoNw", + "outputId": "4e45f4d4-324f-44a0-f289-62479ccd56ef" + }, + "source": [ + "!pip install transformers\n", + "!pip install sentencepiece\n", + "!pip install datasets" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting transformers\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)\n", + "\u001b[K |████████████████████████████████| 1.5MB 4.2MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.19.4)\n", + "Collecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n", + "\u001b[K |████████████████████████████████| 890kB 17.1MB/s \n", + "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.8)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8)\n", + "Collecting tokenizers==0.9.4\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)\n", + "\u001b[K |████████████████████████████████| 2.9MB 21.5MB/s \n", + "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.6/dist-packages 
(from sacremoses->transformers) (1.15.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.0.0)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.12.5)\n", + "Building wheels for collected packages: sacremoses\n", + " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=5eee5bbfe2f9124d4f5d0c0332e4124d253f5989149682918253b6700d942717\n", + " Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n", + "Successfully built sacremoses\n", + "Installing collected packages: sacremoses, tokenizers, transformers\n", + "Successfully installed sacremoses-0.0.43 tokenizers-0.9.4 transformers-4.1.1\n", + "Collecting sentencepiece\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)\n", + "\u001b[K |████████████████████████████████| 1.1MB 5.9MB/s \n", + "\u001b[?25hInstalling collected packages: sentencepiece\n", + "Successfully installed sentencepiece-0.1.94\n", + "Collecting datasets\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ee/78/5873ac1e27bf25a2cbf3447d6704edd3136b1b3ff0eb3bfab38a45d2a1ff/datasets-1.2.0-py3-none-any.whl (159kB)\n", + "\u001b[K |████████████████████████████████| 163kB 4.1MB/s \n", + "\u001b[?25hRequirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from datasets) (0.8)\n", + "Requirement already satisfied: tqdm<4.50.0,>=4.27 in /usr/local/lib/python3.6/dist-packages (from datasets) (4.41.1)\n", + "Collecting xxhash\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f7/73/826b19f3594756cb1c6c23d2fbd8ca6a77a9cd3b650c9dec5acc85004c38/xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242kB)\n", + "\u001b[K |████████████████████████████████| 245kB 6.1MB/s \n", + "\u001b[?25hCollecting pyarrow>=0.17.1\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d7/e1/27958a70848f8f7089bff8d6ebe42519daf01f976d28b481e1bfd52c8097/pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7MB)\n", + "\u001b[K |████████████████████████████████| 17.7MB 1.5MB/s \n", + "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from datasets) (0.3.3)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.6/dist-packages (from datasets) (1.19.4)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.6/dist-packages (from datasets) (0.70.11.1)\n", + "Requirement already 
satisfied: requests>=2.19.0 in /usr/local/lib/python3.6/dist-packages (from datasets) (2.23.0)\n",
+        "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from datasets) (1.1.5)\n",
+        "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
+        "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2020.12.5)\n",
+        "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2.10)\n",
+        "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n",
+        "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2.8.1)\n",
+        "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2018.9)\n",
+        "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n",
+        "Installing collected packages: xxhash, pyarrow, datasets\n",
+        "  Found existing installation: pyarrow 0.14.1\n",
+        "    Uninstalling pyarrow-0.14.1:\n",
+        "      Successfully uninstalled pyarrow-0.14.1\n",
+        "Successfully installed datasets-1.2.0 pyarrow-2.0.0 xxhash-2.0.0\n"
+      ],
+      "name": "stdout"
+     }
+    ]
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "r3IcbMLKYCz7"
+    },
+    "source": [
+     "# utils\n",
+     "import os, sys\n",
+     "import argparse\n",
+     "from dataclasses import dataclass, field\n",
+     "from typing import Dict, List, Optional\n",
+     "@dataclass\n",
+     "class BasicKPArgs:\n",
+     "    model_type : Optional[str] = field(\n",
+     "        default=\"enc_dec\",\n",
+     "        metadata= {\"help\": \"encoder-decoder type, or another generative model such as BART\"}\n",
+     "    )\n",
+     "\n",
+     "    model_name_path : Optional[str] = field(\n",
+     "        default= None,\n",
+     "        metadata= {\"help\": \"path or name used to load a pretrained model, or a checkpoint\"}\n",
+     "    )\n",
+     "    decoder_model_name_path : Optional[str] = field(\n",
+     "        default= None,\n",
+     "        metadata= {\"help\": \"path or name of the decoder part of the model in the enc_dec architecture\"}\n",
+     "    )\n",
+     "    tokenizer_path : Optional[str] = field(\n",
+     "        default= None,\n",
+     "        metadata= {\"help\": \"path or name of a saved custom tokenizer; if provided this tokenizer will be loaded, else the auto tokenizer\"}\n",
+     "    )\n",
+     "    data_dir : Optional[str] = field(\n",
+     "        default= \"\",\n",
+     "        metadata= {\"help\": \"path to dir containing data\"}\n",
+     "    )\n",
+     "    kp_task_type : Optional[str] = field(\n",
+     "        default= \"one2one\",\n",
+     "        metadata= {\"help\": \"whether to use one2one or one2many\"}\n",
+     "    )\n",
+     "    max_src_len : Optional[int] = field(\n",
+     "        default= 512,\n",
+     "        metadata= {\"help\": \"maximum length of the source seq\" }\n",
+     "    )\n",
+     "    max_tar_len : Optional[int] = field(\n",
+     "        default= 64,\n",
+     "        metadata= {\"help\": \"maximum length of the target seq\" }\n",
+     "    )\n",
+     "    # this is parsed from training args\n",
+     "    # out_dir: Optional[str] = field(\n",
+     "    #     default= \"\",\n",
+     "    #     metadata= {\"help\": \"path of the dir used to save trained weights and output\"}\n",
+     "    # )\n",
+     "    from_pretrained : Optional[bool] = field(\n",
+     "        default= True,\n",
+     "        metadata= {\"help\": \"whether to load model weights from a pretrained checkpoint or to start from scratch\"}\n",
+     "    )\n",
+     "    predict_only : Optional[bool] = field(\n",
+     "        default= False,\n",
+     "        metadata= {\"help\": \"whether to only predict, or to train, validate and predict\"}\n",
+     "    )\n",
+     "    dataset_class : Optional[str] = field(\n",
+     "        default= \"single\",\n",
+     "        metadata= {\"help\": \"single | multiple : type of dataset reader to use; read train data split into multiple train files or from a single one\" }\n",
+     "    )"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
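+   {
+    "cell_type": "code",
+    "metadata": {},
+    "source": [
+     "# Hypothetical input sketch (illustration only; the values are made up):\n",
+     "# the one2many loaders below expect JSONL with a \"text\" string and a \"kp\" list\n",
+     "# per line, e.g.\n",
+     "# {\"text\": \"We study keyphrase generation with seq2seq models.\", \"kp\": [\"keyphrase generation\", \"seq2seq models\"]}\n",
+     "# KPone2manyDataset / tok_and_process join the \"kp\" entries with kp_sep_token\n",
+     "# into a single target string before tokenization."
+    ],
+    "execution_count": null,
+    "outputs": []
+   },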
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "TgUthX_SaXRY"
+    },
+    "source": [
+     "# dataset\n",
+     "import os, sys\n",
+     "import torch\n",
+     "import json\n",
+     "from torch.utils.data.dataset import Dataset\n",
+     "class KPone2manyDataset(Dataset):\n",
+     "    def __init__(self, tokenizer, file_path, max_src_len, max_tar_len, kp_sep_token = \"\"):\n",
+     "        '''\n",
+     "        file should contain one json per line with\n",
+     "        \"text\": string and \"kp\": list[str] containing all keyphrases\n",
+     "        '''\n",
+     "        assert os.path.exists(file_path)\n",
+     "        self.abst= []\n",
+     "        self.kps= []\n",
+     "        self.src_attn_mask = []\n",
+     "        self.tokenizer = tokenizer\n",
+     "        with open(file_path, encoding=\"utf-8\") as f:\n",
+     "            for line in f:\n",
+     "                d=json.loads(line)\n",
+     "                self.abst.append(d['text'])\n",
+     "                curr_kp= \"\"\n",
+     "                for (i,kp) in enumerate(d['kp']):\n",
+     "                    if i !=0:\n",
+     "                        curr_kp += \" \" + kp_sep_token +\" \"\n",
+     "                    curr_kp += kp.strip()\n",
+     "\n",
+     "                self.kps.append(curr_kp)\n",
+     "\n",
+     "        assert len(self.kps) == len(self.abst)\n",
+     "        self.ex_len= len(self.abst)\n",
+     "        self.kps= self.tokenizer.batch_encode_plus(self.kps, truncation=True, max_length= max_tar_len, pad_to_max_length= True)\n",
+     "        self.abst= self.tokenizer.batch_encode_plus(self.abst, truncation=True, max_length= max_src_len, pad_to_max_length= True)\n",
+     "\n",
+     "    def __len__(self):\n",
+     "        return self.ex_len\n",
+     "\n",
+     "    def __getitem__(self, i):\n",
+     "        return {\n",
+     "            'src_ids': torch.tensor(self.abst['input_ids'][i]),\n",
+     "            'tar_ids': torch.tensor(self.kps['input_ids'][i]),\n",
+     "            'src_attn': torch.tensor(self.abst['attention_mask'][i]),\n",
+     "            'tar_attn': torch.tensor(self.kps['attention_mask'][i])\n",
+     "        }\n",
+     "\n",
+     "# class kpone2manyMultiDataset(Dataset):\n",
+     "#     def __init__(self, tokenizer, data_dir, file_prefix, n=10000, max_src_len, max_tar_len, kp_sep_token = \"\"):\n",
+     "#         self.tokenizer = tokenizer\n",
+     "#         self.data_dir = data_dir\n",
+     "#         self.file_prefix = file_prefix\n",
+     "#         self.total_examples = n\n",
+     "#         self.max_src_len = max_src_len\n",
+     "#         self.max_tar_len = max_tar_len\n",
+     "#         self.kp_sep_token = kp_sep_token\n",
+     "\n",
+     "#         assert os.path.exists(self.data_dir)\n",
+     "\n",
+     "#         pass\n",
+     "#     def read_files(self):\n",
+     "#         pass\n",
+     "\n",
+     "#     def __len__(self):\n",
+     "#         pass\n",
+     "\n",
+     "#     def __getitem__(self,i):\n",
+     "#         pass\n",
+     "\n",
+     "\n",
+     "# super dataset class\n",
+     "def load_kp_data_and_dataset_class( tokenizer, file_path, max_src_len, max_tar_len, kp_sep_token = \"\"):\n",
+     "    from datasets import load_dataset\n",
+     "    def tok_and_process(d):\n",
+     "        curr_kp= \"\"\n",
+     "        for (i,kp) in enumerate(d['kp']):\n",
+     "            if i !=0:\n",
+     "                curr_kp += \" \" + kp_sep_token +\" \"\n",
+     "            curr_kp += kp.strip()\n",
+     "        src_encode= tokenizer(d['text'], truncation=True, max_length= max_src_len, pad_to_max_length= True)\n",
+     "        tar_encode= tokenizer(curr_kp, truncation=True, max_length= max_tar_len, pad_to_max_length= True)\n",
+     "        d['input_ids'] = src_encode['input_ids']\n",
+     "        d['decoder_input_ids']= tar_encode['input_ids']\n",
+     "        d['attention_mask']= src_encode['attention_mask']\n",
+     "        # d['tar_attn'] = tar_encode['attention_mask']\n",
+     "\n",
+     "        return d\n",
+     "\n",
+     "\n",
+     "    dataset = load_dataset('json', data_files= file_path, split='train')\n",
+     "    dataset= dataset.map(tok_and_process)\n",
+     "    dataset.set_format(type='torch', columns=['input_ids', 'decoder_input_ids', 'attention_mask'])\n",
+     "\n",
+     "    return dataset\n",
+     "\n"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "zL3jsUs2aXUI"
+    },
+    "source": [
+     "# collate\n",
+     "import os, sys\n",
+     "import torch\n",
+     "class TPBDataCollator():\n",
+     "    def __init__(self, tokenizer, need_to_shift= False, start_tok_id= None):\n",
+     "        self.tokenizer = tokenizer\n",
+     "        self.shift_right= need_to_shift\n",
+     "        self.dec_start_tok_id= self.tokenizer.pad_token_id if start_tok_id is None else start_tok_id # generally the same as the pad token id\n",
+     "\n",
+     "    def __call__(self, ex):\n",
+     "        # print(ex)\n",
+     "        src_ids= torch.stack([e['input_ids'] for e in ex])\n",
+     "        tar_ids= torch.stack([e['decoder_input_ids'] for e in ex])\n",
+     "        src_attn_mask= torch.stack([e['attention_mask'] for e in ex])\n",
+     "        # src_ids= [e['src_ids'] for e in ex]\n",
+     "        # tar_ids= [e['tar_ids'] for e in ex]\n",
+     "        # src_attn_mask= [e['src_attn'] for e in ex]\n",
+     "        # tar_attn_mask = torch.stack([e['tar_attn'] for e in ex])\n",
+     "        # create labels for the loss calculation\n",
+     "        labels= tar_ids.clone()\n",
+     "        labels[labels[:]== self.tokenizer.pad_token_id] = -100 # ignore the loss at pad token ids\n",
+     "\n",
+     "        # get decoder input ids\n",
+     "\n",
+     "        if self.shift_right: # either shift right here (t5), or pass decoder ids as None for bart/pegasus and they will create decoder ids by shifting the labels to the right\n",
+     "            decoder_ids= self.right_shift(tar_ids)\n",
+     "\n",
+     "        else:\n",
+     "            decoder_ids= tar_ids\n",
+     "\n",
+     "        batch = {\n",
+     "            \"input_ids\": src_ids,\n",
+     "            \"attention_mask\" : src_attn_mask,\n",
+     "            \"decoder_input_ids\": decoder_ids,\n",
+     "            \"labels\": labels\n",
+     "        }\n",
+     "\n",
+     "        return batch\n",
+     "\n",
+     "    def right_shift(self, input_ids):\n",
+     "        pad_token_id= self.dec_start_tok_id # same as the pad token id\n",
+     "        prev_output_tokens = input_ids.clone()\n",
+     "        assert pad_token_id is not None, \"self.model.config.pad_token_id has to be defined.\"\n",
+     "        # replace possible -100 values in labels by `pad_token_id`\n",
+     "        prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)\n",
+     "\n",
+     "        index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)\n",
+     "        decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()\n",
+     "        prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()\n",
+     "        prev_output_tokens[:, 0] = decoder_start_tokens\n",
+     "        return prev_output_tokens\n",
+     "\n"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "W8r6v-NJaXWs"
+    },
+    "source": [
+     "# main\n",
+     "import os, sys\n",
+     "# from utils import arg_parse\n",
+     "from transformers import (\n",
+     "    AutoTokenizer,\n",
+     "    EncoderDecoderModel,\n",
+     "    BartTokenizerFast,\n",
+     "    AutoModelForSeq2SeqLM,\n",
+     "    AutoConfig,\n",
+     "    Trainer,\n",
+     "    TrainingArguments,\n",
+     "    HfArgumentParser\n",
+     ")\n",
+     "\n",
+     "# from dataset_fn import *\n",
+     "# from collate_fn import *\n",
+     "\n",
+     "\n",
+     "COLLATE_DICT= {\n",
+     "    't5': TPBDataCollator\n",
+     "\n",
+     "\n",
+     "}\n",
+     "\n",
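+     "# A hypothetical usage sketch (illustration only; names come from the cells above):\n",
+     "# collate = COLLATE_DICT['t5'](tokenizer= tokenizer, need_to_shift= True)\n",
+     "# batch = collate([train_data_set[0], train_data_set[1]])  # dict with input_ids, attention_mask, decoder_input_ids, labels\n",
+     "# right_shift mirrors the older HF shift_tokens_right: it moves every target one\n",
+     "# step right and seeds position 0 with the gathered final non-pad token, so the\n",
+     "# decoder predicts token t from tokens < t.\n",
+     "\n",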
"\n", + "}\n", + "\n", + "DATASET_DICT= {\n", + " # 'one2many_single': KPone2manyDataset\n", + " 'one2many_single': load_kp_data_and_dataset_class\n", + "\n", + "}\n", + "\n", + "CONFIG_MAP = {\n", + "\n", + "}\n", + "\n", + "\n", + "TOKENIZER_MAP = {\n", + " \n", + "\n", + "}\n", + "\n", + "MODEL_MAP = {\n", + "\n", + "}\n", + "# TODO\n", + "# modify tokenizer in main function if there is requirement of special token addition and stuff\n", + "# chek if there is crosss ateention enabled in decoder part of the model and its working\n", + "# see if special token needed and shifting or other requirement->>>> one at a time\n", + "# 1. bart model\n", + "# 2. t5\n", + "# 3. pegasus \n", + "# add token in every tokenizer and keeep rest same, qg has better logic\n", + "#token shifting in bart t5 pegasus\n", + " # t5 tokenizer genrate token as required( there is need to shift right), but bart and pegasus add cls/bos and sep/eos in start and end and it also shifts automatically\n", + " # for bart and pegasus simply copying target seq as labels and target seq as decodeer ip could be tried as these model automatically shift to right\n", + " #final: bart shifts label (i.e target seq ) to right if passed decoder ip ids is none so only labels and input ids can be passed can be passed. if you want you cann remove [cls]/[sep] token as required\n", + " #pegasus; same as bart\n", + "#encode decoder: look for shifting\n", + " # cls can be use as bos and sep as eos: this is mentioned in HF blogs\n", + "# how to levare seq2seq trainer or trainer directly\n", + " # trainer and seq2seq trainer seems to be the same thing, we can try them alternative and can see which is best\n", + " # \n", + "\n", + "# add compute metrics\n", + "\n", + "# add do predict and generate function option\n", + "\n", + "def main_fn(args= None, training_args = None):\n", + " #ars parsing\n", + " # parser= HfArgumentParser((BasicKPArgs, TrainingArguments))\n", + " # args , training_args = parser.parse_args_into_dataclasses()\n", + " \n", + " #load tokenizer\n", + " if args.tokenizer_path is not None:\n", + " tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)\n", + " else:\n", + " tokenizer= AutoTokenizer.from_pretrained(args.model_name_path)\n", + " tokenizer.add_tokens([''])\n", + " # tokenizer.sep_token = \"\"\n", + " #save tokenizer\n", + " tok_path= training_args.output_dir+\"/kp_{}_tokenizer\".format(args.model_name_path )\n", + " if not os.path.exists(tok_path):\n", + " os.mkdir(tok_path)\n", + " tokenizer.save_pretrained(tok_path)\n", + "\n", + " \n", + " #load model\n", + " if args.model_type == \"enc_dec\":\n", + " model =None\n", + " else:\n", + " if args.from_pretrained:\n", + " model = AutoModelForSeq2SeqLM.from_pretrained(\n", + " args.model_name_path\n", + " )\n", + " else:\n", + " config= AutoConfig.from_pretrained(args.model_name_path) #get the config file to load weight from scratch\n", + " model= AutoModelForSeq2SeqLM.from_config(config) #load model with random weight from config\n", + "\n", + " #resize model embedding\n", + " model.resize_token_embeddings(len(tokenizer))\n", + "\n", + " #freeze model embedding\n", + "\n", + " #datset class\n", + " \n", + " train_data_set= DATASET_DICT[args.kp_task_type+\"_\"+args.dataset_class](tokenizer= tokenizer, file_path= args.data_dir + \"/train.txt\", max_src_len= args.max_src_len, max_tar_len = args.max_tar_len)\n", + "\n", + " eval_data_set= DATASET_DICT[args.kp_task_type+\"_\"+args.dataset_class](tokenizer= tokenizer, file_path= args.data_dir+\"/val.txt\", 
max_src_len= args.max_src_len, max_tar_len = args.max_tar_len)\n", + " \n", + " # print(train_data_set)\n", + "\n", + " #data collator\n", + " data_collator= COLLATE_DICT[args.model_type](tokenizer= tokenizer, need_to_shift= True)\n", + "\n", + " trainer= Trainer(model= model,\n", + " args= training_args,\n", + " data_collator= data_collator,\n", + " train_dataset = train_data_set,\n", + " eval_dataset= eval_data_set,\n", + " # compute_metrics= None, # metrics to compute scores,\n", + "\n", + "\n", + " )\n", + " \n", + " if args.predict_only:\n", + " test_data_set = DATASET_DICT[args.kp_task_type+\"_\"+args.dataset_class](tokenizer= tokenizer, file_path= args.data_dir+\"/test.txt\", max_src_len= args.max_src_len, max_tar_len = args.max_tar_len)\n", + " \n", + " \n", + " trainer.train()\n", + "\n", + "\n", + " \n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SognzQU6aXY7" + }, + "source": [ + "def runner():\n", + " args= BasicKPArgs(\n", + " model_type = 't5',\n", + " model_name_path = 't5-base', #todo\n", + " data_dir= \"/content\", #todo\n", + " kp_task_type= \"one2many\",\n", + " dataset_class= 'single'\n", + " )\n", + " training_args = TrainingArguments(\n", + " output_dir= \"/content/tk_out\", #todo\n", + " overwrite_output_dir = True,\n", + " num_train_epochs = 2,\n", + " per_device_train_batch_size = 8,\n", + " do_eval= True,\n", + " evaluation_strategy = \"epoch\",\n", + " save_steps = 1\n", + " \n", + " \n", + "\n", + " )\n", + " main_fn(args, training_args)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/", + "height": 290 + }, + "id": "EMXA4Yi6aXbV", + "outputId": "f091bbbd-d403-48a9-8089-ce746dd66cb3" + }, + "source": [ + "runner()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n", + "- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Using custom data configuration default\n", + "Reusing dataset json (/root/.cache/huggingface/datasets/json/default-0a0c845d87888c0a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)\n", + "/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils_base.py:2179: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
512 for Bert).\n", + " FutureWarning,\n", + "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-0a0c845d87888c0a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-463e16185536cbff.arrow\n", + "Using custom data configuration default\n", + "Reusing dataset json (/root/.cache/huggingface/datasets/json/default-ea1a5f71c165584a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)\n", + "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-ea1a5f71c165584a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-cea35ead7b156669.arrow\n" + ], + "name": "stderr" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " \n", + " [2/6 : < :, Epoch 0.33/2]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation Loss

" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "error", + "ename": "RuntimeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/serialization.py\u001b[0m in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization)\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_open_zipfile_writer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mopened_zipfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 372\u001b[0;31m \u001b[0m_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopened_zipfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_protocol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 373\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/serialization.py\u001b[0m in \u001b[0;36m_save\u001b[0;34m(obj, zip_file, pickle_module, pickle_protocol)\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0mnum_bytes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0melement_size\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0mzip_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_record\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_ptr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_bytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrunner\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mrunner\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m )\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mmain_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mmain_fn\u001b[0;34m(args, training_args)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m 
\u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, model_path, trial)\u001b[0m\n\u001b[1;32m 836\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_step_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 837\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 838\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_log_save_evaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtr_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepoch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 839\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 840\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_epoch_stop\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_training_stop\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_maybe_log_save_evaluate\u001b[0;34m(self, tr_loss, model, trial, epoch)\u001b[0m\n\u001b[1;32m 908\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 909\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_save\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 910\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_save_checkpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 911\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 912\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_save_checkpoint\u001b[0;34m(self, model, trial, metrics)\u001b[0m\n\u001b[1;32m 941\u001b[0m \u001b[0mreissue_pt_warnings\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcaught_warnings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 942\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_world_process_zero\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 943\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"optimizer.pt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 944\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mwarnings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcatch_warnings\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcaught_warnings\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 945\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlr_scheduler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"scheduler.pt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/serialization.py\u001b[0m in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization)\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_open_zipfile_writer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mopened_zipfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 372\u001b[0m \u001b[0m_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopened_zipfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_protocol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 373\u001b[0;31m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 374\u001b[0m \u001b[0m_legacy_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopened_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_protocol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
375\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/serialization.py\u001b[0m in \u001b[0;36m__exit__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__exit__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 259\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_like\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_end_of_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 260\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:274] . unexpected pos 1606179904 vs 1606179792" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5CaaHimjIfOR" + }, + "source": [ + "# 3 eval_kp.py\n", + "import os, sys\n", + "# from utils import arg_parse\n", + "from transformers import (\n", + " AutoTokenizer,\n", + " EncoderDecoderModel,\n", + " BartTokenizerFast,\n", + " AutoModelForSeq2SeqLM,\n", + " AutoConfig,\n", + " Trainer,\n", + " TrainingArguments,\n", + " HfArgumentParser\n", + ")\n", + "\n", + "import torch\n", + "\n", + "\n", + "@dataclass\n", + "class EvalArgs:\n", + " model_type : Optional[str] = field(\n", + " default=\"enc_dec\",\n", + " metadata= {\"help\": \"encoder decoder type or other generative model like Bart\"}\n", + " )\n", + "\n", + " model_name_path : Optional[str] = field(\n", + " default= None,\n", + " metadata= {\"help\": \"path or name to load pretrained model or from checkpoints\"}\n", + " )\n", + " tokenizer_path : Optional[str] = field(\n", + " default= None,\n", + " metadata= {\"help\": \"path or name of custom tokenizer saved if provided this tokenizer will be loaded else auto tokenizer\"}\n", + " )\n", + " data_dir : Optional[str] = field(\n", + " default= \"\",\n", + " metadata= {\"help\": \"path to dir containg data\"}\n", + " )\n", + " kp_task_type : Optional[str] = field(\n", + " default= \"one2one\",\n", + " metadata= {\"help\": \"wether to use one2one or one2many\"}\n", + " )\n", + " dataset_class : Optional[str] = field(\n", + " default= \"single\",\n", + " metadata= {\"help\": \"single | multiple , type of dataset reader to use, split train data into mltiple train file or from single\" }\n", + " )\n", + " beam_size : Optional[int] = field(\n", + " \n", + " default= 4,\n", + " metadata= {\"help\": \"beam_size\" }\n", + " )\n", + " max_pre_len : Optional[int] = field(\n", + " \n", + " default= 64,\n", + " metadata= {\"help\": \"length of target seq\" }\n", + " )\n", + " max_src_len : Optional[int] = field(\n", + " \n", + " default= 512,\n", + " metadata= {\"help\": \"length of source seq\" }\n", + " )\n", + "\n", + "COLLATE_DICT= {\n", + " 't5': TPBDataCollator\n", + "\n", + "\n", + "}\n", + "def main_eval(args= None):\n", + " # p = HfArgumentParser((EvalArgs,))\n", + " # args= p.parse_args_into_dataclasses()[0]\n", + " \n", + "\n", + " device= 'cuda' if 
torch.cuda.is_available() else 'cpu'\n",
+     "    print(\"device \", device)\n",
+     "    if args.tokenizer_path is not None:\n",
+     "        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path )\n",
+     "    else:\n",
+     "        tokenizer= AutoTokenizer.from_pretrained(args.model_name_path)\n",
+     "        tokenizer.add_tokens([''])\n",
+     "\n",
+     "    if args.model_type == \"enc_dec\":\n",
+     "        model = None\n",
+     "    else:\n",
+     "        model = AutoModelForSeq2SeqLM.from_pretrained(\n",
+     "            args.model_name_path\n",
+     "        )\n",
+     "    data_collator= COLLATE_DICT[args.model_type](tokenizer= tokenizer, need_to_shift= True)\n",
+     "    test_data_set = DATASET_DICT[args.kp_task_type+\"_\"+args.dataset_class](tokenizer= tokenizer, file_path= args.data_dir+\"/test.txt\", max_src_len= args.max_src_len, max_tar_len = args.max_pre_len)\n",
+     "\n",
+     "    data_loader= torch.utils.data.DataLoader(test_data_set, batch_size= 16, collate_fn= data_collator)\n",
+     "\n",
+     "    model.to(device)\n",
+     "    model.eval()\n",
+     "    out_writer= open(args.data_dir+\"prediction.txt\", 'w')\n",
+     "\n",
+     "    with torch.no_grad():\n",
+     "        for ex in data_loader:\n",
+     "            generated= model.generate(\n",
+     "                input_ids= ex['input_ids'].to(device),\n",
+     "                attention_mask= ex['attention_mask'].to(device),\n",
+     "                num_beams= args.beam_size,\n",
+     "                max_length= args.max_pre_len\n",
+     "            )\n",
+     "\n",
+     "            pre= [tokenizer.decode(op, skip_special_tokens= True) for op in generated]\n",
+     "            for p in pre:\n",
+     "                out_writer.write(p+\"\\n\")\n",
+     "\n",
+     "    print(\"predictions written in dir {} as prediction.txt\".format(args.data_dir))\n",
+     "\n",
+     "    out_writer.close()\n",
+     "\n"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "BP6VdOBZIfRi"
+    },
+    "source": [
+     "args= EvalArgs(\n",
+     "    model_type= 't5',\n",
+     "    model_name_path= \"/content/tk_out/checkpoint-6\",\n",
+     "    tokenizer_path= \"/content/tk_out/kp_t5-base_tokenizer/\",\n",
+     "    data_dir= \"/content/\", #todo\n",
+     "    kp_task_type= \"one2many\",\n",
+     "    dataset_class= 'single',\n",
+     "    beam_size= 4,\n",
+     "    max_pre_len = 64\n",
+     ")\n",
+     "main_eval(args)"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "iU7vhi4nbnmx"
+    },
+    "source": [
+     ""
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "joA6Zo0SIfUf"
+    },
+    "source": [
+     ""
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "iVByVZnaaXdl"
+    },
+    "source": [
+     ""
+    ],
+    "execution_count": null,
+    "outputs": []
+   }
+  ]
+}
\ No newline at end of file

From 000b0375984252f09517a10b1ca13b7fce701b28 Mon Sep 17 00:00:00 2001
From: Amardeep 
Date: Sun, 23 Jan 2022 21:17:37 +0530
Subject: [PATCH 03/17] beautify

---
 dlkp/models/ke/crf/crf.py                     |  24 +-
 dlkp/models/ke/crf/crf_trainer.py             | 140 ++--
 dlkp/models/ke/crf/crf_utils.py               |  29 +-
 dlkp/models/ke/kpe.py                         | 631 ++++++++++++++++++
 dlkp/models/ke/transformer/crf_models.py      | 152 +++++
 .../token_classification_models.py            |  94 +++
 notebooks/tranKP.ipynb                        | 189 ++----
 7 files changed, 1060 insertions(+), 199 deletions(-)

diff --git a/dlkp/models/ke/crf/crf.py b/dlkp/models/ke/crf/crf.py
index 8d5bd30..2ab2181 100644
--- a/dlkp/models/ke/crf/crf.py
+++ b/dlkp/models/ke/crf/crf.py
@@ -5,7 +5,8 @@
 
 import torch
 
-VITERBI_DECODING = Tuple[List[int], float] 
+VITERBI_DECODING = Tuple[List[int], float]
+
 
 class ConditionalRandomField(torch.nn.Module):
     """
@@ -24,7 +25,6 @@ class 
ConditionalRandomField(torch.nn.Module): Whether to include the start and end transition parameters. """ - def __init__( self, num_tags: int, @@ -64,7 +64,9 @@ def reset_parameters(self): torch.nn.init.normal_(self.start_transitions) torch.nn.init.normal_(self.end_transitions) - def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: + def _input_likelihood( + self, logits: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: """ Computes the (batch_size,) denominator term for the log-likelihood, which is the sum of the likelihoods across all possible state sequences. @@ -161,7 +163,9 @@ def _joint_likelihood( # Add the last input if it's not masked. last_inputs = logits[-1] # (batch_size, num_tags) - last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1) + last_input_score = last_inputs.gather( + 1, last_tags.view(-1, 1) + ) # (batch_size, 1) last_input_score = last_input_score.squeeze() # (batch_size,) score = score + last_transition_score + last_input_score * mask[-1] @@ -181,7 +185,7 @@ def forward( # The code below fails in weird ways if this isn't a bool tensor, so we make sure. mask = mask.to(torch.bool) # print("forward",inputs.shape, tags.shape, mask.shape) - + log_denominator = self._input_likelihood(inputs, mask) # temp_tags= tags # tags[tags==-100]=2 @@ -235,9 +239,13 @@ def viterbi_tags( ].data + -10000.0 * ( 1 - self._constraint_mask[start_tag, :num_tags].detach() ) - transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[ + transitions[ :num_tags, end_tag - ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) + ] = self.end_transitions.detach() * self._constraint_mask[ + :num_tags, end_tag + ].data + -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) else: transitions[start_tag, :num_tags] = -10000.0 * ( 1 - self._constraint_mask[start_tag, :num_tags].detach() @@ -280,4 +288,4 @@ def viterbi_tags( if flatten_output: return [top_k_paths[0] for top_k_paths in best_paths] - return best_paths \ No newline at end of file + return best_paths diff --git a/dlkp/models/ke/crf/crf_trainer.py b/dlkp/models/ke/crf/crf_trainer.py index d9b22df..5db3178 100644 --- a/dlkp/models/ke/crf/crf_trainer.py +++ b/dlkp/models/ke/crf/crf_trainer.py @@ -1,17 +1,16 @@ from transformers import ( - Trainer, set_seed, - ) -from transformers.trainer import * +from transformers.trainer import * from transformers.trainer_utils import PredictionOutput from torch import nn from torch.utils.data.dataloader import DataLoader + # from torch.utils.data.dataset import Dataset # from typing import Any, Callable, Dict, List, Optional, Tuple, Union class CRF_Trainer(Trainer): - def prediction_loop( + def prediction_loop( self, dataloader: DataLoader, description: str, @@ -27,13 +26,17 @@ def prediction_loop( if not isinstance(dataloader.dataset, collections.abc.Sized): raise ValueError("dataset must implement __len__") prediction_loss_only = ( - prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only + prediction_loss_only + if prediction_loss_only is not None + else self.args.prediction_loss_only ) if self.args.deepspeed and not self.args.do_train: # no harm, but flagging to the user that deepspeed config is ignored for eval # flagging only for when --do_train wasn't passed as only then it's redundant - logger.info("Detected the deepspeed argument but it will not be used for evaluation") + logger.info( + "Detected the deepspeed 
argument but it will not be used for evaluation" + ) model = self._wrap_model(self.model, training=False) @@ -53,65 +56,98 @@ def prediction_loop( world_size = max(1, self.args.world_size) - eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) + eval_losses_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=batch_size + ) if not prediction_loss_only: # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass # a batch size to the sampler) make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): + if hasattr(dataloader, "sampler") and isinstance( + dataloader.sampler, SequentialDistributedSampler + ): make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + preds_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=make_multiple_of + ) + labels_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=make_multiple_of + ) if self.args.past_index >= 0: self._past = None model.eval() - - if is_torch_tpu_available(): - dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) - + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader( + dataloader, [self.args.device] + ).per_device_loader(self.args.device) self.callback_handler.eval_dataloader = dataloader - + for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - - best_path= self.eval_step(model, logits, inputs['attention_mask']) + loss, logits, labels = self.prediction_step( + model, inputs, prediction_loss_only, ignore_keys=ignore_keys + ) + + best_path = self.eval_step(model, logits, inputs["attention_mask"]) # best_path= self.eval_step(model, logits) # print(len(best_path), best_path[0]) # logits= torch.zeros() - best_path= [x for x,_ in best_path] + best_path = [x for x, _ in best_path] # print(best_path) # seq_len= labels.shape[1] - logits*=0 - for i,path in enumerate(best_path): - # print(inputs['attention_mask'][i,0], labels[i,0], inputs['attention_mask'][i,-1], labels[i,-1]) - # print(len(x)) - for j, tag in enumerate(path): - logits[i,j,int(tag)]=1 - # print(inputs['attention_mask'][i,j], labels[i,j]) - + logits *= 0 + for i, path in enumerate(best_path): + # print(inputs['attention_mask'][i,0], labels[i,0], inputs['attention_mask'][i,-1], labels[i,-1]) + # print(len(x)) + for j, tag in enumerate(path): + logits[i, j, int(tag)] = 1 + # print(inputs['attention_mask'][i,j], labels[i,j]) + # logits= torch.tensor(data=best_path, dtype= labels.dtype, device= labels.device) # if(logits.shape!=labels.shape): # print(logits.shape,labels.shape) # assert logits.shape==labels.shape if loss is not None: losses = loss.repeat(batch_size) - losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) + losses_host = ( + losses + if losses_host is None + else torch.cat((losses_host, losses), dim=0) + ) if logits is not None: - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + preds_host = ( + logits + if preds_host is None + else nested_concat(preds_host, logits, 
padding_index=-100) + ) if labels is not None: - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) + labels_host = ( + labels + if labels_host is None + else nested_concat(labels_host, labels, padding_index=-100) + ) + self.control = self.callback_handler.on_prediction_step( + self.args, self.state, self.control + ) # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + if ( + self.args.eval_accumulation_steps is not None + and (step + 1) % self.args.eval_accumulation_steps == 0 + ): + eval_losses_gatherer.add_arrays( + self._gather_and_numpify(losses_host, "eval_losses") + ) if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + preds_gatherer.add_arrays( + self._gather_and_numpify(preds_host, "eval_preds") + ) + labels_gatherer.add_arrays( + self._gather_and_numpify(labels_host, "eval_label_ids") + ) # Set back to None to begin a new accumulation losses_host, preds_host, labels_host = None, None, None @@ -121,17 +157,29 @@ def prediction_loop( delattr(self, "_past") # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + eval_losses_gatherer.add_arrays( + self._gather_and_numpify(losses_host, "eval_losses") + ) if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + preds_gatherer.add_arrays( + self._gather_and_numpify(preds_host, "eval_preds") + ) + labels_gatherer.add_arrays( + self._gather_and_numpify(labels_host, "eval_label_ids") + ) eval_loss = eval_losses_gatherer.finalize() preds = preds_gatherer.finalize() if not prediction_loss_only else None label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - if self.compute_metrics is not None and preds is not None and label_ids is not None: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + if ( + self.compute_metrics is not None + and preds is not None + and label_ids is not None + ): + metrics = self.compute_metrics( + EvalPrediction(predictions=preds, label_ids=label_ids) + ) else: metrics = {} @@ -147,20 +195,14 @@ def prediction_loop( metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) - def eval_step(self, - model: nn.Module, - logits, - mask= None, - top_k= None - ): + def eval_step(self, model: nn.Module, logits, mask=None, top_k=None): with torch.no_grad(): - output= model.crf.viterbi_tags(logits, mask, top_k) + output = model.crf.viterbi_tags(logits, mask, top_k) return output - - def compute_loss(self, model, inputs, return_outputs=False): + def compute_loss(self, model, inputs, return_outputs=False): """ How the loss is computed by Trainer. By default, all models return the loss in the first element. 
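
(A note on the decode trick in prediction_loop above, illustrated under hypothetical
shapes, not part of the patch itself: the Viterbi paths are written back into the
`logits` tensor as one-hot rows, so the standard argmax-based metrics path recovers
the CRF decoding unchanged.)

    # logits: (batch, seq_len, num_tags); best_path: one list of tag ids per example
    logits *= 0
    for i, path in enumerate(best_path):
        for j, tag in enumerate(path):
            logits[i, j, int(tag)] = 1
    pred_tags = logits.argmax(-1)  # equals the Viterbi path at each decoded position
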
diff --git a/dlkp/models/ke/crf/crf_utils.py b/dlkp/models/ke/crf/crf_utils.py
index 295aeef..1d1c240 100644
--- a/dlkp/models/ke/crf/crf_utils.py
+++ b/dlkp/models/ke/crf/crf_utils.py
@@ -11,7 +11,9 @@
 VITERBI_DECODING = Tuple[List[int], float]  # a list of tags, and a viterbi score
 
 
-def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]:
+def allowed_transitions(
+    constraint_type: str, labels: Dict[int, str]
+) -> List[Tuple[int, int]]:
     """
     Given labels and a constraint type, returns the allowed transitions. It will
     additionally include transitions for the start and end states, which are used
@@ -30,7 +32,10 @@ def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tu
     num_labels = len(labels)
     start_tag = num_labels
     end_tag = num_labels + 1
-    labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")]
+    labels_with_boundaries = list(labels.items()) + [
+        (start_tag, "START"),
+        (end_tag, "END"),
+    ]
 
     allowed = []
     for from_label_index, from_label in labels_with_boundaries:
@@ -47,7 +52,9 @@ def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tu
             else:
                 to_tag = to_label[0]
                 to_entity = to_label[1:]
-            if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity):
+            if is_transition_allowed(
+                constraint_type, from_tag, from_entity, to_tag, to_entity
+            ):
                 allowed.append((from_label_index, to_label_index))
     return allowed
 
@@ -97,7 +104,9 @@ def is_transition_allowed(
                 from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"),
                 # B-x can only transition to I-x or L-x
                 # I-x can only transition to I-x or L-x
-                from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity,
+                from_tag in ("B", "I")
+                and to_tag in ("I", "L")
+                and from_entity == to_entity,
             ]
         )
     elif constraint_type == "BIO":
@@ -148,7 +157,9 @@ def is_transition_allowed(
         print("error in constraint type")
 
 
-def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> torch.Tensor:
+def logsumexp(
+    tensor: torch.Tensor, dim: int = -1, keepdim: bool = False
+) -> torch.Tensor:
     """
     A numerically stable computation of logsumexp. This is mathematically equivalent to
     `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log
@@ -169,8 +180,6 @@ def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> tor
     return max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log()
 
 
-
-
 def viterbi_decode(
     tag_sequence: torch.Tensor,
     transition_matrix: torch.Tensor,
@@ -223,7 +232,9 @@ def viterbi_decode(
     elif top_k >= 1:
         flatten_output = False
     else:
-        raise ValueError(f"top_k must be either None or an integer >=1. Instead received {top_k}")
+        raise ValueError(
+            f"top_k must be either None or an integer >=1. Instead received {top_k}"
+        )
 
     sequence_length, num_tags = list(tag_sequence.size())
 
@@ -343,4 +354,4 @@ def viterbi_decode(
 
     if flatten_output:
         return viterbi_paths[0], viterbi_scores[0]
-    return viterbi_paths, viterbi_scores
\ No newline at end of file
+    return viterbi_paths, viterbi_scores
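
(A quick sanity-check sketch for the helper above, using a hypothetical BIO tag map;
pairs like these feed the CRF's transition-constraint mask:)

    idx2tag = {0: "O", 1: "B-KP", 2: "I-KP"}
    pairs = allowed_transitions("BIO", idx2tag)
    # pairs holds the permitted (from_tag, to_tag) index pairs, including the
    # START (num_labels) and END (num_labels + 1) pseudo-tags
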
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for token classification. +""" +# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as +# comments. + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric + +import transformers +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + DataCollatorForTokenClassification, + HfArgumentParser, + PreTrainedTokenizerFast, + Trainer, + TrainingArguments, + set_seed, + BertForTokenClassification, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process + + +logger = logging.getLogger(__name__) +# from models.long_doc_kp_models import LONG_DOC_KP_MODELS + +# KPE_MODELS_DICT={ +# 'others': AutoModelForTokenClassification, +# "longformer": +# 'reformer' : +# 'crf_longformer' : +# 'crf_bert': BERT_CRFforTokenClassification +# } + +CRF_MODEL_DICT = { + "bert": BERT_CRFforTokenClassification, + "longformer": Longformer_CRFforTokenClassification, +} +TOKEN_MODEL_DICT = { + "bert": BertForTokenClassification, + "longformer": LongformerForTokenClassification, + "reformer": ReformerForTokenClassification, +} + +MODEL_DICT = {"crf": CRF_MODEL_DICT, "simple": TOKEN_MODEL_DICT} + +# KPE_MODELS_DICT = KPE_MODELS_DICT | LONG_DOC_KP_MODELS + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_family_name: str = field( + metadata={ + "help": "name of the family of model, bert, longformer, reformer etc." + } + ) + model_name_or_path: str = field( + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models" + } + ) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Where do you want to store the pretrained models downloaded from huggingface.co" + }, + ) + model_revision: str = field( + default="main", + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)." + }, + ) + use_CRF: bool = field( + default=False, + metadata={"help": "wether to use CRF on top of the classifier"}, + ) + use_BiLSTM: bool = field( + default=False, + metadata={"help": "use BiLSTM in sequence classification"}, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + + task_name: Optional[str] = field( + default="simple", metadata={"help": "The name of the task simple, crf"} + ) + + train_file: Optional[str] = field( + default=None, + metadata={"help": "The input training data file (a csv or JSON file)."}, + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to predict on (a csv or JSON file)." + }, + ) + overwrite_cache: bool = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + label_all_tokens: bool = field( + default=False, + metadata={ + "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " + "one (in which case the other tokens will have a padding index)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, + metadata={ + "help": "Whether to return all the entity levels during evaluation or just the overall ones." + }, + ) + dataset_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, + ) + dataset_config_name: Optional[str] = field( + default=None, + metadata={ + "help": "The configuration name of the dataset to use (via the datasets library)." + }, + ) + cache_file_name: Optional[str] = field( + default=None, + metadata={ + "help": "Provide the name of a path for the cache file. It is used to store the results of the computation instead of the automatically generated cache file name." + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + ): + raise ValueError( + "Need either a dataset name or a training/validation file." + ) + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in [ + "csv", + "json", + ], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in [ + "csv", + "json", + ], "`validation_file` should be a csv or a json file." + self.task_name = self.task_name.lower() + + +# def main(): +TRAINER_DICT = { + "crf": CRF_Trainer, + "simple": Trainer, +} + + +def main_run_kpe(model_args, data_args, training_args): + + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + # parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + # if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # # If we pass only one argument to the script and it's the path to a json file, + # # let's parse it to get our arguments. 
+ # model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + # else: + # model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Detecting last checkpoint. + last_checkpoint = None + if ( + os.path.isdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel( + logging.INFO if is_main_process(training_args.local_rank) else logging.WARN + ) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + logger.info("Training/evaluation parameters %s", training_args) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + ## get dataset in here + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.train_file.split(".")[-1] + datasets = load_dataset( + extension, data_files=data_files + ) ##CR get dataset in here + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. 
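+    # Illustration (not part of the training flow): a record in the CSV/JSON
+    # files above is expected to provide pre-tokenized text plus word-level
+    # BIO tags, e.g. one JSON line like
+    #   {"document": ["A", "keyphrase", "extraction", "corpus"],
+    #    "BIO_tags": ["O", "B", "I", "O"]}
+    # The field names here are assumptions; the code below falls back to the
+    # first column for text and looks for a "BIO_tags" column for labels.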
+ + if training_args.do_train: + column_names = datasets["train"].column_names + features = datasets["train"].features + else: + column_names = datasets["validation"].column_names + features = datasets["validation"].features + text_column_name = "text" if "text" in column_names else column_names[0] + label_column_name = "BIO_tags" if "BIO_tags" in column_names else column_names[1] + + # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the + # unique labels. + def get_label_list(labels): + unique_labels = set() + for label in labels: + unique_labels = unique_labels | set(label) + label_list = list(unique_labels) + label_list.sort() + return label_list + + if isinstance(features[label_column_name].feature, ClassLabel): + label_list = features[label_column_name].feature.names + # No need to convert the labels since they are already ints. + label_to_id = {i: i for i in range(len(label_list))} + else: + label_list = get_label_list(datasets["train"][label_column_name]) + label_to_id = {l: i for i, l in enumerate(label_list)} + num_labels = len(label_list) + print(label_to_id) + id2tag = {} + for k in label_to_id.keys(): + id2tag[label_to_id[k]] = k + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config = AutoConfig.from_pretrained( + model_args.config_name + if model_args.config_name + else model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=model_args.cache_dir, + ) + config.use_CRF = model_args.use_CRF ##CR replace from arguments + config.use_BiLSTM = False + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name + if model_args.tokenizer_name + else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + add_prefix_space=True, + ) + model = MODEL_DICT[data_args.task_name][ + model_args.model_family_name + ].from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=model_args.cache_dir, + ) + model.freeze_encoder_layer() + print("model") + print(model) + if tokenizer.pad_token is None: + + tokenizer.pad_token = tokenizer.eos_token + config.pad_token_id = config.eos_token_id + + # Tokenizer check: this script requires a fast tokenizer. + # if not isinstance(tokenizer, PreTrainedTokenizerFast): + # raise ValueError( + # "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + # "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " + # "requirement" + # ) + + # Preprocessing the dataset + # Padding strategy + padding = "max_length" if data_args.pad_to_max_length else False + + # Tokenize all texts and align the labels with them. + def tokenize_and_align_labels(examples): + tokenized_inputs = tokenizer( + examples[text_column_name], + padding=padding, + truncation=True, + # We use this argument because the texts in our dataset are lists of words (with a label for each word). + is_split_into_words=True, + ) + labels = [] + for i, label in enumerate(examples[label_column_name]): + word_ids = tokenized_inputs.word_ids(batch_index=i) + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + # Special tokens have a word id that is None. We set the label to -100 so they are automatically + # ignored in the loss function. 
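+                # Illustrative example: tokenizing the words
+                # ["deep", "keyphrase"] with a WordPiece tokenizer might give
+                # word_ids of [None, 0, 1, 1, None] -- special tokens map to
+                # None and both sub-tokens of "keyphrase" map to word 1, so
+                # only its first sub-token keeps the true label below unless
+                # label_all_tokens is set.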
+ if word_idx is None: + # label_ids.append(-100) + label_ids.append( + 2 + ) # to avoid error change -100 to 'O' tag i.e. 2 class + # We set the label for the first token of each word. + elif word_idx != previous_word_idx: + label_ids.append(label_to_id[label[word_idx]]) + # For the other tokens in a word, we set the label to either the current label or -100, depending on + # the label_all_tokens flag. + else: + label_ids.append( + label_to_id[label[word_idx]] + if data_args.label_all_tokens + else -100 + ) + # to avoid error change -100 to 'O' tag i.e. 2 class + # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2) + previous_word_idx = word_idx + + labels.append(label_ids) + tokenized_inputs["labels"] = labels + return tokenized_inputs + + tokenized_datasets = datasets.map( + tokenize_and_align_labels, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + # cache_file_name= data_args.cache_file_name + ) + + # Data collator + data_collator = DataCollatorForTokenClassification( + tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None + ) + + from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score + + def compute_metrics(p): + predictions, labels = p + # print(predictions.shape, labels.shape) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + results = {} + results["overall_precision"] = precision_score(true_labels, true_predictions) + results["overall_recall"] = recall_score(true_labels, true_predictions) + results["overall_f1"] = f1_score(true_labels, true_predictions) + results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) + if data_args.return_entity_level_metrics: + # Unpack nested dictionaries + final_results = {} + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f"{key}_{n}"] = v + else: + final_results[key] = value + return final_results + else: + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + + # Initialize our Trainer + + trainer = TRAINER_DICT[data_args.task_name]( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"] if training_args.do_train else None, + eval_dataset=tokenized_datasets["validation"] + if training_args.do_eval + else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + # Training + if training_args.do_train: + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif os.path.isdir(model_args.model_name_or_path): + checkpoint = model_args.model_name_or_path + else: + checkpoint = None + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + output_train_file = os.path.join(training_args.output_dir, "train_results.txt") + if trainer.is_world_process_zero(): + with open(output_train_file, "w") as writer: + 
logger.info("***** Train results *****") + for key, value in sorted(train_result.metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Need to save the state, since Trainer.save_model saves only the tokenizer with the model + trainer.state.save_to_json( + os.path.join(training_args.output_dir, "trainer_state.json") + ) + + # Evaluation + results = {} + if training_args.do_eval: + + logger.info("*** Evaluate ***") + + results = trainer.evaluate() + + output_eval_file = os.path.join( + training_args.output_dir, "eval_results_KPE.txt" + ) + if trainer.is_world_process_zero(): + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key, value in results.items(): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + def get_kp_from_BIO(examples, prediction): + kps = [] + for i in range(len(prediction)): + ids = examples["input_ids"][i] + + tags = prediction[i] + # print(tags) + current_kps = [] + ckp = [] + for i, tag in enumerate(tags): + id = ids[i] + + if tag == "O" and len(ckp) > 0: + + current_kps.append(ckp) + ckp = [] + elif tag == "B": + # print(ckp, tag) + if tokenizer.convert_ids_to_tokens(id).startswith("##"): + ckp.append(id) + else: + if len(ckp) > 0: + current_kps.append(ckp) + ckp = [] + + ckp.append(id) + # print(ckp, id) + + elif tag == "I" and len(ckp) > 0: + ckp.append(id) + decoded_kps = [] + if len(ckp) > 0: + current_kps.append(ckp) + if len(current_kps) > 0: + decoded_kps = tokenizer.batch_decode( + current_kps, + skip_special_tokens=True, + clean_up_tokenization_spaces=True, + ) + # print(decoded_kps) + kps.append(decoded_kps) + + # examples['predicted_kp']= kps + return kps + + # Predict + if training_args.do_predict: + logger.info("*** Predict ***") + + test_dataset = tokenized_datasets["test"] + predictions, labels, metrics = trainer.predict(test_dataset) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + output_test_results_file = os.path.join( + training_args.output_dir, "test_results.txt" + ) + if trainer.is_world_process_zero(): + with open(output_test_results_file, "w") as writer: + for key, value in sorted(metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Save predictions + output_test_predictions_file = os.path.join( + training_args.output_dir, "test_predictions.txt" + ) + if trainer.is_world_process_zero(): + # # test_dataset['predicted_tags']= true_predictions + # # test_dataset=test_dataset.map(get_kp_from_BIO,batched=True, + # num_proc=data_args.preprocessing_num_workers, + # load_from_cache_file=not data_args.overwrite_cache, + # ) + gen_kps = get_kp_from_BIO(test_dataset, true_predictions) + with open(output_test_predictions_file, "w") as writer: + for prediction in gen_kps: + writer.write(" ".join(prediction) + "\n") + + return results diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index e69de29..da26337 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -0,0 +1,152 @@ +# all token classification model with crf head +from transformers import ( + AutoModelForPreTraining, + AutoModel, + BertModel, + BertPreTrainedModel, + LongformerModel, + PreTrainedModel, +) +from 
transformers.modeling_outputs import TokenClassifierOutput
+import collections
+import torch
+from torch import nn
+from crf import ConditionalRandomField
+from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel
+
+
+class BERT_CRFforTokenClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # self.crf= nn.Linear(config.num_labels,1)
+        # self.crf= ConditionalRandomField(self.num_labels)
+        # Index 2 is the letter "O" (outside tag), matching the BIO scheme.
+        self.crf = ConditionalRandomField(
+            self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "O"}
+        )
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=None,
+        output_attentions=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.bert(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+        loss = None
+        if labels is not None:
+            # The CRF returns a log-likelihood; negate it to get a loss.
+            loss = -self.crf(logits, labels, attention_mask)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        # print(self.crf.transitions)
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def freeze_till_clf(self):
+        for param in self.bert.parameters():
+            param.requires_grad = False
+        for param in self.dropout.parameters():
+            param.requires_grad = False
+        for param in self.classifier.parameters():
+            param.requires_grad = False
+
+    def freeze_encoder_layer(self):
+        for param in self.bert.parameters():
+            param.requires_grad = False
+
+
+class Longformer_CRFforTokenClassification(LongformerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.longformer = LongformerModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # self.crf= nn.Linear(config.num_labels,1)
+        # ConditionalRandomField requires the label scheme; mirror the BERT
+        # variant above rather than passing num_labels alone.
+        self.crf = ConditionalRandomField(
+            self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "O"}
+        )
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=None,
+        output_attentions=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.longformer(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+        loss = None
+        if labels is not None:
+            loss = -self.crf(logits, labels, attention_mask)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+ loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def freeze_till_clf(self): + for param in self.longformer.parameters(): + param.requires_grad = False + for param in self.dropout.parameters(): + param.requires_grad = False + for param in self.classifier.parameters(): + param.requires_grad = False + + def freeze_encoder_layer(self): + for param in self.longformer.parameters(): + param.requires_grad = False diff --git a/dlkp/models/ke/transformer/token_classification_models.py b/dlkp/models/ke/transformer/token_classification_models.py index e69de29..d3b628c 100644 --- a/dlkp/models/ke/transformer/token_classification_models.py +++ b/dlkp/models/ke/transformer/token_classification_models.py @@ -0,0 +1,94 @@ +# all models with token classification only +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric + +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + AutoModel, + DataCollatorForTokenClassification, + HfArgumentParser, + PreTrainedTokenizerFast, + Trainer, + TrainingArguments, + set_seed, + LongformerForTokenClassification, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process + + +logger = logging.getLogger(__name__) + +from transformers.models.reformer.modeling_reformer import * + + +class ReformerForTokenClassification(ReformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.reformer = ReformerModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + num_hashes=None, + labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.reformer( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + num_hashes=num_hashes, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + # if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + +s diff --git a/notebooks/tranKP.ipynb b/notebooks/tranKP.ipynb index 7835b24..a189ed5 100644 --- a/notebooks/tranKP.ipynb +++ b/notebooks/tranKP.ipynb @@ -1,20 +1,8 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "tranKP.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, "cells": 
[ { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -22,90 +10,20 @@ "id": "HXDqBqrdaoNw", "outputId": "4e45f4d4-324f-44a0-f289-62479ccd56ef" }, + "outputs": [], "source": [ "!pip install transformers\n", "!pip install sentencepiece\n", "!pip install datasets" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collecting transformers\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)\n", - "\u001b[K |████████████████████████████████| 1.5MB 4.2MB/s \n", - "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.19.4)\n", - "Collecting sacremoses\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n", - "\u001b[K |████████████████████████████████| 890kB 17.1MB/s \n", - "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.8)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", - "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8)\n", - "Collecting tokenizers==0.9.4\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)\n", - "\u001b[K |████████████████████████████████| 2.9MB 21.5MB/s \n", - "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.0.0)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.12.5)\n", - "Building wheels for collected packages: sacremoses\n", - " Building wheel for sacremoses (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=5eee5bbfe2f9124d4f5d0c0332e4124d253f5989149682918253b6700d942717\n", - " Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n", - "Successfully built sacremoses\n", - "Installing collected packages: sacremoses, tokenizers, transformers\n", - "Successfully installed sacremoses-0.0.43 tokenizers-0.9.4 transformers-4.1.1\n", - "Collecting sentencepiece\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)\n", - "\u001b[K |████████████████████████████████| 1.1MB 5.9MB/s \n", - "\u001b[?25hInstalling collected packages: sentencepiece\n", - "Successfully installed sentencepiece-0.1.94\n", - "Collecting datasets\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ee/78/5873ac1e27bf25a2cbf3447d6704edd3136b1b3ff0eb3bfab38a45d2a1ff/datasets-1.2.0-py3-none-any.whl (159kB)\n", - "\u001b[K |████████████████████████████████| 163kB 4.1MB/s \n", - "\u001b[?25hRequirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from datasets) (0.8)\n", - "Requirement already satisfied: tqdm<4.50.0,>=4.27 in /usr/local/lib/python3.6/dist-packages (from datasets) (4.41.1)\n", - "Collecting xxhash\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f7/73/826b19f3594756cb1c6c23d2fbd8ca6a77a9cd3b650c9dec5acc85004c38/xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242kB)\n", - "\u001b[K |████████████████████████████████| 245kB 6.1MB/s \n", - "\u001b[?25hCollecting pyarrow>=0.17.1\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d7/e1/27958a70848f8f7089bff8d6ebe42519daf01f976d28b481e1bfd52c8097/pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7MB)\n", - "\u001b[K |████████████████████████████████| 17.7MB 1.5MB/s \n", - "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from datasets) (0.3.3)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.6/dist-packages (from datasets) (1.19.4)\n", - "Requirement already satisfied: multiprocess in /usr/local/lib/python3.6/dist-packages (from datasets) (0.70.11.1)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.6/dist-packages (from datasets) (2.23.0)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from datasets) (1.1.5)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2020.12.5)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2018.9)\n", - "Requirement already satisfied: six>=1.5 in 
/usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", - "Installing collected packages: xxhash, pyarrow, datasets\n", - " Found existing installation: pyarrow 0.14.1\n", - " Uninstalling pyarrow-0.14.1:\n", - " Successfully uninstalled pyarrow-0.14.1\n", - "Successfully installed datasets-1.2.0 pyarrow-2.0.0 xxhash-2.0.0\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "r3IcbMLKYCz7" }, + "outputs": [], "source": [ "# utils\n", "import os, sys\n", @@ -166,15 +84,15 @@ " default= \"single\",\n", " metadata= {\"help\": \"single | multiple , type of dataset reader to use, split train data into mltiple train file or from single\" }\n", " )" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "TgUthX_SaXRY" }, + "outputs": [], "source": [ "#datset\n", "import os, sys\n", @@ -275,15 +193,15 @@ "\n", "\n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "zL3jsUs2aXUI" }, + "outputs": [], "source": [ "#collate\n", "import os, sys\n", @@ -340,15 +258,15 @@ " prev_output_tokens[:, 0] = decoder_start_tokens\n", " return prev_output_tokens\n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "W8r6v-NJaXWs" }, + "outputs": [], "source": [ "#main\n", "import os, sys\n", @@ -482,15 +400,15 @@ "\n", " \n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "SognzQU6aXY7" }, + "outputs": [], "source": [ "def runner():\n", " args= BasicKPArgs(\n", @@ -513,12 +431,11 @@ "\n", " )\n", " main_fn(args, training_args)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true, @@ -528,12 +445,9 @@ "id": "EMXA4Yi6aXbV", "outputId": "f091bbbd-d403-48a9-8089-ce746dd66cb3" }, - "source": [ - "runner()" - ], - "execution_count": null, "outputs": [ { + "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n", @@ -547,11 +461,9 @@ "Using custom data configuration default\n", "Reusing dataset json (/root/.cache/huggingface/datasets/json/default-ea1a5f71c165584a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)\n", "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-ea1a5f71c165584a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-cea35ead7b156669.arrow\n" - ], - "name": "stderr" + ] }, { - "output_type": "display_data", "data": { "text/html": [ "\n", @@ -587,12 +499,13 @@ }, "metadata": { "tags": [] - } + }, + "output_type": "display_data" }, { - "output_type": "error", "ename": "RuntimeError", "evalue": "ignored", + "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", @@ -612,13 +525,18 @@ "\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:274] . 
unexpected pos 1606179904 vs 1606179792" ] } + ], + "source": [ + "runner()" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "5CaaHimjIfOR" }, + "outputs": [], "source": [ "# 3 eval_kp.py\n", "import os, sys\n", @@ -737,15 +655,15 @@ "\n", "\n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "BP6VdOBZIfRi" }, + "outputs": [], "source": [ "args= EvalArgs(\n", " model_type= 't5',\n", @@ -760,42 +678,47 @@ "\n", " )\n", "main_eval(args)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "iU7vhi4nbnmx" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "joA6Zo0SIfUf" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "iVByVZnaaXdl" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "tranKP.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 41a723c79494274593f28d2e5c08951787c8f9a2 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Tue, 25 Jan 2022 00:22:25 +0530 Subject: [PATCH 04/17] add utils --- dlkp/models/ke/crf/__init__.py | 0 dlkp/models/ke/extraction_utils.py | 152 ++++++++++++++++++++++++++ dlkp/models/ke/kpe.py | 168 ++--------------------------- 3 files changed, 159 insertions(+), 161 deletions(-) create mode 100644 dlkp/models/ke/crf/__init__.py diff --git a/dlkp/models/ke/crf/__init__.py b/dlkp/models/ke/crf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py index e69de29..7f34719 100644 --- a/dlkp/models/ke/extraction_utils.py +++ b/dlkp/models/ke/extraction_utils.py @@ -0,0 +1,152 @@ +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_family_name: str = field( + metadata={ + "help": "name of the family of model, bert, longformer, reformer etc." + } + ) + model_name_or_path: str = field( + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models" + } + ) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Where do you want to store the pretrained models downloaded from huggingface.co" + }, + ) + model_revision: str = field( + default="main", + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)." 
+ }, + ) + use_CRF: bool = field( + default=False, + metadata={"help": "wether to use CRF on top of the classifier"}, + ) + use_BiLSTM: bool = field( + default=False, + metadata={"help": "use BiLSTM in sequence classification"}, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + task_name: Optional[str] = field( + default="simple", metadata={"help": "The name of the task simple, crf"} + ) + + train_file: Optional[str] = field( + default=None, + metadata={"help": "The input training data file (a csv or JSON file)."}, + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to predict on (a csv or JSON file)." + }, + ) + overwrite_cache: bool = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + label_all_tokens: bool = field( + default=False, + metadata={ + "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " + "one (in which case the other tokens will have a padding index)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, + metadata={ + "help": "Whether to return all the entity levels during evaluation or just the overall ones." + }, + ) + dataset_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, + ) + dataset_config_name: Optional[str] = field( + default=None, + metadata={ + "help": "The configuration name of the dataset to use (via the datasets library)." + }, + ) + cache_file_name: Optional[str] = field( + default=None, + metadata={ + "help": "Provide the name of a path for the cache file. It is used to store the results of the computation instead of the automatically generated cache file name." + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + ): + raise ValueError( + "Need either a dataset name or a training/validation file." + ) + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in [ + "csv", + "json", + ], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in [ + "csv", + "json", + ], "`validation_file` should be a csv or a json file." 
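+        # Lowercasing keeps the MODEL_DICT / TRAINER_DICT lookups in kpe.py
+        # case-insensitive (e.g. "CRF" and "crf" pick the same trainer).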
+ self.task_name = self.task_name.lower() diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 29e4287..7991062 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -45,181 +45,27 @@ ) from transformers.trainer_utils import get_last_checkpoint, is_main_process +from transformer.crf_models import BERT_CRFforTokenClassification +from transformer.token_classification_models import LongformerForTokenClassification +from crf.crf_trainer import CRF_Trainer +from extraction_utils import ModelArguments, DataTrainingArguments logger = logging.getLogger(__name__) -# from models.long_doc_kp_models import LONG_DOC_KP_MODELS -# KPE_MODELS_DICT={ -# 'others': AutoModelForTokenClassification, -# "longformer": -# 'reformer' : -# 'crf_longformer' : -# 'crf_bert': BERT_CRFforTokenClassification -# } CRF_MODEL_DICT = { "bert": BERT_CRFforTokenClassification, - "longformer": Longformer_CRFforTokenClassification, + # "longformer": Longformer_CRFforTokenClassification, } TOKEN_MODEL_DICT = { "bert": BertForTokenClassification, - "longformer": LongformerForTokenClassification, - "reformer": ReformerForTokenClassification, + # "longformer": LongformerForTokenClassification, + # "reformer": ReformerForTokenClassification, } MODEL_DICT = {"crf": CRF_MODEL_DICT, "simple": TOKEN_MODEL_DICT} -# KPE_MODELS_DICT = KPE_MODELS_DICT | LONG_DOC_KP_MODELS - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_family_name: str = field( - metadata={ - "help": "name of the family of model, bert, longformer, reformer etc." - } - ) - model_name_or_path: str = field( - metadata={ - "help": "Path to pretrained model or model identifier from huggingface.co/models" - } - ) - config_name: Optional[str] = field( - default=None, - metadata={ - "help": "Pretrained config name or path if not the same as model_name" - }, - ) - tokenizer_name: Optional[str] = field( - default=None, - metadata={ - "help": "Pretrained tokenizer name or path if not the same as model_name" - }, - ) - cache_dir: Optional[str] = field( - default=None, - metadata={ - "help": "Where do you want to store the pretrained models downloaded from huggingface.co" - }, - ) - model_revision: str = field( - default="main", - metadata={ - "help": "The specific model version to use (can be a branch name, tag name or commit id)." - }, - ) - use_CRF: bool = field( - default=False, - metadata={"help": "wether to use CRF on top of the classifier"}, - ) - use_BiLSTM: bool = field( - default=False, - metadata={"help": "use BiLSTM in sequence classification"}, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - task_name: Optional[str] = field( - default="simple", metadata={"help": "The name of the task simple, crf"} - ) - - train_file: Optional[str] = field( - default=None, - metadata={"help": "The input training data file (a csv or JSON file)."}, - ) - validation_file: Optional[str] = field( - default=None, - metadata={ - "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." - }, - ) - test_file: Optional[str] = field( - default=None, - metadata={ - "help": "An optional input test data file to predict on (a csv or JSON file)." 
- }, - ) - overwrite_cache: bool = field( - default=False, - metadata={"help": "Overwrite the cached training and evaluation sets"}, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to model maximum sentence length. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " - "efficient on GPU but very bad for TPU." - }, - ) - label_all_tokens: bool = field( - default=False, - metadata={ - "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " - "one (in which case the other tokens will have a padding index)." - }, - ) - return_entity_level_metrics: bool = field( - default=False, - metadata={ - "help": "Whether to return all the entity levels during evaluation or just the overall ones." - }, - ) - dataset_name: Optional[str] = field( - default=None, - metadata={"help": "The name of the dataset to use (via the datasets library)."}, - ) - dataset_config_name: Optional[str] = field( - default=None, - metadata={ - "help": "The configuration name of the dataset to use (via the datasets library)." - }, - ) - cache_file_name: Optional[str] = field( - default=None, - metadata={ - "help": "Provide the name of a path for the cache file. It is used to store the results of the computation instead of the automatically generated cache file name." - }, - ) - - def __post_init__(self): - if ( - self.dataset_name is None - and self.train_file is None - and self.validation_file is None - ): - raise ValueError( - "Need either a dataset name or a training/validation file." - ) - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in [ - "csv", - "json", - ], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in [ - "csv", - "json", - ], "`validation_file` should be a csv or a json file." - self.task_name = self.task_name.lower() - - -# def main(): TRAINER_DICT = { "crf": CRF_Trainer, "simple": Trainer, From 3bf5026462728e60544791973f894b177d47414c Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Wed, 26 Jan 2022 21:51:26 +0530 Subject: [PATCH 05/17] f formatting and re arch --- dlkp/kp_metrics/__init__.py | 0 dlkp/models/ke/kpe.py | 250 +- examples/dataset/hf_data_script.py | 80 +- examples/dataset/hf_data_script_long_docs.py | 116 +- examples/ke/ke_sequence_tagging.py | 156 +- ldkp_amardeep.py | 2919 ++++++++++++++++++ 6 files changed, 3300 insertions(+), 221 deletions(-) create mode 100644 dlkp/kp_metrics/__init__.py create mode 100644 ldkp_amardeep.py diff --git a/dlkp/kp_metrics/__init__.py b/dlkp/kp_metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 7991062..556048b 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -75,8 +75,7 @@ def main_run_kpe(model_args, data_args, training_args): # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. 
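+    # For reference, a typical way (an assumption, mirroring the commented-out
+    # parser below) to build the three argument objects this function expects:
+    #   parser = HfArgumentParser(
+    #       (ModelArguments, DataTrainingArguments, TrainingArguments)
+    #   )
+    #   model_args, data_args, training_args = parser.parse_args_into_dataclasses()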
+ # parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) # if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): @@ -112,8 +111,9 @@ def main_run_kpe(model_args, data_args, training_args): handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel( - logging.INFO if is_main_process(training_args.local_rank) else logging.WARN + logging.INFO if is_main_process(training_args.local_rank) else logging.INFO ) + # logger.set_global_logging_level(logging.INFO) # Log on each process the small summary: logger.warning( @@ -147,11 +147,13 @@ def main_run_kpe(model_args, data_args, training_args): data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file - extension = data_args.train_file.split(".")[-1] + extension = data_args.test_file.split(".")[-1] datasets = load_dataset( extension, data_files=data_files ) ##CR get dataset in here @@ -182,7 +184,11 @@ def get_label_list(labels): # No need to convert the labels since they are already ints. label_to_id = {i: i for i in range(len(label_list))} else: - label_list = get_label_list(datasets["train"][label_column_name]) + label_list = get_label_list( + datasets["train"][label_column_name] + if training_args.do_train + else datasets["validation"][label_column_name] + ) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) print(label_to_id) @@ -218,9 +224,9 @@ def get_label_list(labels): config=config, cache_dir=model_args.cache_dir, ) - model.freeze_encoder_layer() + # model.freeze_encoder_layer() print("model") - print(model) + # print(model) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -276,7 +282,12 @@ def tokenize_and_align_labels(examples): previous_word_idx = word_idx labels.append(label_ids) + if data_args.task_name == "guided": + tokenized_inputs["guide_embed"] = examples["guide_embed"] tokenized_inputs["labels"] = labels + # tokenized_inputs['paper_id']= examples['paper_id'] + # tokenized_inputs['extractive_keyphrases']= examples['extractive_keyphrases'] + return tokenized_inputs tokenized_datasets = datasets.map( @@ -293,6 +304,7 @@ def tokenize_and_align_labels(examples): ) from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score + from seqeval.scheme import IOB2, IOB1 def compute_metrics(p): predictions, labels = p @@ -312,13 +324,22 @@ def compute_metrics(p): # results = metric.compute(predictions=true_predictions, references=true_labels) results = {} - results["overall_precision"] = precision_score(true_labels, true_predictions) - results["overall_recall"] = recall_score(true_labels, true_predictions) - results["overall_f1"] = f1_score(true_labels, true_predictions) + # print("cal precisi") + results["overall_precision"] = precision_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_recall"] = recall_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + # print("cal f1") + results["overall_f1"] = f1_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) if data_args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} + # print("cal 
entity level mat") for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): @@ -335,6 +356,40 @@ def compute_metrics(p): } # Initialize our Trainer + # metric = load_metric("seqeval") + + # def compute_metrics(p): + # predictions, labels = p + # predictions = np.argmax(predictions, axis=2) + + # # Remove ignored index (special tokens) + # true_predictions = [ + # [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + # for prediction, label in zip(predictions, labels) + # ] + # true_labels = [ + # [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + # for prediction, label in zip(predictions, labels) + # ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + # if data_args.return_entity_level_metrics: + # # Unpack nested dictionaries + # final_results = {} + # for key, value in results.items(): + # if isinstance(value, dict): + # for n, v in value.items(): + # final_results[f"{key}_{n}"] = v + # else: + # final_results[key] = value + # return final_results + # else: + # return { + # "precision": results["overall_precision"], + # "recall": results["overall_recall"], + # "f1": results["overall_f1"], + # "accuracy": results["overall_accuracy"], + # } trainer = TRAINER_DICT[data_args.task_name]( model=model, @@ -374,66 +429,19 @@ def compute_metrics(p): # Evaluation results = {} - if training_args.do_eval: + # if training_args.do_eval: - logger.info("*** Evaluate ***") + # logger.info("*** Evaluate ***") - results = trainer.evaluate() - - output_eval_file = os.path.join( - training_args.output_dir, "eval_results_KPE.txt" - ) - if trainer.is_world_process_zero(): - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key, value in results.items(): - logger.info(f" {key} = {value}") - writer.write(f"{key} = {value}\n") + # results = trainer.evaluate() - def get_kp_from_BIO(examples, prediction): - kps = [] - for i in range(len(prediction)): - ids = examples["input_ids"][i] - - tags = prediction[i] - # print(tags) - current_kps = [] - ckp = [] - for i, tag in enumerate(tags): - id = ids[i] - - if tag == "O" and len(ckp) > 0: - - current_kps.append(ckp) - ckp = [] - elif tag == "B": - # print(ckp, tag) - if tokenizer.convert_ids_to_tokens(id).startswith("##"): - ckp.append(id) - else: - if len(ckp) > 0: - current_kps.append(ckp) - ckp = [] - - ckp.append(id) - # print(ckp, id) - - elif tag == "I" and len(ckp) > 0: - ckp.append(id) - decoded_kps = [] - if len(ckp) > 0: - current_kps.append(ckp) - if len(current_kps) > 0: - decoded_kps = tokenizer.batch_decode( - current_kps, - skip_special_tokens=True, - clean_up_tokenization_spaces=True, - ) - # print(decoded_kps) - kps.append(decoded_kps) - - # examples['predicted_kp']= kps - return kps + # output_eval_file = os.path.join(training_args.output_dir, "eval_results_KPE.txt") + # if trainer.is_world_process_zero(): + # with open(output_eval_file, "w") as writer: + # logger.info("***** Eval results *****") + # for key, value in results.items(): + # logger.info(f" {key} = {value}") + # writer.write(f"{key} = {value}\n") # Predict if training_args.do_predict: @@ -449,6 +457,10 @@ def get_kp_from_BIO(examples, prediction): [label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels) ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] output_test_results_file = os.path.join( 
training_args.output_dir, "test_results.txt" @@ -460,18 +472,102 @@ def get_kp_from_BIO(examples, prediction): writer.write(f"{key} = {value}\n") # Save predictions + def get_kp_from_BIO(examples, i): + # kps= [] + # for i in range(len(prediction)): + ids = examples["input_ids"] + # print(examples.keys()) + + # print(tags) + def mmkp(tag_): + current_kps = [] + ckp = [] + prev_tag = None + for j, tag in enumerate(tag_): + id = ids[j] + + if tag == "O" and len(ckp) > 0: + + current_kps.append(ckp) + ckp = [] + elif tag == "B": + # print(ckp, tag) + if ( + tokenizer.convert_ids_to_tokens(id).startswith("##") + or prev_tag == "B" + ): + ckp.append(id) + else: + if len(ckp) > 0: + current_kps.append(ckp) + ckp = [] + + ckp.append(id) + # print(ckp, id) + + elif tag == "I" and len(ckp) > 0: + ckp.append(id) + prev_tag = tag + decoded_kps = [] + if len(ckp) > 0: + current_kps.append(ckp) + if len(current_kps) > 0: + decoded_kps = tokenizer.batch_decode( + current_kps, + skip_special_tokens=True, + clean_up_tokenization_spaces=True, + ) + # print(decoded_kps) + return decoded_kps + + tags = true_predictions[i] + decoded_kps = mmkp(tags) + + ttgs = true_labels[i] + eekp = mmkp(ttgs) + + # examples['kp_predicted']= decoded_kps + examples["kp_predicted"] = list(dict.fromkeys(decoded_kps)) + examples["eekp"] = list(dict.fromkeys(eekp)) + # examples['eekp']= eekp + # else: + # examples['kp_predicted']= [''] + examples["id"] = i + return examples + + import pandas as pd + output_test_predictions_file = os.path.join( - training_args.output_dir, "test_predictions.txt" + training_args.output_dir, "test_predictions.csv" + ) + output_test_predictions_BIO_file = os.path.join( + training_args.output_dir, "test_predictions_BIO.txt" ) if trainer.is_world_process_zero(): - # # test_dataset['predicted_tags']= true_predictions - # # test_dataset=test_dataset.map(get_kp_from_BIO,batched=True, - # num_proc=data_args.preprocessing_num_workers, - # load_from_cache_file=not data_args.overwrite_cache, - # ) - gen_kps = get_kp_from_BIO(test_dataset, true_predictions) - with open(output_test_predictions_file, "w") as writer: - for prediction in gen_kps: - writer.write(" ".join(prediction) + "\n") + print(test_dataset, len(test_dataset["paper_id"])) + ppid = test_dataset["paper_id"] + # ekp= test_dataset['extractive_keyphrases'] + + test_dataset = test_dataset.map( + get_kp_from_BIO, + num_proc=data_args.preprocessing_num_workers, + with_indices=True, + ) + # input_columns= ['paper_id','input_ids','extractive_keyphrases'] + print(test_dataset, " agian") + df = pd.DataFrame.from_dict( + { + "id": ppid, + "extractive_keyphrase": test_dataset["eekp"], + "keyphrases": test_dataset["kp_predicted"], + } + ) + df.to_csv(output_test_predictions_file, index=False) + + # get BIO tag files + + with open(output_test_predictions_BIO_file, "w") as writer: + for prediction in true_predictions: + writer.write(" ".join(prediction) + "\n") return results diff --git a/examples/dataset/hf_data_script.py b/examples/dataset/hf_data_script.py index 4355a78..a9404c2 100644 --- a/examples/dataset/hf_data_script.py +++ b/examples/dataset/hf_data_script.py @@ -27,11 +27,7 @@ # TODO: Add link to the official dataset URLs here -_URLS = { - "test": "test.jsonl", - "train": "train.jsonl", - "valid": "valid.jsonl" -} +_URLS = {"test": "test.jsonl", "train": "train.jsonl", "valid": "valid.jsonl"} # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case @@ -41,23 +37,36 @@ class KPTimes(datasets.GeneratorBasedBuilder): 
VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="extraction", version=VERSION, - description="This part of my dataset covers extraction"), - datasets.BuilderConfig(name="generation", version=VERSION, - description="This part of my dataset covers generation"), - datasets.BuilderConfig(name="raw", version=VERSION, description="This part of my dataset covers the raw dataset"), + datasets.BuilderConfig( + name="extraction", + version=VERSION, + description="This part of my dataset covers extraction", + ), + datasets.BuilderConfig( + name="generation", + version=VERSION, + description="This part of my dataset covers generation", + ), + datasets.BuilderConfig( + name="raw", + version=VERSION, + description="This part of my dataset covers the raw dataset", + ), ] DEFAULT_CONFIG_NAME = "extraction" def _info(self): - if self.config.name == "extraction": # This is the name of the configuration selected in BUILDER_CONFIGS above + if ( + self.config.name == "extraction" + ): # This is the name of the configuration selected in BUILDER_CONFIGS above features = datasets.Features( { "id": datasets.Value("string"), "document": datasets.features.Sequence(datasets.Value("string")), - "doc_bio_tags": datasets.features.Sequence(datasets.Value("string")) - + "doc_bio_tags": datasets.features.Sequence( + datasets.Value("string") + ), } ) elif self.config.name == "generation": @@ -65,9 +74,12 @@ def _info(self): { "id": datasets.Value("string"), "document": datasets.features.Sequence(datasets.Value("string")), - "extractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), - "abstractive_keyphrases": datasets.features.Sequence(datasets.Value("string")) - + "extractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), + "abstractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), } ) else: @@ -75,9 +87,15 @@ def _info(self): { "id": datasets.Value("string"), "document": datasets.features.Sequence(datasets.Value("string")), - "doc_bio_tags": datasets.features.Sequence(datasets.Value("string")), - "extractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), - "abstractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), + "doc_bio_tags": datasets.features.Sequence( + datasets.Value("string") + ), + "extractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), + "abstractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), "other_metadata": datasets.features.Sequence( { "id": datasets.Value("string"), @@ -87,8 +105,7 @@ def _info(self): "abstract": datasets.Value("string"), "keyword": datasets.Value("string"), } - ) - + ), } ) return datasets.DatasetInfo( @@ -111,23 +128,20 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": data_dir['train'], + "filepath": data_dir["train"], "split": "train", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": data_dir['test'], - "split": "test" - }, + gen_kwargs={"filepath": data_dir["test"], "split": "test"}, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": data_dir['valid'], + "filepath": data_dir["valid"], "split": "valid", }, ), @@ -141,23 +155,23 @@ def _generate_examples(self, filepath, split): if self.config.name == 
"extraction": # Yields examples as (key, example) tuples yield key, { - "id": data.get('paper_id'), + "id": data.get("paper_id"), "document": data["document"], - "doc_bio_tags": data.get("doc_bio_tags") + "doc_bio_tags": data.get("doc_bio_tags"), } elif self.config.name == "generation": yield key, { - "id": data.get('paper_id'), + "id": data.get("paper_id"), "document": data["document"], "extractive_keyphrases": data.get("extractive_keyphrases"), - "abstractive_keyphrases": data.get("abstractive_keyphrases") + "abstractive_keyphrases": data.get("abstractive_keyphrases"), } else: yield key, { - "id": data.get('paper_id'), + "id": data.get("paper_id"), "document": data["document"], "doc_bio_tags": data.get("doc_bio_tags"), "extractive_keyphrases": data.get("extractive_keyphrases"), "abstractive_keyphrases": data.get("abstractive_keyphrases"), - "other_metadata": data["other_metadata"] + "other_metadata": data["other_metadata"], } diff --git a/examples/dataset/hf_data_script_long_docs.py b/examples/dataset/hf_data_script_long_docs.py index cee1e22..d9ff5cc 100644 --- a/examples/dataset/hf_data_script_long_docs.py +++ b/examples/dataset/hf_data_script_long_docs.py @@ -22,11 +22,7 @@ # TODO: Add link to the official dataset URLs here -_URLS = { - "test": "test.jsonl", - "train": "train.jsonl", - "valid": "valid.jsonl" -} +_URLS = {"test": "test.jsonl", "train": "train.jsonl", "valid": "valid.jsonl"} # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case @@ -36,28 +32,46 @@ class TestLDKP(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="extraction", version=VERSION, - description="This part of my dataset covers extraction"), - datasets.BuilderConfig(name="generation", version=VERSION, - description="This part of my dataset covers generation"), - datasets.BuilderConfig(name="raw", version=VERSION, description="This part of my dataset covers the raw dataset"), - datasets.BuilderConfig(name="ldkp_generation", version=VERSION, - description="This part of my dataset covers abstract only"), - datasets.BuilderConfig(name="ldkp_extraction", version=VERSION, - description="This part of my dataset covers abstract only"), - + datasets.BuilderConfig( + name="extraction", + version=VERSION, + description="This part of my dataset covers extraction", + ), + datasets.BuilderConfig( + name="generation", + version=VERSION, + description="This part of my dataset covers generation", + ), + datasets.BuilderConfig( + name="raw", + version=VERSION, + description="This part of my dataset covers the raw dataset", + ), + datasets.BuilderConfig( + name="ldkp_generation", + version=VERSION, + description="This part of my dataset covers abstract only", + ), + datasets.BuilderConfig( + name="ldkp_extraction", + version=VERSION, + description="This part of my dataset covers abstract only", + ), ] DEFAULT_CONFIG_NAME = "extraction" def _info(self): - if self.config.name == "extraction" or self.config.name == "ldkp_extraction": # This is the name of the configuration selected in BUILDER_CONFIGS above + if ( + self.config.name == "extraction" or self.config.name == "ldkp_extraction" + ): # This is the name of the configuration selected in BUILDER_CONFIGS above features = datasets.Features( { "id": datasets.Value("int64"), "document": datasets.features.Sequence(datasets.Value("string")), - "doc_bio_tags": datasets.features.Sequence(datasets.Value("string")) - + "doc_bio_tags": datasets.features.Sequence( + 
datasets.Value("string") + ), } ) elif self.config.name == "generation" or self.config.name == "ldkp_generation": @@ -65,9 +79,12 @@ def _info(self): { "id": datasets.Value("int64"), "document": datasets.features.Sequence(datasets.Value("string")), - "extractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), - "abstractive_keyphrases": datasets.features.Sequence(datasets.Value("string")) - + "extractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), + "abstractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), } ) else: @@ -75,16 +92,25 @@ def _info(self): { "id": datasets.Value("int64"), "document": datasets.features.Sequence(datasets.Value("string")), - "doc_bio_tags": datasets.features.Sequence(datasets.Value("string")), - "extractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), - "abstractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), + "doc_bio_tags": datasets.features.Sequence( + datasets.Value("string") + ), + "extractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), + "abstractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), "other_metadata": datasets.features.Sequence( { - "text": datasets.features.Sequence(datasets.Value("string")), - "bio_tags": datasets.features.Sequence(datasets.Value("string")) + "text": datasets.features.Sequence( + datasets.Value("string") + ), + "bio_tags": datasets.features.Sequence( + datasets.Value("string") + ), } - ) - + ), } ) return datasets.DatasetInfo( @@ -107,23 +133,20 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": data_dir['train'], + "filepath": data_dir["train"], "split": "train", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": data_dir['test'], - "split": "test" - }, + gen_kwargs={"filepath": data_dir["test"], "split": "test"}, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": data_dir['valid'], + "filepath": data_dir["valid"], "split": "valid", }, ), @@ -137,36 +160,37 @@ def _generate_examples(self, filepath, split): if self.config.name == "extraction": # Yields examples as (key, example) tuples yield key, { - "id": data['paper_id'], + "id": data["paper_id"], "document": data["document"], - "doc_bio_tags": data["doc_bio_tags"] + "doc_bio_tags": data["doc_bio_tags"], } elif self.config.name == "ldkp_extraction": yield key, { - "id": data['paper_id'], - "document": data["document"] + data["other_metadata"]['text'], - "doc_bio_tags": data["document_tags"] + data["other_metadata"]['bio_tags'] + "id": data["paper_id"], + "document": data["document"] + data["other_metadata"]["text"], + "doc_bio_tags": data["document_tags"] + + data["other_metadata"]["bio_tags"], } elif self.config.name == "ldkp_generation": yield key, { - "id": data['paper_id'], - "document": data["document"] + data["other_metadata"]['text'], + "id": data["paper_id"], + "document": data["document"] + data["other_metadata"]["text"], "extractive_keyphrases": data["extractive_keyphrases"], - "abstractive_keyphrases": data["abstractive_keyphrases"] + "abstractive_keyphrases": data["abstractive_keyphrases"], } elif self.config.name == "generation": yield key, { - "id": data['paper_id'], + "id": data["paper_id"], "document": 
data["document"], "extractive_keyphrases": data["extractive_keyphrases"], - "abstractive_keyphrases": data["abstractive_keyphrases"] + "abstractive_keyphrases": data["abstractive_keyphrases"], } else: yield key, { - "id": data['paper_id'], + "id": data["paper_id"], "document": data["document"], "doc_bio_tags": data["doc_bio_tags"], "extractive_keyphrases": data["extractive_keyphrases"], "abstractive_keyphrases": data["abstractive_keyphrases"], - "other_metadata": data["other_metadata"] + "other_metadata": data["other_metadata"], } diff --git a/examples/ke/ke_sequence_tagging.py b/examples/ke/ke_sequence_tagging.py index 8cdbdc0..01f68ac 100644 --- a/examples/ke/ke_sequence_tagging.py +++ b/examples/ke/ke_sequence_tagging.py @@ -16,7 +16,7 @@ from transformers import get_linear_schedule_with_warmup # We'll need the BertTokenizer for doing sequence tagging with Bert -tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") def get_device(): @@ -26,22 +26,22 @@ def get_device(): # Tell PyTorch to use the GPU. device = torch.device("cuda") - print('There are %d GPU(s) available.' % torch.cuda.device_count()) + print("There are %d GPU(s) available." % torch.cuda.device_count()) - print('We will use the GPU:', torch.cuda.get_device_name(0)) + print("We will use the GPU:", torch.cuda.get_device_name(0)) # If not... else: - print('No GPU available, using the CPU instead.') + print("No GPU available, using the CPU instead.") device = torch.device("cpu") return device def format_time(elapsed): - ''' + """ Takes a time in seconds and returns a string hh:mm:ss - ''' + """ # Round to the nearest second. elapsed_rounded = int(round((elapsed))) @@ -118,7 +118,7 @@ def prepare_tokenized_input(sentences): for sent in sentences: # Reconstruct the sentence--otherwise `tokenizer` will interpret the list # of string tokens as having already been tokenized by BERT. - sent_str = ' '.join(sent) + sent_str = " ".join(sent) # `encode_plus` will: # (1) Tokenize the sentence. @@ -133,19 +133,19 @@ def prepare_tokenized_input(sentences): max_length=50, # Pad & truncate all sentences. pad_to_max_length=True, return_attention_mask=True, # Construct attn. masks. - return_tensors='pt', # Return pytorch tensors. + return_tensors="pt", # Return pytorch tensors. ) # Add the encoded sentence to the list. - input_ids.append(encoded_dict['input_ids'][0]) + input_ids.append(encoded_dict["input_ids"][0]) # And its attention mask (simply differentiates padding from non-padding). - attention_masks.append(encoded_dict['attention_mask'][0]) + attention_masks.append(encoded_dict["attention_mask"][0]) # Print sentence 0, now as a list of IDs. - print('Original: ', sentences[0]) - print('Token IDs:', input_ids[0]) - print('Masks:', attention_masks[0]) + print("Original: ", sentences[0]) + print("Token IDs:", input_ids[0]) + print("Masks:", attention_masks[0]) return input_ids, attention_masks @@ -187,15 +187,17 @@ def add_null_labels(input_ids, labels, label_map): token_id = token_id.numpy().item() # If `[PAD]`, `[CLS]`, or `[SEP]`... - if (token_id == tokenizer.pad_token_id) or \ - (token_id == tokenizer.cls_token_id) or \ - (token_id == tokenizer.sep_token_id): + if ( + (token_id == tokenizer.pad_token_id) + or (token_id == tokenizer.cls_token_id) + or (token_id == tokenizer.sep_token_id) + ): # Assign it the null label. padded_labels.append(null_label_id) # If the token string starts with "##"... 
- elif tokenizer.ids_to_tokens[token_id][0:2] == '##': + elif tokenizer.ids_to_tokens[token_id][0:2] == "##": # It's a subword token, and not part of the original dataset, so # assign it the null label. @@ -218,7 +220,7 @@ def add_null_labels(input_ids, labels, label_map): # If we did this right, then the new `padded_labels` list should match # the length of the tokenized sentence. - assert (len(sen) == len(padded_labels)) + assert len(sen) == len(padded_labels) # Store the updated labels list for this sentence. new_labels.append(padded_labels) @@ -271,8 +273,8 @@ def train_model(train_data, valid_data, device): # Perform one full pass over the training set. print("") - print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) - print('Training...') + print("======== Epoch {:} / {:} ========".format(epoch_i + 1, epochs)) + print("Training...") # Measure how long the training epoch takes. t0 = time.time() @@ -294,7 +296,11 @@ def train_model(train_data, valid_data, device): elapsed = format_time(time.time() - t0) # Report progress. - print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_data), elapsed)) + print( + " Batch {:>5,} of {:>5,}. Elapsed: {:}.".format( + step, len(train_data), elapsed + ) + ) # Unpack this training batch from our dataloader. # @@ -321,10 +327,12 @@ def train_model(train_data, valid_data, device): # https://huggingface.co/transformers/model_doc/bert.html#bertfortokenclassification # The results are returned in a results object, documented here: # https://huggingface.co/transformers/main_classes/output.html#transformers.modeling_outputs.TokenClassifierOutput - result = model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask, - labels=b_labels) + result = model( + b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask, + labels=b_labels, + ) loss = result.loss @@ -363,14 +371,14 @@ def train_model(train_data, valid_data, device): print("Training complete!") # Use plot styling from seaborn. - sns.set(style='darkgrid') + sns.set(style="darkgrid") # Increase the plot size and font size. sns.set(font_scale=1.5) - plt.rcParams["figure.figsize"] = (12,6) + plt.rcParams["figure.figsize"] = (12, 6) # Plot the learning curve. - plt.plot(loss_values, 'b-o') + plt.plot(loss_values, "b-o") # Label the plot. 
plt.title("Training loss") @@ -393,9 +401,9 @@ def train_model(train_data, valid_data, device): # add the null labels for special tokens like [SEP], [CLS], etc final_train_labels = add_null_labels(train_token_ids, train_labels, label_mapping) # convert the processed dataset to tensors - pt_train_token_ids, pt_train_attention_masks, pt_train_labels = convert_to_tensors(train_token_ids, - train_attention_masks, - final_train_labels) + pt_train_token_ids, pt_train_attention_masks, pt_train_labels = convert_to_tensors( + train_token_ids, train_attention_masks, final_train_labels + ) # process the validation dataset # parse the conll format @@ -405,9 +413,9 @@ def train_model(train_data, valid_data, device): # add the null labels for special tokens like [SEP], [CLS], etc final_valid_labels = add_null_labels(valid_token_ids, valid_labels, label_mapping) # convert the processed dataset to tensors - pt_valid_token_ids, pt_valid_attention_masks, pt_valid_labels = convert_to_tensors(valid_token_ids, - valid_attention_masks, - final_valid_labels) + pt_valid_token_ids, pt_valid_attention_masks, pt_valid_labels = convert_to_tensors( + valid_token_ids, valid_attention_masks, final_valid_labels + ) # process the test dataset # parse the conll format test_sentences, test_labels, _ = parse_conll("test.txt") @@ -416,16 +424,22 @@ def train_model(train_data, valid_data, device): # add the null labels for special tokens like [SEP], [CLS], etc final_test_labels = add_null_labels(test_token_ids, test_labels, label_mapping) # convert the processed dataset to tensors - pt_test_token_ids, pt_test_attention_masks, pt_test_labels = convert_to_tensors(test_token_ids, - test_attention_masks, - final_test_labels) + pt_test_token_ids, pt_test_attention_masks, pt_test_labels = convert_to_tensors( + test_token_ids, test_attention_masks, final_test_labels + ) # Convert the training inputs into a TensorDataset. - train_dataset = TensorDataset(pt_train_token_ids, pt_train_attention_masks, pt_train_labels) + train_dataset = TensorDataset( + pt_train_token_ids, pt_train_attention_masks, pt_train_labels + ) # Convert the validation inputs into a TensorDataset. - valid_dataset = TensorDataset(pt_valid_token_ids, pt_valid_attention_masks, pt_valid_labels) + valid_dataset = TensorDataset( + pt_valid_token_ids, pt_valid_attention_masks, pt_valid_labels + ) # Convert the test inputs into a TensorDataset. - test_dataset = TensorDataset(pt_test_token_ids, pt_test_attention_masks, pt_test_labels) + test_dataset = TensorDataset( + pt_test_token_ids, pt_test_attention_masks, pt_test_labels + ) # The DataLoader needs to know our batch size for training, so we specify it # here. For fine-tuning BERT on a specific task, the authors recommend a batch @@ -437,20 +451,21 @@ def train_model(train_data, valid_data, device): train_dataloader = DataLoader( train_dataset, # The training samples. sampler=RandomSampler(train_dataset), # Select batches randomly - batch_size=batch_size # Trains with this batch size. + batch_size=batch_size, # Trains with this batch size. ) # For validation the order doesn't matter, so we'll just read them sequentially. validation_dataloader = DataLoader( valid_dataset, # The validation samples. sampler=SequentialSampler(valid_dataset), # Pull out batches sequentially. - batch_size=batch_size # Evaluate with this batch size. + batch_size=batch_size, # Evaluate with this batch size. 
) # Load BertForTokenClassification model = BertForTokenClassification.from_pretrained( "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab. - num_labels=len(label_mapping) + 1, # The number of output labels--18 for our NER dataset + num_labels=len(label_mapping) + + 1, # The number of output labels--18 for our NER dataset output_attentions=False, # Whether the model returns attentions weights. output_hidden_states=False, # Whether the model returns all hidden-states. ) @@ -459,10 +474,9 @@ def train_model(train_data, valid_data, device): model.cuda() # Load the AdamW optimizer - optimizer = AdamW(model.parameters(), - lr=5e-5, # args.learning_rate - eps=1e-8 # args.adam_epsilon - ) + optimizer = AdamW( + model.parameters(), lr=5e-5, eps=1e-8 # args.learning_rate # args.adam_epsilon + ) # Number of training epochs epochs = 4 @@ -471,26 +485,30 @@ def train_model(train_data, valid_data, device): total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. - scheduler = get_linear_schedule_with_warmup(optimizer, - num_warmup_steps=0, - num_training_steps=total_steps) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=0, num_training_steps=total_steps + ) device = get_device() - train_model(train_data=train_dataloader, - valid_data=validation_dataloader, - device=device) + train_model( + train_data=train_dataloader, valid_data=validation_dataloader, device=device + ) # Prediction on test set # Set the batch size. batch_size = 32 # Create the DataLoader. - prediction_data = TensorDataset(pt_test_token_ids, pt_test_attention_masks, pt_test_labels) + prediction_data = TensorDataset( + pt_test_token_ids, pt_test_attention_masks, pt_test_labels + ) prediction_sampler = SequentialSampler(prediction_data) - prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size) + prediction_dataloader = DataLoader( + prediction_data, sampler=prediction_sampler, batch_size=batch_size + ) - print('Predicting labels for {:,} test sentences...'.format(len(pt_test_token_ids))) + print("Predicting labels for {:,} test sentences...".format(len(pt_test_token_ids))) # Put model in evaluation mode model.eval() @@ -510,22 +528,24 @@ def train_model(train_data, valid_data, device): # speeding up prediction with torch.no_grad(): # Forward pass, calculate logit predictions - result = model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask, - return_dict=True) + result = model( + b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask, + return_dict=True, + ) logits = result.logits # Move logits and labels to CPU logits = logits.detach().cpu().numpy() - label_ids = b_labels.to('cpu').numpy() + label_ids = b_labels.to("cpu").numpy() # Store predictions and true labels predictions.append(logits) true_labels.append(label_ids) - print(' DONE.') + print(" DONE.") # First, combine the results across the batches. 
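    # `predictions` is a list of (batch_size, seq_len, num_labels) logit
    # arrays and `true_labels` a list of matching (batch_size, seq_len)
    # label-id arrays, so after concatenating, an argmax over the last axis
    # yields one predicted label id per token.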
all_predictions = np.concatenate(predictions, axis=0)

@@ -569,11 +589,17 @@ def train_model(train_data, valid_data, device):
        real_token_predictions.append(predicted_label_ids[i])
        real_token_labels.append(all_true_labels[i])

-    print("Before filtering out `null` tokens, length = {:,}".format(len(all_true_labels)))
-    print("  After filtering out `null` tokens, length = {:,}".format(len(real_token_labels)))
+    print(
+        "Before filtering out `null` tokens, length = {:,}".format(len(all_true_labels))
+    )
+    print(
+        "  After filtering out `null` tokens, length = {:,}".format(
+            len(real_token_labels)
+        )
+    )

    # Calculate the F1 score. Because this is a multi-class problem, we have
    # to set the `average` parameter. `micro` pools true/false positives and
    # false negatives over all tokens before computing one global score,
    # instead of averaging per-class scores.
-    f1 = f1_score(real_token_labels, real_token_predictions, average='micro')
+    f1 = f1_score(real_token_labels, real_token_predictions, average="micro")

    print("F1 score: {:.2%}".format(f1))
diff --git a/ldkp_amardeep.py b/ldkp_amardeep.py
new file mode 100644
index 0000000..5ba5b61
--- /dev/null
+++ b/ldkp_amardeep.py
@@ -0,0 +1,2919 @@
+# -*- coding: utf-8 -*-
+"""long-document-kp Amardeep.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1GNzTFF75dQUrgXOiSteZ59S5tt6odWgZ
+
+# **this notebook is under development; don't run or change anything**
+"""
+
+# !pip install transformers
+# !pip install datasets
+# !pip install seqeval
+# !pip install flair
+
+"""# trim dataset function
+
+"""
+
+
+def trim_file(fin, fout=None, n=10000):
+    import json
+
+    if fout is None:
+        fout = fin[:-5] + str(n) + ".json"
+    with open(fin, "r") as fi:
+        with open(fout, "w") as fo:
+            for x in fi:
+                if n > 0:
+                    fo.write(x)
+                    n -= 1
+
+
+# trim_file("/content/drive/MyDrive/long_document_ke/train.json", n=5000)
+
+# trim_file("/content/drive/MyDrive/long_document_ke/text_rank_conll_kp20_proc.json", n=500)
+
+"""# Reformer for token classification"""
+
+# long_doc_kp_models.py
+# all long-document models related to KP
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for token classification.
+"""
+# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as
+# comments.
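+# A minimal usage sketch for the LONG_DOC_KP_MODELS registry defined further
+# below (the checkpoint name and num_labels are illustrative, not fixed here):
+#
+#   config = AutoConfig.from_pretrained("allenai/longformer-base-4096", num_labels=3)
+#   tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
+#   model = LONG_DOC_KP_MODELS["longformer"].from_pretrained(
+#       "allenai/longformer-base-4096", config=config
+#   )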
+ +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +# logging.set_global_logging_level(logging.INFO) +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric + +import transformers +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + AutoModel, + DataCollatorForTokenClassification, + HfArgumentParser, + PreTrainedTokenizerFast, + Trainer, + TrainingArguments, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process + + +logger = logging.getLogger(__name__) + +from transformers.models.reformer.modeling_reformer import * +from transformers import ( + LongformerForTokenClassification, + # BigBirdForTokenClassification +) + + +class ReformerForTokenClassification(ReformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.reformer = ReformerModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + num_hashes=None, + labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.reformer( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + num_hashes=num_hashes, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + # if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + +LONG_DOC_KP_MODELS = { + "longformer": LongformerForTokenClassification, + "reformer": ReformerForTokenClassification, +} + + +"""#CRF module + +## crf algo utils +""" + +""" +Conditional random field +""" +from typing import List, Tuple, Dict, Union + +import torch + +# from allennlp.common.checks import ConfigurationError +# import allennlp.nn.util as util + +VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score + + +def allowed_transitions( + constraint_type: str, labels: Dict[int, str] +) -> List[Tuple[int, int]]: + """ + Given labels and a constraint type, returns the allowed transitions. It will + additionally include transitions for the start and end states, which are used + by the conditional random field. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + labels : `Dict[int, str]`, required + A mapping {label_id -> label}. 
Most commonly this would be the value from + Vocabulary.get_index_to_token_vocabulary() + # Returns + `List[Tuple[int, int]]` + The allowed transitions (from_label_id, to_label_id). + """ + num_labels = len(labels) + start_tag = num_labels + end_tag = num_labels + 1 + labels_with_boundaries = list(labels.items()) + [ + (start_tag, "START"), + (end_tag, "END"), + ] + + allowed = [] + for from_label_index, from_label in labels_with_boundaries: + if from_label in ("START", "END"): + from_tag = from_label + from_entity = "" + else: + from_tag = from_label[0] + from_entity = from_label[1:] + for to_label_index, to_label in labels_with_boundaries: + if to_label in ("START", "END"): + to_tag = to_label + to_entity = "" + else: + to_tag = to_label[0] + to_entity = to_label[1:] + if is_transition_allowed( + constraint_type, from_tag, from_entity, to_tag, to_entity + ): + allowed.append((from_label_index, to_label_index)) + return allowed + + +def is_transition_allowed( + constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str +): + """ + Given a constraint type and strings `from_tag` and `to_tag` that + represent the origin and destination of the transition, return whether + the transition is allowed under the given constraint type. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + from_tag : `str`, required + The tag that the transition originates from. For example, if the + label is `I-PER`, the `from_tag` is `I`. + from_entity : `str`, required + The entity corresponding to the `from_tag`. For example, if the + label is `I-PER`, the `from_entity` is `PER`. + to_tag : `str`, required + The tag that the transition leads to. For example, if the + label is `I-PER`, the `to_tag` is `I`. + to_entity : `str`, required + The entity corresponding to the `to_tag`. For example, if the + label is `I-PER`, the `to_entity` is `PER`. + # Returns + `bool` + Whether the transition is allowed under the given `constraint_type`. + """ + + if to_tag == "START" or from_tag == "END": + # Cannot transition into START or from END + return False + + if constraint_type == "BIOUL": + if from_tag == "START": + return to_tag in ("O", "B", "U") + if to_tag == "END": + return from_tag in ("O", "L", "U") + return any( + [ + # O can transition to O, B-* or U-* + # L-x can transition to O, B-*, or U-* + # U-x can transition to O, B-*, or U-* + from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"), + # B-x can only transition to I-x or L-x + # I-x can only transition to I-x or L-x + from_tag in ("B", "I") + and to_tag in ("I", "L") + and from_entity == to_entity, + ] + ) + elif constraint_type == "BIO": + if from_tag == "START": + return to_tag in ("O", "B") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or B-x + to_tag in ("O", "B"), + # Can only transition to I-x from B-x or I-x + to_tag == "I" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "IOB1": + if from_tag == "START": + return to_tag in ("O", "I") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or I-x + to_tag in ("O", "I"), + # Can only transition to B-x from B-x or I-x, where + # x is the same tag. 
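+    # Worked example for the "BIO" case above, with the tag set this repo
+    # uses ({0: "B", 1: "I", 2: "O"}, START=3, END=4):
+    #   allowed_transitions("BIO", {0: "B", 1: "I", 2: "O"}) permits e.g.
+    #   START->B (3, 0), B->I (0, 1), I->I (1, 1), B->B (0, 0), O->O (2, 2),
+    #   but not O->I (2, 1), since an I tag must follow a B or I tag.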
+ to_tag == "B" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "BMES": + if from_tag == "START": + return to_tag in ("B", "S") + if to_tag == "END": + return from_tag in ("E", "S") + return any( + [ + # Can only transition to B or S from E or S. + to_tag in ("B", "S") and from_tag in ("E", "S"), + # Can only transition to M-x from B-x, where + # x is the same tag. + to_tag == "M" and from_tag in ("B", "M") and from_entity == to_entity, + # Can only transition to E-x from B-x or M-x, where + # x is the same tag. + to_tag == "E" and from_tag in ("B", "M") and from_entity == to_entity, + ] + ) + else: + print("error in constrint type") + + +def logsumexp( + tensor: torch.Tensor, dim: int = -1, keepdim: bool = False +) -> torch.Tensor: + """ + A numerically stable computation of logsumexp. This is mathematically equivalent to + `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log + probabilities. + # Parameters + tensor : `torch.FloatTensor`, required. + A tensor of arbitrary size. + dim : `int`, optional (default = `-1`) + The dimension of the tensor to apply the logsumexp to. + keepdim: `bool`, optional (default = `False`) + Whether to retain a dimension of size one at the dimension we reduce over. + """ + max_score, _ = tensor.max(dim, keepdim=keepdim) + if keepdim: + stable_vec = tensor - max_score + else: + stable_vec = tensor - max_score.unsqueeze(dim) + return max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log() + + +"""## vertibe decode""" + + +def viterbi_decode( + tag_sequence: torch.Tensor, + transition_matrix: torch.Tensor, + tag_observations: Optional[List[int]] = None, + allowed_start_transitions: torch.Tensor = None, + allowed_end_transitions: torch.Tensor = None, + top_k: int = None, +): + """ + Perform Viterbi decoding in log space over a sequence given a transition matrix + specifying pairwise (transition) potentials between tags and a matrix of shape + (sequence_length, num_tags) specifying unary potentials for possible tags per + timestep. + # Parameters + tag_sequence : `torch.Tensor`, required. + A tensor of shape (sequence_length, num_tags) representing scores for + a set of tags over a given sequence. + transition_matrix : `torch.Tensor`, required. + A tensor of shape (num_tags, num_tags) representing the binary potentials + for transitioning between a given pair of tags. + tag_observations : `Optional[List[int]]`, optional, (default = `None`) + A list of length `sequence_length` containing the class ids of observed + elements in the sequence, with unobserved elements being set to -1. Note that + it is possible to provide evidence which results in degenerate labelings if + the sequences of tags you provide as evidence cannot transition between each + other, or those transitions are extremely unlikely. In this situation we log a + warning, but the responsibility for providing self-consistent evidence ultimately + lies with the user. + allowed_start_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags the START token + may transition *to*. If provided, additional transition constraints will be used for + determining the start element of the sequence. + allowed_end_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags may transition *to* the + end tag. 
If provided, additional transition constraints will be used for determining + the end element of the sequence. + top_k : `int`, optional, (default = `None`) + Optional integer specifying how many of the top paths to return. For top_k>=1, returns + a tuple of two lists: top_k_paths, top_k_scores, For top_k==None, returns a flattened + tuple with just the top path and its score (not in lists, for backwards compatibility). + # Returns + viterbi_path : `List[int]` + The tag indices of the maximum likelihood tag sequence. + viterbi_score : `torch.Tensor` + The score of the viterbi path. + """ + if top_k is None: + top_k = 1 + flatten_output = True + elif top_k >= 1: + flatten_output = False + else: + raise ValueError( + f"top_k must be either None or an integer >=1. Instead received {top_k}" + ) + + sequence_length, num_tags = list(tag_sequence.size()) + + has_start_end_restrictions = ( + allowed_end_transitions is not None or allowed_start_transitions is not None + ) + + if has_start_end_restrictions: + + if allowed_end_transitions is None: + allowed_end_transitions = torch.zeros(num_tags) + if allowed_start_transitions is None: + allowed_start_transitions = torch.zeros(num_tags) + + num_tags = num_tags + 2 + new_transition_matrix = torch.zeros(num_tags, num_tags) + new_transition_matrix[:-2, :-2] = transition_matrix + + # Start and end transitions are fully defined, but cannot transition between each other. + + allowed_start_transitions = torch.cat( + [allowed_start_transitions, torch.tensor([-math.inf, -math.inf])] + ) + allowed_end_transitions = torch.cat( + [allowed_end_transitions, torch.tensor([-math.inf, -math.inf])] + ) + + # First define how we may transition FROM the start and end tags. + new_transition_matrix[-2, :] = allowed_start_transitions + # We cannot transition from the end tag to any tag. + new_transition_matrix[-1, :] = -math.inf + + new_transition_matrix[:, -1] = allowed_end_transitions + # We cannot transition to the start tag from any tag. + new_transition_matrix[:, -2] = -math.inf + + transition_matrix = new_transition_matrix + + if tag_observations: + if len(tag_observations) != sequence_length: + raise ConfigurationError( + "Observations were provided, but they were not the same length " + "as the sequence. Found sequence of length: {} and evidence: {}".format( + sequence_length, tag_observations + ) + ) + else: + tag_observations = [-1 for _ in range(sequence_length)] + + if has_start_end_restrictions: + tag_observations = [num_tags - 2] + tag_observations + [num_tags - 1] + zero_sentinel = torch.zeros(1, num_tags) + extra_tags_sentinel = torch.ones(sequence_length, 2) * -math.inf + tag_sequence = torch.cat([tag_sequence, extra_tags_sentinel], -1) + tag_sequence = torch.cat([zero_sentinel, tag_sequence, zero_sentinel], 0) + sequence_length = tag_sequence.size(0) + + path_scores = [] + path_indices = [] + + if tag_observations[0] != -1: + one_hot = torch.zeros(num_tags) + one_hot[tag_observations[0]] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[0, :].unsqueeze(0)) + + # Evaluate the scores for all possible paths. + for timestep in range(1, sequence_length): + # Add pairwise potentials to current scores. + summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix + summed_potentials = summed_potentials.view(-1, num_tags) + + # Best pairwise potential path score from the previous timestep. 
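+        # path_scores[timestep - 1] has shape (k, num_tags); unsqueeze(2)
+        # broadcasts it against the (num_tags, num_tags) transition matrix to
+        # (k, num_tags, num_tags), where entry [i, from, to] is the score of
+        # the i-th best path ending in `from` plus the from->to transition.
+        # Flattening to (k * num_tags, num_tags) lets topk pick, for every
+        # destination tag, the k best (path, predecessor) combinations.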
+ max_k = min(summed_potentials.size()[0], top_k) + scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) + + # If we have an observation for this timestep, use it + # instead of the distribution over tags. + observation = tag_observations[timestep] + # Warn the user if they have passed + # invalid/extremely unlikely evidence. + if tag_observations[timestep - 1] != -1 and observation != -1: + if transition_matrix[tag_observations[timestep - 1], observation] < -10000: + logger.warning( + "The pairwise potential between tags you have passed as " + "observations is extremely unlikely. Double check your evidence " + "or transition potentials!" + ) + if observation != -1: + one_hot = torch.zeros(num_tags) + one_hot[observation] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[timestep, :] + scores) + path_indices.append(paths.squeeze()) + + # Construct the most likely sequence backwards. + path_scores_v = path_scores[-1].view(-1) + max_k = min(path_scores_v.size()[0], top_k) + viterbi_scores, best_paths = torch.topk(path_scores_v, k=max_k, dim=0) + viterbi_paths = [] + for i in range(max_k): + viterbi_path = [best_paths[i]] + for backward_timestep in reversed(path_indices): + viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) + # Reverse the backward path. + viterbi_path.reverse() + + if has_start_end_restrictions: + viterbi_path = viterbi_path[1:-1] + + # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. + viterbi_path = [j % num_tags for j in viterbi_path] + viterbi_paths.append(viterbi_path) + + if flatten_output: + return viterbi_paths[0], viterbi_scores[0] + + return viterbi_paths, viterbi_scores + + +"""## crf algorithm""" + + +class ConditionalRandomField(torch.nn.Module): + """ + This module uses the "forward-backward" algorithm to compute + the log-likelihood of its inputs assuming a conditional random field model. + See, e.g. http://www.cs.columbia.edu/~mcollins/fb.pdf + # Parameters + num_tags : `int`, required + The number of tags. + constraints : `List[Tuple[int, int]]`, optional (default = `None`) + An optional list of allowed transitions (from_tag_id, to_tag_id). + These are applied to `viterbi_tags()` but do not affect `forward()`. + These should be derived from `allowed_transitions` so that the + start and end transitions are handled correctly for your tag type. + include_start_end_transitions : `bool`, optional (default = `True`) + Whether to include the start and end transition parameters. + """ + + # def __init__( + # self, + # num_tags: int, + # constraints: List[Tuple[int, int]] = None, + # include_start_end_transitions: bool = True, + # ) -> None: + def __init__( + self, + num_tags: int, + label_encoding, + idx2tag, + include_start_end_transitions: bool = True, + ) -> None: + super().__init__() + self.num_tags = num_tags + constraints = allowed_transitions(label_encoding, idx2tag) + # transitions[i, j] is the logit for transitioning from state i to state j. + self.transitions = torch.nn.Parameter(torch.Tensor(num_tags, num_tags)) + + # _constraint_mask indicates valid transitions (based on supplied constraints). + # Include special start of sequence (num_tags + 1) and end of sequence tags (num_tags + 2) + if constraints is None: + # All transitions are valid. 
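+            # The mask is (num_tags + 2) x (num_tags + 2): the two extra
+            # rows/columns are the implicit START (index num_tags) and END
+            # (index num_tags + 1) states, so with 3 tags entry [3, 0] says
+            # whether START -> tag 0 is a legal transition.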
+ constraint_mask = torch.Tensor(num_tags + 2, num_tags + 2).fill_(1.0) + else: + constraint_mask = torch.Tensor(num_tags + 2, num_tags + 2).fill_(0.0) + for i, j in constraints: + constraint_mask[i, j] = 1.0 + + self._constraint_mask = torch.nn.Parameter(constraint_mask, requires_grad=False) + + # Also need logits for transitioning from "start" state and to "end" state. + self.include_start_end_transitions = include_start_end_transitions + if include_start_end_transitions: + self.start_transitions = torch.nn.Parameter(torch.Tensor(num_tags)) + self.end_transitions = torch.nn.Parameter(torch.Tensor(num_tags)) + + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.xavier_normal_(self.transitions) + if self.include_start_end_transitions: + torch.nn.init.normal_(self.start_transitions) + torch.nn.init.normal_(self.end_transitions) + + def _input_likelihood( + self, logits: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: + """ + Computes the (batch_size,) denominator term for the log-likelihood, which is the + sum of the likelihoods across all possible state sequences. + """ + batch_size, sequence_length, num_tags = logits.size() + + # Transpose batch size and sequence dimensions + mask = mask.transpose(0, 1).contiguous() + logits = logits.transpose(0, 1).contiguous() + + # Initial alpha is the (batch_size, num_tags) tensor of likelihoods combining the + # transitions to the initial states and the logits for the first timestep. + if self.include_start_end_transitions: + alpha = self.start_transitions.view(1, num_tags) + logits[0] + else: + alpha = logits[0] + + # For each i we compute logits for the transitions from timestep i-1 to timestep i. + # We do so in a (batch_size, num_tags, num_tags) tensor where the axes are + # (instance, current_tag, next_tag) + for i in range(1, sequence_length): + # The emit scores are for time i ("next_tag") so we broadcast along the current_tag axis. + emit_scores = logits[i].view(batch_size, 1, num_tags) + # Transition scores are (current_tag, next_tag) so we broadcast along the instance axis. + transition_scores = self.transitions.view(1, num_tags, num_tags) + # Alpha is for the current_tag, so we broadcast along the next_tag axis. + broadcast_alpha = alpha.view(batch_size, num_tags, 1) + + # Add all the scores together and logexp over the current_tag axis. + inner = broadcast_alpha + emit_scores + transition_scores + + # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension + # of `inner`. Otherwise (mask == False) we want to retain the previous alpha. + alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * ( + ~mask[i] + ).view(batch_size, 1) + + # Every sequence needs to end with a transition to the stop_tag. 
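+        # `alpha` is now (batch_size, num_tags): the total score of all paths
+        # ending in each tag. Adding the end transitions and logsumexp-ing
+        # over the tag axis yields the (batch_size,) log partition function,
+        # the denominator of the CRF log-likelihood.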
+ if self.include_start_end_transitions: + stops = alpha + self.end_transitions.view(1, num_tags) + else: + stops = alpha + + # Finally we log_sum_exp along the num_tags dim, result is (batch_size,) + return logsumexp(stops) + + def _joint_likelihood( + self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: + """ + Computes the numerator term for the log-likelihood, which is just score(inputs, tags) + """ + batch_size, sequence_length, _ = logits.data.shape + + # Transpose batch size and sequence dimensions: + logits = logits.transpose(0, 1).contiguous() + mask = mask.transpose(0, 1).contiguous() + tags = tags.transpose(0, 1).contiguous() + + # Start with the transition scores from start_tag to the first tag in each input + if self.include_start_end_transitions: + score = self.start_transitions.index_select(0, tags[0]) + else: + score = 0.0 + + # Add up the scores for the observed transitions and all the inputs but the last + # print(mask.shape, tags.shape, logits.shape, sequence_length) + for i in range(sequence_length - 1): + # Each is shape (batch_size,) + current_tag, next_tag = tags[i], tags[i + 1] + # print(current_tag, next_tag) + # print("tags printiiinggggg") + # print(current_tag, next_tag) + # The scores for transitioning from current_tag to next_tag + transition_score = self.transitions[current_tag.view(-1), next_tag.view(-1)] + + # The score for using current_tag + emit_score = logits[i].gather(1, current_tag.view(batch_size, 1)).squeeze(1) + # emit_score= 0 + # Include transition score if next element is unmasked, + # input_score if this element is unmasked. + score = score + transition_score * mask[i + 1] + emit_score * mask[i] + + # Transition from last state to "stop" state. To start with, we need to find the last tag + # for each instance. + last_tag_index = mask.sum(0).long() - 1 + last_tags = tags.gather(0, last_tag_index.view(1, batch_size)).squeeze(0) + + # Compute score of transitioning to `stop_tag` from each "last tag". + if self.include_start_end_transitions: + last_transition_score = self.end_transitions.index_select(0, last_tags) + else: + last_transition_score = 0.0 + + # Add the last input if it's not masked. + last_inputs = logits[-1] # (batch_size, num_tags) + last_input_score = last_inputs.gather( + 1, last_tags.view(-1, 1) + ) # (batch_size, 1) + last_input_score = last_input_score.squeeze() # (batch_size,) + + score = score + last_transition_score + last_input_score * mask[-1] + + return score + + def forward( + self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None + ) -> torch.Tensor: + """ + Computes the log likelihood. + """ + # mask[tags==-100]=0 + if mask is None: + mask = torch.ones(*tags.size(), dtype=torch.bool) + else: + # The code below fails in weird ways if this isn't a bool tensor, so we make sure. + mask = mask.to(torch.bool) + # print("forward",inputs.shape, tags.shape, mask.shape) + + log_denominator = self._input_likelihood(inputs, mask) + # temp_tags= tags + # tags[tags==-100]=2 + # print(tags[0]) + log_numerator = self._joint_likelihood(inputs, tags, mask) + # tags[mask==0]=-100 + return torch.sum(log_numerator - log_denominator) + + def viterbi_tags( + self, logits: torch.Tensor, mask: torch.BoolTensor = None, top_k: int = None + ) -> Union[List[VITERBI_DECODING], List[List[VITERBI_DECODING]]]: + """ + Uses viterbi algorithm to find most likely tags for the given inputs. + If constraints are applied, disallows all other transitions. 
+ Returns a list of results, of the same size as the batch (one result per batch member) + Each result is a List of length top_k, containing the top K viterbi decodings + Each decoding is a tuple (tag_sequence, viterbi_score) + For backwards compatibility, if top_k is None, then instead returns a flat list of + tag sequences (the top tag sequence for each batch item). + """ + if mask is None: + mask = torch.ones(*logits.shape[:2], dtype=torch.bool, device=logits.device) + + if top_k is None: + top_k = 1 + flatten_output = True + else: + flatten_output = False + + _, max_seq_length, num_tags = logits.size() + + # Get the tensors out of the variables + logits, mask = logits.data, mask.data + + # Augment transitions matrix with start and end transitions + start_tag = num_tags + end_tag = num_tags + 1 + transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.0) + + # Apply transition constraints + constrained_transitions = self.transitions * self._constraint_mask[ + :num_tags, :num_tags + ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags]) + transitions[:num_tags, :num_tags] = constrained_transitions.data + + if self.include_start_end_transitions: + transitions[ + start_tag, :num_tags + ] = self.start_transitions.detach() * self._constraint_mask[ + start_tag, :num_tags + ].data + -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[ + :num_tags, end_tag + ] = self.end_transitions.detach() * self._constraint_mask[ + :num_tags, end_tag + ].data + -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) + else: + transitions[start_tag, :num_tags] = -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[:num_tags, end_tag] = -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) + + best_paths = [] + # Pad the max sequence length by 2 to account for start_tag + end_tag. + tag_sequence = torch.Tensor(max_seq_length + 2, num_tags + 2) + + for prediction, prediction_mask in zip(logits, mask): + mask_indices = prediction_mask.nonzero(as_tuple=False).squeeze() + masked_prediction = torch.index_select(prediction, 0, mask_indices) + sequence_length = masked_prediction.shape[0] + + # Start with everything totally unlikely + tag_sequence.fill_(-10000.0) + # At timestep 0 we must have the START_TAG + tag_sequence[0, start_tag] = 0.0 + # At steps 1, ..., sequence_length we just use the incoming prediction + tag_sequence[1 : (sequence_length + 1), :num_tags] = masked_prediction + # And at the last timestep we must have the END_TAG + tag_sequence[sequence_length + 1, end_tag] = 0.0 + + # We pass the tags and the transitions to `viterbi_decode`. + viterbi_paths, viterbi_scores = viterbi_decode( + tag_sequence=tag_sequence[: (sequence_length + 2)], + transition_matrix=transitions, + top_k=top_k, + ) + top_k_paths = [] + for viterbi_path, viterbi_score in zip(viterbi_paths, viterbi_scores): + # Get rid of START and END sentinels and append. 
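+                # Each decoding is a (tag_indices, viterbi_score) pair; with
+                # top_k left as None the caller gets one such pair per batch
+                # element, and the indices map back to tag strings (e.g.
+                # "B"/"I"/"O") through the idx2tag passed to the constructor.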
+                    viterbi_path = viterbi_path[1:-1]
+                top_k_paths.append((viterbi_path, viterbi_score.item()))
+            best_paths.append(top_k_paths)
+
+        if flatten_output:
+            return [top_k_paths[0] for top_k_paths in best_paths]
+
+        return best_paths
+
+
+"""# CRF Models for Token Classification
+
+## Bert CRF
+"""
+
+from transformers import (
+    AutoModelForPreTraining,
+    AutoModel,
+    BertModel,
+    BertPreTrainedModel,
+    LongformerModel,
+    PreTrainedModel,
+)
+from transformers.modeling_outputs import TokenClassifierOutput
+import collections
+
+
+class BERT_CRFforTokenClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # self.crf= nn.Linear(config.num_labels,1)
+        # self.crf= ConditionalRandomField(self.num_labels)
+        self.crf = ConditionalRandomField(
+            self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "O"}
+        )
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=None,
+        output_attentions=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.bert(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+        loss = None
+        if labels is not None:
+            loss = -self.crf(logits, labels, attention_mask)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        # print(self.crf.transitions)
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def freeze_till_clf(self):
+        for param in self.bert.parameters():
+            param.requires_grad = False
+        for param in self.dropout.parameters():
+            param.requires_grad = False
+        for param in self.classifier.parameters():
+            param.requires_grad = False
+
+    def freeze_encoder_layer(self):
+        for param in self.bert.parameters():
+            param.requires_grad = False
+
+    # return ((loss,) + output) if loss is not None else output
+
+
+"""## longformer CRF"""
+
+from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel
+
+
+class Longformer_CRFforTokenClassification(LongformerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.longformer = LongformerModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # self.crf= nn.Linear(config.num_labels,1)
+        self.crf = ConditionalRandomField(
+            self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "O"}
+        )
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=None,
+        output_attentions=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.longformer(
+            input_ids,
+            position_ids=position_ids,
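+            # no global_attention_mask is passed here, so every token attends
+            # through Longformer's local sliding window only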
attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + loss = None + if labels is not None: + loss = -self.crf(logits, labels, attention_mask) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def freeze_till_clf(self): + for param in self.longformer.parameters(): + param.requires_grad = False + for param in self.dropout.parameters(): + param.requires_grad = False + for param in self.classifier.parameters(): + param.requires_grad = False + + # def freeze_encoder_layer(self): + # for param in self.longformer.parameters(): + # param.requires_grad = Falsefreeze_till_clfr=True): + # super().__init__(config) + # self.config = config + + # self.embeddings = BertEmbeddings(config) + # self.encoder = BertEncoder(config) + + # self.pooler = BertPooler(config) if add_pooling_layer else None + + # self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + guide_embed=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
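+        guide_embed (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`, `optional`):
+            If provided, the embedding of the first ([CLS]) token is overwritten with this vector before the
+            encoder runs (see the assignment to ``embedding_output[:, 0, :]`` below), so the document is encoded
+            conditioned on an externally supplied guidance vector. A minimal, illustrative call, where the
+            ``guide`` tensor is a stand-in for a real guidance embedding::
+
+                guide = torch.randn(batch_size, config.hidden_size)
+                outputs = model(input_ids=input_ids, attention_mask=attention_mask, guide_embed=guide)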
+ """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + # assert guide_embed is not None + if guide_embed is not None: + embedding_output[:, 0, :] = guide_embed + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = ( + self.pooler(sequence_output) if self.pooler is 
not None else None + ) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +"""##modi bert token clf + +""" + + +class ModiBertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = ModiBertModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + guide_embed=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + guide_embed=guide_embed, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # loss_fct = CrossEntropyLoss(weight=torch.tensor([0.4,0.35,0.25], device= labels.device)) + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +"""#Modi Longformer + +## modi base long +""" + +from transformers.models.longformer.modeling_longformer import * + + +class ModiLongformerModel(LongformerPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + if isinstance(config.attention_window, int): + assert ( + config.attention_window % 2 == 0 + ), "`config.attention_window` has to be an even value" + assert ( + config.attention_window > 0 + ), "`config.attention_window` has to be positive" + config.attention_window = [ + config.attention_window + ] * config.num_hidden_layers # one value per layer + else: + assert len(config.attention_window) == config.num_hidden_layers, ( + "`len(config.attention_window)` should equal 
`config.num_hidden_layers`. " + f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}" + ) + + self.embeddings = LongformerEmbeddings(config) + self.encoder = LongformerEncoder(config) + self.pooler = LongformerPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def _pad_to_window_size( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + token_type_ids: torch.Tensor, + position_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + pad_token_id: int, + ): + """A helper function to pad tokens and mask to work with implementation of Longformer self-attention.""" + # padding + attention_window = ( + self.config.attention_window + if isinstance(self.config.attention_window, int) + else max(self.config.attention_window) + ) + + assert ( + attention_window % 2 == 0 + ), f"`attention_window` should be an even value. Given {attention_window}" + input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape + batch_size, seq_len = input_shape[:2] + + padding_len = (attention_window - seq_len % attention_window) % attention_window + if padding_len > 0: + logger.info( + f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " + f"`config.attention_window`: {attention_window}" + ) + if input_ids is not None: + input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) + if position_ids is not None: + # pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings + position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id) + if inputs_embeds is not None: + input_ids_padding = inputs_embeds.new_full( + (batch_size, padding_len), + self.config.pad_token_id, + dtype=torch.long, + ) + inputs_embeds_padding = self.embeddings(input_ids_padding) + inputs_embeds = torch.cat( + [inputs_embeds, inputs_embeds_padding], dim=-2 + ) + + attention_mask = F.pad( + attention_mask, (0, padding_len), value=False + ) # no attention on the padding tokens + token_type_ids = F.pad( + token_type_ids, (0, padding_len), value=0 + ) # pad with token_type_id = 0 + + return ( + padding_len, + input_ids, + attention_mask, + token_type_ids, + position_ids, + inputs_embeds, + ) + + def _merge_to_attention_mask( + self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor + ): + # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) + # (global_attention_mask + 1) => 1 for local attention, 2 for global attention + # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention + if attention_mask is not None: + attention_mask = attention_mask * (global_attention_mask + 1) + else: + # simply use `global_attention_mask` as `attention_mask` + # if no `attention_mask` is given + attention_mask = global_attention_mask + 1 + return attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + global_attention_mask=None, + head_mask=None, + token_type_ids=None, + guide_embed=None, + position_ids=None, + inputs_embeds=None, + 
output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # merge `global_attention_mask` and `attention_mask` + if global_attention_mask is not None: + attention_mask = self._merge_to_attention_mask( + attention_mask, global_attention_mask + ) + + ( + padding_len, + input_ids, + attention_mask, + token_type_ids, + position_ids, + inputs_embeds, + ) = self._pad_to_window_size( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + pad_token_id=self.config.pad_token_id, + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
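+        # Sketch of the mask arithmetic that follows, assuming the usual
+        # (1.0 - mask) * -10000.0 formulation of `get_extended_attention_mask`:
+        #   merged mask 0 (padding)     -> -10000.0 (masked out)
+        #   merged mask 1 (local attn)  ->      0.0
+        #   merged mask 2 (global attn) ->  10000.0
+        # The [:, 0, 0, :] slice below then drops the two broadcast axes so the
+        # Longformer encoder receives a (batch_size, seq_len) tensor again.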
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device + )[:, 0, 0, :] + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + if guide_embed is not None: + embedding_output[:, 0, :] = guide_embed + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = ( + self.pooler(sequence_output) if self.pooler is not None else None + ) + + # undo padding + if padding_len > 0: + # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1) + sequence_output = sequence_output[:, :-padding_len] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return LongformerBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + global_attentions=encoder_outputs.global_attentions, + ) + + +"""## modi long for token""" + + +class ModiLongformerForTokenClassification(LongformerPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.longformer = ModiLongformerModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + global_attention_mask=None, + head_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + guide_embed=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.longformer( + input_ids, + attention_mask=attention_mask, + global_attention_mask=global_attention_mask, + head_mask=head_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # loss_fct = CrossEntropyLoss(weight=torch.tensor([0.4,0.35,0.25],device= labels.device)) + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return LongformerTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + global_attentions=outputs.global_attentions, + ) + + +"""#CRF 
trainer""" + +from transformers.trainer import * +from transformers import ( + Trainer, + set_seed, +) + +# from Trainer import * +from transformers.trainer_utils import PredictionOutput +from torch import nn +from torch.utils.data.dataloader import DataLoader +from torch.utils.data.dataset import Dataset +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + + +class CRF_Trainer(Trainer): + def prediction_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> PredictionOutput: + """ + Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. + + Works both with or without labels. + """ + if not isinstance(dataloader.dataset, collections.abc.Sized): + raise ValueError("dataset must implement __len__") + prediction_loss_only = ( + prediction_loss_only + if prediction_loss_only is not None + else self.args.prediction_loss_only + ) + + if self.args.deepspeed and not self.args.do_train: + # no harm, but flagging to the user that deepspeed config is ignored for eval + # flagging only for when --do_train wasn't passed as only then it's redundant + logger.info( + "Detected the deepspeed argument but it will not be used for evaluation" + ) + + model = self._wrap_model(self.model, training=False) + + # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while + # ``train`` is running, half it first and then put on device + if not self.is_in_train and self.args.fp16_full_eval: + model = model.half().to(self.args.device) + + batch_size = dataloader.batch_size + num_examples = self.num_examples(dataloader) + logger.info(f"***** Running {description} *****") + logger.info(f" Num examples = {num_examples}") + logger.info(f" Batch size = {batch_size}") + losses_host: torch.Tensor = None + preds_host: Union[torch.Tensor, List[torch.Tensor]] = None + labels_host: Union[torch.Tensor, List[torch.Tensor]] = None + + world_size = max(1, self.args.world_size) + + eval_losses_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=batch_size + ) + if not prediction_loss_only: + # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass + # a batch size to the sampler) + make_multiple_of = None + if hasattr(dataloader, "sampler") and isinstance( + dataloader.sampler, SequentialDistributedSampler + ): + make_multiple_of = dataloader.sampler.batch_size + preds_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=make_multiple_of + ) + labels_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=make_multiple_of + ) + if self.args.past_index >= 0: + self._past = None + model.eval() + + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader( + dataloader, [self.args.device] + ).per_device_loader(self.args.device) + + self.callback_handler.eval_dataloader = dataloader + + for step, inputs in enumerate(dataloader): + + loss, logits, labels = self.prediction_step( + model, inputs, prediction_loss_only, ignore_keys=ignore_keys + ) + + best_path = self.eval_step(model, logits, inputs["attention_mask"]) + # best_path= self.eval_step(model, logits) + # print(len(best_path), best_path[0]) + # logits= torch.zeros() + + best_path = [x for x, _ in best_path] + # print(best_path) + # seq_len= labels.shape[1] + logits *= 0 + for i, path in enumerate(best_path): + # 
print(inputs['attention_mask'][i,0], labels[i,0], inputs['attention_mask'][i,-1], labels[i,-1]) + # print(len(x)) + for j, tag in enumerate(path): + logits[i, j, int(tag)] = 1 + # print(inputs['attention_mask'][i,j], labels[i,j]) + + # logits= torch.tensor(data=best_path, dtype= labels.dtype, device= labels.device) + # if(logits.shape!=labels.shape): + # print(logits.shape,labels.shape) + # assert logits.shape==labels.shape + if loss is not None: + losses = loss.repeat(batch_size) + losses_host = ( + losses + if losses_host is None + else torch.cat((losses_host, losses), dim=0) + ) + if logits is not None: + preds_host = ( + logits + if preds_host is None + else nested_concat(preds_host, logits, padding_index=-100) + ) + if labels is not None: + labels_host = ( + labels + if labels_host is None + else nested_concat(labels_host, labels, padding_index=-100) + ) + self.control = self.callback_handler.on_prediction_step( + self.args, self.state, self.control + ) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if ( + self.args.eval_accumulation_steps is not None + and (step + 1) % self.args.eval_accumulation_steps == 0 + ): + eval_losses_gatherer.add_arrays( + self._gather_and_numpify(losses_host, "eval_losses") + ) + if not prediction_loss_only: + preds_gatherer.add_arrays( + self._gather_and_numpify(preds_host, "eval_preds") + ) + labels_gatherer.add_arrays( + self._gather_and_numpify(labels_host, "eval_label_ids") + ) + + # Set back to None to begin a new accumulation + losses_host, preds_host, labels_host = None, None, None + + if self.args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + eval_losses_gatherer.add_arrays( + self._gather_and_numpify(losses_host, "eval_losses") + ) + if not prediction_loss_only: + preds_gatherer.add_arrays( + self._gather_and_numpify(preds_host, "eval_preds") + ) + labels_gatherer.add_arrays( + self._gather_and_numpify(labels_host, "eval_label_ids") + ) + + eval_loss = eval_losses_gatherer.finalize() + preds = preds_gatherer.finalize() if not prediction_loss_only else None + label_ids = labels_gatherer.finalize() if not prediction_loss_only else None + + if ( + self.compute_metrics is not None + and preds is not None + and label_ids is not None + ): + metrics = self.compute_metrics( + EvalPrediction(predictions=preds, label_ids=label_ids) + ) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if eval_loss is not None: + metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) + + def eval_step(self, model: nn.Module, logits, mask=None, top_k=None): + with torch.no_grad(): + output = model.crf.viterbi_tags(logits, mask, top_k) + + return output + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. 
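+
+        For the CRF models the labels are intentionally left inside ``inputs``: the model itself returns the
+        negative CRF log-likelihood (``-self.crf(logits, labels, attention_mask)``) as its first output, so no
+        label smoothing or external loss function is applied here.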
+ """ + # if self.label_smoother is not None and "labels" in inputs: + # labels = inputs.pop("labels") + # else: + labels = None + # print(model) + # assert "labels" in inputs + # print(type(inputs),inputs) + outputs = model(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + loss = self.label_smoother(outputs, labels) + else: + # We don't use .loss here since the model may return tuples instead of ModelOutput. + # print(outputs) + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + # print("loss is ", loss) + return (loss, outputs) if return_outputs else loss + + +[4, 5] + [5, 6, 7] + +"""# KP ectraction main code + +## model and data argument +""" + +# run_kpe.py +# all long docu,emt modesl realted to KP +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for token classification. +""" +# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as +# comments. + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric + +import transformers +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + DataCollatorForTokenClassification, + HfArgumentParser, + PreTrainedTokenizerFast, + Trainer, + TrainingArguments, + set_seed, + BertForTokenClassification, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process + + +logger = logging.getLogger(__name__) +# from models.long_doc_kp_models import LONG_DOC_KP_MODELS + +# KPE_MODELS_DICT={ +# 'others': AutoModelForTokenClassification, +# "longformer": +# 'reformer' : +# 'crf_longformer' : +# 'crf_bert': BERT_CRFforTokenClassification +# } + +CRF_MODEL_DICT = { + "bert": BERT_CRFforTokenClassification, + "longformer": Longformer_CRFforTokenClassification, +} +TOKEN_MODEL_DICT = { + "bert": BertForTokenClassification, + "longformer": LongformerForTokenClassification, + "reformer": ReformerForTokenClassification, + # 'bigbird':BigBirdForTokenClassification +} +GUIDED_MODEL_DICT = { + "bert": ModiBertForTokenClassification, + "longformer": ModiLongformerForTokenClassification, +} +MODEL_DICT = { + "crf": CRF_MODEL_DICT, + "simple": TOKEN_MODEL_DICT, + "guided": GUIDED_MODEL_DICT, +} + +# KPE_MODELS_DICT = KPE_MODELS_DICT | LONG_DOC_KP_MODELS + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_family_name: str = field( + metadata={ + "help": "name of the family of model, bert, longformer, reformer etc." 
+ } + ) + model_name_or_path: str = field( + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models" + } + ) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Where do you want to store the pretrained models downloaded from huggingface.co" + }, + ) + model_revision: str = field( + default="main", + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)." + }, + ) + use_CRF: bool = field( + default=False, + metadata={"help": "wether to use CRF on top of the classifier"}, + ) + use_BiLSTM: bool = field( + default=False, + metadata={"help": "use BiLSTM in sequence classification"}, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + task_name: Optional[str] = field( + default="simple", metadata={"help": "The name of the task simple, crf"} + ) + + train_file: Optional[str] = field( + default=None, + metadata={"help": "The input training data file (a csv or JSON file)."}, + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to predict on (a csv or JSON file)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, metadata={"help": "calculate entity level metric"} + ) + overwrite_cache: bool = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + label_all_tokens: bool = field( + default=False, + metadata={ + "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " + "one (in which case the other tokens will have a padding index)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, + metadata={ + "help": "Whether to return all the entity levels during evaluation or just the overall ones." + }, + ) + dataset_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, + ) + dataset_config_name: Optional[str] = field( + default=None, + metadata={ + "help": "The configuration name of the dataset to use (via the datasets library)." + }, + ) + cache_file_name: Optional[str] = field( + default=None, + metadata={ + "help": "Provide the name of a path for the cache file. It is used to store the results of the computation instead of the automatically generated cache file name." 
+        },
+    )
+
+    def __post_init__(self):
+        if (
+            self.dataset_name is None
+            and self.train_file is None
+            and self.validation_file is None
+        ):
+            raise ValueError(
+                "Need either a dataset name or a training/validation file."
+            )
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in [
+                    "csv",
+                    "json",
+                ], "`train_file` should be a csv or a json file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in [
+                    "csv",
+                    "json",
+                ], "`validation_file` should be a csv or a json file."
+        self.task_name = self.task_name.lower()
+
+
+"""## main trainer function"""
+
+TRAINER_DICT = {"crf": CRF_Trainer, "simple": Trainer, "guided": Trainer}
+
+
+def main_run_kpe(model_args, data_args, training_args):
+
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We keep distinct sets of args, for a cleaner separation of concerns.
+
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if (
+        os.path.isdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(
+        logging.INFO if is_main_process(training_args.local_rank) else logging.WARN
+    )
+
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_rank):
+        transformers.utils.logging.set_verbosity_info()
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed before initializing model.
+ set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + ## get dataset in here + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + datasets = load_dataset( + extension, data_files=data_files + ) ##CR get dataset in here + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + if training_args.do_train: + column_names = datasets["train"].column_names + features = datasets["train"].features + else: + column_names = datasets["validation"].column_names + features = datasets["validation"].features + text_column_name = "text" if "text" in column_names else column_names[0] + label_column_name = "BIO_tags" if "BIO_tags" in column_names else column_names[1] + + # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the + # unique labels. + def get_label_list(labels): + unique_labels = set() + for label in labels: + unique_labels = unique_labels | set(label) + label_list = list(unique_labels) + label_list.sort() + return label_list + + if isinstance(features[label_column_name].feature, ClassLabel): + label_list = features[label_column_name].feature.names + # No need to convert the labels since they are already ints. + label_to_id = {i: i for i in range(len(label_list))} + else: + label_list = get_label_list( + datasets["train"][label_column_name] + if training_args.do_train + else datasets["validation"][label_column_name] + ) + label_to_id = {l: i for i, l in enumerate(label_list)} + num_labels = len(label_list) + print(label_to_id) + id2tag = {} + for k in label_to_id.keys(): + id2tag[label_to_id[k]] = k + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
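+    # The concrete model class is resolved by a two-level lookup on the task and
+    # the model family, e.g. (sketch):
+    #   MODEL_DICT["crf"]["bert"]          -> BERT_CRFforTokenClassification
+    #   MODEL_DICT["simple"]["longformer"] -> LongformerForTokenClassification
+    #   MODEL_DICT["guided"]["bert"]       -> ModiBertForTokenClassification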
+ config = AutoConfig.from_pretrained( + model_args.config_name + if model_args.config_name + else model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=model_args.cache_dir, + ) + config.use_CRF = model_args.use_CRF ##CR replace from arguments + config.use_BiLSTM = False + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name + if model_args.tokenizer_name + else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + add_prefix_space=True, + ) + model = MODEL_DICT[data_args.task_name][ + model_args.model_family_name + ].from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=model_args.cache_dir, + ) + # model.freeze_encoder_layer() + print("model") + # print(model) + if tokenizer.pad_token is None: + + tokenizer.pad_token = tokenizer.eos_token + config.pad_token_id = config.eos_token_id + + # Tokenizer check: this script requires a fast tokenizer. + # if not isinstance(tokenizer, PreTrainedTokenizerFast): + # raise ValueError( + # "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + # "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " + # "requirement" + # ) + + # Preprocessing the dataset + # Padding strategy + padding = "max_length" if data_args.pad_to_max_length else False + + # Tokenize all texts and align the labels with them. + def tokenize_and_align_labels(examples): + tokenized_inputs = tokenizer( + examples[text_column_name], + padding=padding, + truncation=True, + # We use this argument because the texts in our dataset are lists of words (with a label for each word). + is_split_into_words=True, + ) + labels = [] + for i, label in enumerate(examples[label_column_name]): + word_ids = tokenized_inputs.word_ids(batch_index=i) + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + # Special tokens have a word id that is None. We set the label to -100 so they are automatically + # ignored in the loss function. + if word_idx is None: + # label_ids.append(-100) + label_ids.append( + 2 + ) # to avoid error change -100 to 'O' tag i.e. 2 class + # We set the label for the first token of each word. + elif word_idx != previous_word_idx: + label_ids.append(label_to_id[label[word_idx]]) + # For the other tokens in a word, we set the label to either the current label or -100, depending on + # the label_all_tokens flag. + else: + label_ids.append( + label_to_id[label[word_idx]] + if data_args.label_all_tokens + else -100 + ) + # to avoid error change -100 to 'O' tag i.e. 
2 class + # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2) + previous_word_idx = word_idx + + labels.append(label_ids) + if data_args.task_name == "guided": + tokenized_inputs["guide_embed"] = examples["guide_embed"] + tokenized_inputs["labels"] = labels + # tokenized_inputs['paper_id']= examples['paper_id'] + # tokenized_inputs['extractive_keyphrases']= examples['extractive_keyphrases'] + + return tokenized_inputs + + tokenized_datasets = datasets.map( + tokenize_and_align_labels, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + # cache_file_name= data_args.cache_file_name + ) + + # Data collator + data_collator = DataCollatorForTokenClassification( + tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None + ) + + from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score + from seqeval.scheme import IOB2, IOB1 + + def compute_metrics(p): + predictions, labels = p + # print(predictions.shape, labels.shape) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + results = {} + # print("cal precisi") + results["overall_precision"] = precision_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_recall"] = recall_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + # print("cal f1") + results["overall_f1"] = f1_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) + if data_args.return_entity_level_metrics: + # Unpack nested dictionaries + final_results = {} + # print("cal entity level mat") + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f"{key}_{n}"] = v + else: + final_results[key] = value + return final_results + else: + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + + # Initialize our Trainer + # metric = load_metric("seqeval") + + # def compute_metrics(p): + # predictions, labels = p + # predictions = np.argmax(predictions, axis=2) + + # # Remove ignored index (special tokens) + # true_predictions = [ + # [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + # for prediction, label in zip(predictions, labels) + # ] + # true_labels = [ + # [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + # for prediction, label in zip(predictions, labels) + # ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + # if data_args.return_entity_level_metrics: + # # Unpack nested dictionaries + # final_results = {} + # for key, value in results.items(): + # if isinstance(value, dict): + # for n, v in value.items(): + # final_results[f"{key}_{n}"] = v + # else: + # final_results[key] = value + # return final_results + # else: + # return { + # "precision": results["overall_precision"], + # "recall": results["overall_recall"], + # 
"f1": results["overall_f1"], + # "accuracy": results["overall_accuracy"], + # } + + trainer = TRAINER_DICT[data_args.task_name]( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"] if training_args.do_train else None, + eval_dataset=tokenized_datasets["validation"] + if training_args.do_eval + else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + # Training + if training_args.do_train: + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif os.path.isdir(model_args.model_name_or_path): + checkpoint = model_args.model_name_or_path + else: + checkpoint = None + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + output_train_file = os.path.join(training_args.output_dir, "train_results.txt") + if trainer.is_world_process_zero(): + with open(output_train_file, "w") as writer: + logger.info("***** Train results *****") + for key, value in sorted(train_result.metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Need to save the state, since Trainer.save_model saves only the tokenizer with the model + trainer.state.save_to_json( + os.path.join(training_args.output_dir, "trainer_state.json") + ) + + # Evaluation + results = {} + # if training_args.do_eval: + + # logger.info("*** Evaluate ***") + + # results = trainer.evaluate() + + # output_eval_file = os.path.join(training_args.output_dir, "eval_results_KPE.txt") + # if trainer.is_world_process_zero(): + # with open(output_eval_file, "w") as writer: + # logger.info("***** Eval results *****") + # for key, value in results.items(): + # logger.info(f" {key} = {value}") + # writer.write(f"{key} = {value}\n") + + # Predict + if training_args.do_predict: + logger.info("*** Predict ***") + + test_dataset = tokenized_datasets["test"] + predictions, labels, metrics = trainer.predict(test_dataset) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + output_test_results_file = os.path.join( + training_args.output_dir, "test_results.txt" + ) + if trainer.is_world_process_zero(): + with open(output_test_results_file, "w") as writer: + for key, value in sorted(metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Save predictions + def get_kp_from_BIO(examples, i): + # kps= [] + # for i in range(len(prediction)): + ids = examples["input_ids"] + # print(examples.keys()) + + # print(tags) + def mmkp(tag_): + current_kps = [] + ckp = [] + prev_tag = None + for j, tag in enumerate(tag_): + id = ids[j] + + if tag == "O" and len(ckp) > 0: + + current_kps.append(ckp) + ckp = [] + elif tag == "B": + # print(ckp, tag) + if ( + tokenizer.convert_ids_to_tokens(id).startswith("##") + or prev_tag == "B" + ): + ckp.append(id) + else: + if len(ckp) > 0: + current_kps.append(ckp) + ckp = [] + + ckp.append(id) + # print(ckp, id) + + elif tag == "I" and len(ckp) > 0: + ckp.append(id) + prev_tag = tag + decoded_kps = [] + if len(ckp) > 0: + current_kps.append(ckp) + if len(current_kps) > 0: + decoded_kps = tokenizer.batch_decode( + current_kps, + 
skip_special_tokens=True, + clean_up_tokenization_spaces=True, + ) + # print(decoded_kps) + return decoded_kps + + tags = true_predictions[i] + decoded_kps = mmkp(tags) + + ttgs = true_labels[i] + eekp = mmkp(ttgs) + + # examples['kp_predicted']= decoded_kps + examples["kp_predicted"] = list(dict.fromkeys(decoded_kps)) + examples["eekp"] = list(dict.fromkeys(eekp)) + # examples['eekp']= eekp + # else: + # examples['kp_predicted']= [''] + examples["id"] = i + return examples + + import pandas as pd + + output_test_predictions_file = os.path.join( + training_args.output_dir, "test_predictions.csv" + ) + output_test_predictions_BIO_file = os.path.join( + training_args.output_dir, "test_predictions_BIO.txt" + ) + if trainer.is_world_process_zero(): + print(test_dataset, len(test_dataset["paper_id"])) + ppid = test_dataset["paper_id"] + # ekp= test_dataset['extractive_keyphrases'] + + test_dataset = test_dataset.map( + get_kp_from_BIO, + num_proc=data_args.preprocessing_num_workers, + with_indices=True, + ) + # input_columns= ['paper_id','input_ids','extractive_keyphrases'] + print(test_dataset, " agian") + df = pd.DataFrame.from_dict( + { + "id": ppid, + "extractive_keyphrase": test_dataset["eekp"], + "keyphrases": test_dataset["kp_predicted"], + } + ) + df.to_csv(output_test_predictions_file, index=False) + + # get BIO tag files + + with open(output_test_predictions_BIO_file, "w") as writer: + for prediction in true_predictions: + writer.write(" ".join(prediction) + "\n") + + return results + + +"""# guided long former""" +# please select one of ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup'] +def longformer_guided_runner(): + training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/long_sml_oakgx_abs_guided", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=2, + per_device_eval_batch_size=16, + gradient_accumulation_steps=4, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=1000, + eval_steps=1000, + # lr_scheduler_type= 'cosine', + warmup_steps=100, + logging_steps=100 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="longformer", + model_name_or_path="allenai/longformer-base-4096", + use_CRF=False, + ) + data_args = DataTrainingArguments( + task_name="guided", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/oagkx/small_abs_guided/sml_train_abs_oagkx.json", + validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/oagkx/small_abs_guided/sml_test_abs_oagkx.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +def longformer_runner(): + training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/longformer_medium_kp20k_try", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=2, + per_device_eval_batch_size=16, + gradient_accumulation_steps=4, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=1000, + eval_steps=1000, + # lr_scheduler_type= 'cosine', + warmup_steps=200, + logging_steps=100 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="longformer", + model_name_or_path="allenai/longformer-base-4096", + use_CRF=False, + ) + data_args = 
DataTrainingArguments( + task_name="simple", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", + validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +# longformer_modi_runner() +import os + + +def longformer_guided_predict(ckpt, val, test, out=None): + if out is None: + out = ckpt + training_args = TrainingArguments( + output_dir=os.path.join(out, "predict/"), # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=3, + per_device_train_batch_size=2, + per_device_eval_batch_size=16, + gradient_accumulation_steps=2, + do_train=False, + do_eval=False, + do_predict=True, + evaluation_strategy="steps", + save_steps=750, + eval_steps=750, + # lr_scheduler_type= 'cosine', + warmup_steps=50, + logging_steps=50 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="longformer", model_name_or_path=ckpt, use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="guided", + # train_file= "/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll_bert_text_rank/train_m_TR_B.json", + validation_file=val, + test_file=test, + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=5, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +def longformer_predict(ckpt, val, test, out=None): + if out is None: + out = ckpt + training_args = TrainingArguments( + output_dir=os.path.join(out, "predict/"), # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=3, + per_device_train_batch_size=1, + per_device_eval_batch_size=8, + gradient_accumulation_steps=1, + do_train=False, + do_eval=False, + do_predict=True, + evaluation_strategy="steps", + save_steps=750, + eval_steps=750, + # lr_scheduler_type= 'cosine', + warmup_steps=50, + logging_steps=50 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="longformer", model_name_or_path=ckpt, use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="simple", + # train_file= "/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll_bert_text_rank/train_m_TR_B.json", + validation_file=val, + test_file=test, + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +"""# guided BERT""" + + +def bert_guided_runner(): + training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/bert_inspec_text_rank_bert", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=4, + per_device_eval_batch_size=16, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=200, + eval_steps=200, + logging_steps=50, + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="bert", model_name_or_path="bert-base-uncased", use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="guided", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/inspec/conll_train_textrank_inspec.json", + 
validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/inspec/conll_valid_textrank_inspec.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=5, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +def bert_guided_predict(ckpt, val, test, out=None): + if out is None: + out = ckpt + training_args = TrainingArguments( + output_dir=os.path.join(out, "predict/"), # todo + learning_rate=3e-5, + overwrite_output_dir=True, + per_device_eval_batch_size=64, + # gradient_accumulation_steps=2, + do_train=False, + do_eval=False, + do_predict=True, + ) + mdl_args = ModelArguments( + model_family_name="bert", model_name_or_path=ckpt, use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="guided", + # train_file= "/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll_bert_text_rank/train_m_TR_B.json", + validation_file=val, + test_file=test, + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=5, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +# bert_modi_runner() + +"""#BERT [for testing PoV] + +## bert plane +""" + + +def bert_runner(): + training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/bert_small_oagkx", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=8, + per_device_eval_batch_size=32, + # gradient_accumulation_steps=2, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=750, + eval_steps=750, + # lr_scheduler_type= 'cosine', + warmup_steps=80, + logging_steps=100 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="bert", model_name_or_path="bert-base-uncased", use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="simple", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/oagkx/conll_small_train_oagkx.json", + validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/oagkx/conll_small_valid_oagkx.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +def bert_predict(ckpt, val, test, out=None): + if out is None: + out = ckpt + training_args = TrainingArguments( + output_dir=os.path.join(out, "predict/"), # todo + learning_rate=3e-5, + overwrite_output_dir=True, + per_device_eval_batch_size=16, + # gradient_accumulation_steps=2, + do_train=False, + do_eval=False, + do_predict=True, + ) + mdl_args = ModelArguments( + model_family_name="bert", model_name_or_path=ckpt, use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="simple", + # train_file= "/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll_bert_text_rank/train_m_TR_B.json", + validation_file=val, + test_file=test, + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=5, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +for i in range(torch.cuda.device_count()): + print("davailabel gpus are") + print(torch.cuda.get_device_name(i)) + + +def bigbird_runner(): + training_args = TrainingArguments( + 
output_dir="/media/nas_mount/Debanjan/amardeep/output/bigbird_medium_kp20k", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=2, + per_device_eval_batch_size=2, + gradient_accumulation_steps=4, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=10, + eval_steps=10, + # lr_scheduler_type= 'cosine', + warmup_steps=200, + logging_steps=100 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="bigbird", + model_name_or_path="google/bigbird-roberta-base", + use_CRF=False, + ) + + data_args = DataTrainingArguments( + task_name="simple", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", + validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +# bigbird_runner() + +# longformer_guided_predict() +# bert_guided_predict() +# bert_modi_runner() +longformer_runner() +# bert_guided_runner() +# bert_crf_runner() +# bert_runner() +# longformer_guided_runner() From 2e04bfe1807b359e5000b06de525885e42f2c9a6 Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Wed, 26 Jan 2022 22:17:55 +0530 Subject: [PATCH 06/17] f formatting and re arch --- dlkp/kp_metrics/metrics.py | 52 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 dlkp/kp_metrics/metrics.py diff --git a/dlkp/kp_metrics/metrics.py b/dlkp/kp_metrics/metrics.py new file mode 100644 index 0000000..73328d2 --- /dev/null +++ b/dlkp/kp_metrics/metrics.py @@ -0,0 +1,52 @@ +from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score +from seqeval.scheme import IOB2, IOB1 + + +def compute_metrics(p): + predictions, labels = p + # print(predictions.shape, labels.shape) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + results = {} + # print("cal precisi") + results["overall_precision"] = precision_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_recall"] = recall_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + # print("cal f1") + results["overall_f1"] = f1_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) + if data_args.return_entity_level_metrics: + # Unpack nested dictionaries + final_results = {} + # print("cal entity level mat") + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f"{key}_{n}"] = v + else: + final_results[key] = value + return final_results + else: + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } From 6c8fda818f9e86dec592da56988e5015529fc1dd Mon Sep 17 00:00:00 2001 From: 
Amardeep Kumar Date: Wed, 26 Jan 2022 22:28:18 +0530 Subject: [PATCH 07/17] f formatting and re arch --- dlkp/models/ke/kpe.py | 74 ++----------------------------------------- 1 file changed, 2 insertions(+), 72 deletions(-) diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 556048b..721d284 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -75,7 +75,6 @@ def main_run_kpe(model_args, data_args, training_args): # See all possible arguments in src/transformers/training_args.py - # parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) # if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): @@ -110,9 +109,7 @@ def main_run_kpe(model_args, data_args, training_args): datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) - logger.setLevel( - logging.INFO if is_main_process(training_args.local_rank) else logging.INFO - ) + logger.setLevel(logging.INFO) # logger.set_global_logging_level(logging.INFO) # Log on each process the small summary: @@ -130,16 +127,6 @@ def main_run_kpe(model_args, data_args, training_args): # Set seed before initializing model. set_seed(training_args.seed) - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - ## get dataset in here if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) @@ -154,12 +141,7 @@ def main_run_kpe(model_args, data_args, training_args): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - datasets = load_dataset( - extension, data_files=data_files - ) ##CR get dataset in here - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. 
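+    # For local files, the call below resolves the dataset builder from the
+    # file extension; a minimal sketch, assuming JSON inputs:
+    #   load_dataset("json", data_files={"train": "train.json", "validation": "test.json"})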
- + datasets = load_dataset(extension, data_files=data_files) if training_args.do_train: column_names = datasets["train"].column_names features = datasets["train"].features @@ -303,58 +285,6 @@ def tokenize_and_align_labels(examples): tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None ) - from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score - from seqeval.scheme import IOB2, IOB1 - - def compute_metrics(p): - predictions, labels = p - # print(predictions.shape, labels.shape) - # if model_args.use_CRF is False: - predictions = np.argmax(predictions, axis=2) - - # Remove ignored index (special tokens) - true_predictions = [ - [label_list[p] for (p, l) in zip(prediction, label) if l != -100] - for prediction, label in zip(predictions, labels) - ] - true_labels = [ - [label_list[l] for (p, l) in zip(prediction, label) if l != -100] - for prediction, label in zip(predictions, labels) - ] - - # results = metric.compute(predictions=true_predictions, references=true_labels) - results = {} - # print("cal precisi") - results["overall_precision"] = precision_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) - results["overall_recall"] = recall_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) - # print("cal f1") - results["overall_f1"] = f1_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) - results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) - if data_args.return_entity_level_metrics: - # Unpack nested dictionaries - final_results = {} - # print("cal entity level mat") - for key, value in results.items(): - if isinstance(value, dict): - for n, v in value.items(): - final_results[f"{key}_{n}"] = v - else: - final_results[key] = value - return final_results - else: - return { - "precision": results["overall_precision"], - "recall": results["overall_recall"], - "f1": results["overall_f1"], - "accuracy": results["overall_accuracy"], - } - # Initialize our Trainer # metric = load_metric("seqeval") From d847f7f014236ad50c4e62077c232af9dea8218e Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Wed, 26 Jan 2022 22:28:27 +0530 Subject: [PATCH 08/17] f formatting and re arch --- dlkp/datasets/pre_process.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dlkp/datasets/pre_process.py diff --git a/dlkp/datasets/pre_process.py b/dlkp/datasets/pre_process.py new file mode 100644 index 0000000..e69de29 From 5d08559abc58dd6a5b3e00c4efd2f55c26753372 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Sat, 5 Feb 2022 01:19:28 +0530 Subject: [PATCH 09/17] added AutoModel for CRf classification ca etc --- dlkp/kp_metrics/metrics.py | 13 ++++++++----- dlkp/models/ke/crf/crf_utils.py | 6 +++++- dlkp/models/ke/extraction_utils.py | 2 +- .../ke/transformer/token_classification_models.py | 3 --- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/dlkp/kp_metrics/metrics.py b/dlkp/kp_metrics/metrics.py index 73328d2..33cc2e0 100644 --- a/dlkp/kp_metrics/metrics.py +++ b/dlkp/kp_metrics/metrics.py @@ -1,20 +1,23 @@ from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score from seqeval.scheme import IOB2, IOB1 +import numpy as np -def compute_metrics(p): - predictions, labels = p +def compute_metrics( + predictions, labels, return_entity_level_metrics=True, ignore_value=-100 +): + # predictions, labels = p # print(predictions.shape, labels.shape) # if model_args.use_CRF is False: predictions = np.argmax(predictions, axis=2) # Remove ignored 
index (special tokens) true_predictions = [ - [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + [p for (p, l) in zip(prediction, label) if l != ignore_value] for prediction, label in zip(predictions, labels) ] true_labels = [ - [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + [l for (p, l) in zip(prediction, label) if l != ignore_value] for prediction, label in zip(predictions, labels) ] @@ -32,7 +35,7 @@ def compute_metrics(p): true_labels, true_predictions, mode="strict", scheme=IOB2 ) results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) - if data_args.return_entity_level_metrics: + if return_entity_level_metrics: # Unpack nested dictionaries final_results = {} # print("cal entity level mat") diff --git a/dlkp/models/ke/crf/crf_utils.py b/dlkp/models/ke/crf/crf_utils.py index 1d1c240..45e3d6b 100644 --- a/dlkp/models/ke/crf/crf_utils.py +++ b/dlkp/models/ke/crf/crf_utils.py @@ -1,15 +1,19 @@ """ Conditional random field utilis file """ -from typing import List, Tuple, Dict, Union +from typing import List, Tuple, Dict, Union, Optional import torch +import math +import logging # from allennlp.common.checks import ConfigurationError # import allennlp.nn.util as util VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score +logger = logging.get_logger(__name__) + def allowed_transitions( constraint_type: str, labels: Dict[int, str] diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py index 7f34719..7d03539 100644 --- a/dlkp/models/ke/extraction_utils.py +++ b/dlkp/models/ke/extraction_utils.py @@ -49,7 +49,7 @@ class ModelArguments: default=False, metadata={"help": "wether to use CRF on top of the classifier"}, ) - use_BiLSTM: bool = field( + use_BiLSTM: bool = field( # not necessary default=False, metadata={"help": "use BiLSTM in sequence classification"}, ) diff --git a/dlkp/models/ke/transformer/token_classification_models.py b/dlkp/models/ke/transformer/token_classification_models.py index d3b628c..d286ac1 100644 --- a/dlkp/models/ke/transformer/token_classification_models.py +++ b/dlkp/models/ke/transformer/token_classification_models.py @@ -89,6 +89,3 @@ def forward( # if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output - - -s From 4492c25f2bfb544ee583ff354fa87ca7436d4e7e Mon Sep 17 00:00:00 2001 From: Amardeep Date: Sat, 5 Feb 2022 01:19:56 +0530 Subject: [PATCH 10/17] added AutoModel for CRf classification --- dlkp/models/ke/transformer/crf_models.py | 71 ++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index da26337..0d8624a 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -13,6 +13,77 @@ from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel +class AutoCRFforTokenClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bert = AutoModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + # self.crf= nn.Linear(config.num_labels,1) + # self.crf= ConditionalRandomField(self.num_labels) + self.crf = ConditionalRandomField( + self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "0"} + ) + self.init_weights() + + def forward( + self, + 
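+        # NOTE: token_type_ids is not accepted in this signature, so segment
+        # embeddings are left at their defaults for encoders that use them.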
input_ids=None, + position_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + loss = None + if labels is not None: + loss = -self.crf(logits, labels, attention_mask) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + # print(self.crf.transitions) + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def freeze_till_clf(self): + for param in self.bert.parameters(): + param.requires_grad = False + for param in self.dropout.parameters(): + param.requires_grad = False + for param in self.classifier.parameters(): + param.requires_grad = False + + def freeze_encoder_layer(self): + for param in self.bert.parameters(): + param.requires_grad = False + + class BERT_CRFforTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) From 5fbc15ad369ab8244c2bc9d4f516f4e3e04eb6d2 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Sat, 5 Feb 2022 15:39:49 +0530 Subject: [PATCH 11/17] add runner --- dlkp/models/ke/extraction_utils.py | 19 +++---- dlkp/models/ke/kpe.py | 87 ++++++++++-------------------- examples/ke/run_auto_token_ke.py | 38 +++++++++++++ 3 files changed, 77 insertions(+), 67 deletions(-) create mode 100644 examples/ke/run_auto_token_ke.py diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py index 7d03539..cf913be 100644 --- a/dlkp/models/ke/extraction_utils.py +++ b/dlkp/models/ke/extraction_utils.py @@ -11,16 +11,17 @@ class ModelArguments: Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. """ - model_family_name: str = field( - metadata={ - "help": "name of the family of model, bert, longformer, reformer etc." - } - ) model_name_or_path: str = field( metadata={ "help": "Path to pretrained model or model identifier from huggingface.co/models" } ) + model_family_name: str = field( + default="auto", + metadata={ + "help": "name of the family of model, bert, longformer, reformer etc." + }, + ) config_name: Optional[str] = field( default=None, metadata={ @@ -62,7 +63,7 @@ class DataTrainingArguments: """ task_name: Optional[str] = field( - default="simple", metadata={"help": "The name of the task simple, crf"} + default="token", metadata={"help": "The name of the task token, crf"} ) train_file: Optional[str] = field( @@ -98,7 +99,7 @@ class DataTrainingArguments: }, ) label_all_tokens: bool = field( - default=False, + default=True, metadata={ "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " "one (in which case the other tokens will have a padding index)." 
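        # e.g. with label_all_tokens=True a word tokenized into ["key", "##phrase"]
        # gets a label on both pieces; with False the trailing piece is set to -100
        # and ignored by the loss.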
@@ -111,11 +112,11 @@ class DataTrainingArguments: }, ) dataset_name: Optional[str] = field( - default=None, + default="midas/inspec", metadata={"help": "The name of the dataset to use (via the datasets library)."}, ) dataset_config_name: Optional[str] = field( - default=None, + default="extraction", metadata={ "help": "The configuration name of the dataset to use (via the datasets library)." }, diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 721d284..643d3c2 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -45,16 +45,21 @@ ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformer.crf_models import BERT_CRFforTokenClassification +from transformer.crf_models import ( + BERT_CRFforTokenClassification, + AutoCRFforTokenClassification, +) from transformer.token_classification_models import LongformerForTokenClassification from crf.crf_trainer import CRF_Trainer from extraction_utils import ModelArguments, DataTrainingArguments +from kp_metrics.metrics import compute_metrics logger = logging.getLogger(__name__) CRF_MODEL_DICT = { "bert": BERT_CRFforTokenClassification, + "auto": AutoCRFforTokenClassification, # "longformer": Longformer_CRFforTokenClassification, } TOKEN_MODEL_DICT = { @@ -63,16 +68,16 @@ # "reformer": ReformerForTokenClassification, } -MODEL_DICT = {"crf": CRF_MODEL_DICT, "simple": TOKEN_MODEL_DICT} +MODEL_DICT = {"crf": CRF_MODEL_DICT, "token": TOKEN_MODEL_DICT} TRAINER_DICT = { "crf": CRF_Trainer, - "simple": Trainer, + "token": Trainer, } -def main_run_kpe(model_args, data_args, training_args): +def run_kpe(model_args, data_args, training_args): # See all possible arguments in src/transformers/training_args.py @@ -148,11 +153,13 @@ def main_run_kpe(model_args, data_args, training_args): else: column_names = datasets["validation"].column_names features = datasets["validation"].features - text_column_name = "text" if "text" in column_names else column_names[0] - label_column_name = "BIO_tags" if "BIO_tags" in column_names else column_names[1] + text_column_name = ( + "document" if "document" in column_names else column_names[1] + ) # either document or 2nd column as text i/p + label_column_name = ( + "doc_bio_tags" if "doc_bio_tags" in column_names else column_names[2] + ) # either doc_bio_tags column should be available or 3 rd columns will be considered as tag - # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the - # unique labels. 
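    # get_label_list() below makes a single pass over every tag sequence and
    # returns a sorted, de-duplicated label inventory, e.g. ["B", "I", "O"]
    # for plain BIO data.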
def get_label_list(labels): unique_labels = set() for label in labels: @@ -173,7 +180,7 @@ def get_label_list(labels): ) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) - print(label_to_id) + print("label to id", label_to_id) id2tag = {} for k in label_to_id.keys(): id2tag[label_to_id[k]] = k @@ -190,7 +197,7 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, ) config.use_CRF = model_args.use_CRF ##CR replace from arguments - config.use_BiLSTM = False + config.use_BiLSTM = model_args.use_BiLSTM tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name @@ -210,7 +217,6 @@ def get_label_list(labels): print("model") # print(model) if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token config.pad_token_id = config.eos_token_id @@ -286,40 +292,6 @@ def tokenize_and_align_labels(examples): ) # Initialize our Trainer - # metric = load_metric("seqeval") - - # def compute_metrics(p): - # predictions, labels = p - # predictions = np.argmax(predictions, axis=2) - - # # Remove ignored index (special tokens) - # true_predictions = [ - # [label_list[p] for (p, l) in zip(prediction, label) if l != -100] - # for prediction, label in zip(predictions, labels) - # ] - # true_labels = [ - # [label_list[l] for (p, l) in zip(prediction, label) if l != -100] - # for prediction, label in zip(predictions, labels) - # ] - - # results = metric.compute(predictions=true_predictions, references=true_labels) - # if data_args.return_entity_level_metrics: - # # Unpack nested dictionaries - # final_results = {} - # for key, value in results.items(): - # if isinstance(value, dict): - # for n, v in value.items(): - # final_results[f"{key}_{n}"] = v - # else: - # final_results[key] = value - # return final_results - # else: - # return { - # "precision": results["overall_precision"], - # "recall": results["overall_recall"], - # "f1": results["overall_f1"], - # "accuracy": results["overall_accuracy"], - # } trainer = TRAINER_DICT[data_args.task_name]( model=model, @@ -359,19 +331,18 @@ def tokenize_and_align_labels(examples): # Evaluation results = {} - # if training_args.do_eval: - - # logger.info("*** Evaluate ***") - - # results = trainer.evaluate() - - # output_eval_file = os.path.join(training_args.output_dir, "eval_results_KPE.txt") - # if trainer.is_world_process_zero(): - # with open(output_eval_file, "w") as writer: - # logger.info("***** Eval results *****") - # for key, value in results.items(): - # logger.info(f" {key} = {value}") - # writer.write(f"{key} = {value}\n") + if training_args.do_eval: + logger.info("*** Evaluate ***") + results = trainer.evaluate() + output_eval_file = os.path.join( + training_args.output_dir, "eval_results_KPE.txt" + ) + if trainer.is_world_process_zero(): + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key, value in results.items(): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") # Predict if training_args.do_predict: diff --git a/examples/ke/run_auto_token_ke.py b/examples/ke/run_auto_token_ke.py new file mode 100644 index 0000000..f84ff7e --- /dev/null +++ b/examples/ke/run_auto_token_ke.py @@ -0,0 +1,38 @@ +from dlkp.models.ke.kpe import run_kpe, TrainingArguments +from dlkp.models.ke.extraction_utils import DataTrainingArguments, ModelArguments + +training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/longformer_medium_kp20k_try", # todo + learning_rate=3e-5, + 
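+    # 3e-5 sits in the usual 1e-5 to 5e-5 fine-tuning range for transformer
+    # encoders; with evaluation_strategy="steps", the eval_steps/save_steps
+    # values below set the evaluation and checkpoint cadence.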
overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=2, + per_device_eval_batch_size=16, + gradient_accumulation_steps=4, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=1000, + eval_steps=1000, + # lr_scheduler_type= 'cosine', + warmup_steps=200, + logging_steps=100 + # weight_decay =0.001 +) +mdl_args = ModelArguments( + model_family_name="auto", + model_name_or_path="roberta-base", + use_CRF=False +) +data_args = DataTrainingArguments( + task_name="token", + # train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", + # validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", + dataset_name='midas/inspec', + dataset_config_name='extraction' + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, +) From 9adf81e916b6a07fa04274a618af64c4f6f2e29a Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sat, 5 Feb 2022 17:16:54 +0530 Subject: [PATCH 12/17] bug fixing for KPE --- dlkp/kp_metrics/metrics.py | 27 +++++++++---------- dlkp/models/ke/crf/crf.py | 2 +- dlkp/models/ke/crf/crf_utils.py | 2 +- dlkp/models/ke/kpe.py | 20 +++++++------- dlkp/models/ke/transformer/crf_models.py | 2 +- fre_usec_cmd.txt | 2 ++ ...n_auto_token_ke.py => run_auto_token_ke.py | 25 ++++++++--------- 7 files changed, 42 insertions(+), 38 deletions(-) create mode 100644 fre_usec_cmd.txt rename examples/ke/run_auto_token_ke.py => run_auto_token_ke.py (63%) diff --git a/dlkp/kp_metrics/metrics.py b/dlkp/kp_metrics/metrics.py index 33cc2e0..5901ec4 100644 --- a/dlkp/kp_metrics/metrics.py +++ b/dlkp/kp_metrics/metrics.py @@ -3,37 +3,36 @@ import numpy as np -def compute_metrics( - predictions, labels, return_entity_level_metrics=True, ignore_value=-100 -): - # predictions, labels = p - # print(predictions.shape, labels.shape) +def compute_metrics(p): + return_entity_level_metrics = False + ignore_value = -100 + predictions, labels = p + label_to_id = {"B": 0, "I": 1, "O": 2} + id_to_label = ["B", "I", "O"] # if model_args.use_CRF is False: predictions = np.argmax(predictions, axis=2) + # print(predictions.shape, labels.shape) # Remove ignored index (special tokens) true_predictions = [ - [p for (p, l) in zip(prediction, label) if l != ignore_value] + [id_to_label[p] for (p, l) in zip(prediction, label) if l != ignore_value] for prediction, label in zip(predictions, labels) ] true_labels = [ - [l for (p, l) in zip(prediction, label) if l != ignore_value] + [id_to_label[l] for (p, l) in zip(prediction, label) if l != ignore_value] for prediction, label in zip(predictions, labels) ] # results = metric.compute(predictions=true_predictions, references=true_labels) results = {} # print("cal precisi") + # mode="strict" results["overall_precision"] = precision_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) - results["overall_recall"] = recall_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 + true_labels, true_predictions, scheme=IOB2 ) + results["overall_recall"] = recall_score(true_labels, true_predictions, scheme=IOB2) # print("cal f1") - results["overall_f1"] = f1_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) + results["overall_f1"] = f1_score(true_labels, true_predictions, scheme=IOB2) results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) if return_entity_level_metrics: # Unpack nested dictionaries diff --git 
a/dlkp/models/ke/crf/crf.py b/dlkp/models/ke/crf/crf.py index 2ab2181..27786b5 100644 --- a/dlkp/models/ke/crf/crf.py +++ b/dlkp/models/ke/crf/crf.py @@ -1,6 +1,6 @@ # add models having crf classification layer with option of bilstm layers -from crf_utils import * +from .crf_utils import * from typing import List, Tuple, Dict, Union import torch diff --git a/dlkp/models/ke/crf/crf_utils.py b/dlkp/models/ke/crf/crf_utils.py index 45e3d6b..e9e3818 100644 --- a/dlkp/models/ke/crf/crf_utils.py +++ b/dlkp/models/ke/crf/crf_utils.py @@ -12,7 +12,7 @@ VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score -logger = logging.get_logger(__name__) +# logger = logging.get_logger(__name__) def allowed_transitions( diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 643d3c2..a137b28 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -45,14 +45,17 @@ ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformer.crf_models import ( +from dlkp.models.ke.transformer.crf_models import ( BERT_CRFforTokenClassification, AutoCRFforTokenClassification, ) -from transformer.token_classification_models import LongformerForTokenClassification -from crf.crf_trainer import CRF_Trainer -from extraction_utils import ModelArguments, DataTrainingArguments -from kp_metrics.metrics import compute_metrics +from dlkp.models.ke.transformer.token_classification_models import ( + LongformerForTokenClassification, +) +from dlkp.models.ke.crf.crf_trainer import CRF_Trainer + +# from extraction_utils import ModelArguments, DataTrainingArguments +from dlkp.kp_metrics.metrics import compute_metrics logger = logging.getLogger(__name__) @@ -64,6 +67,7 @@ } TOKEN_MODEL_DICT = { "bert": BertForTokenClassification, + "auto": AutoModelForTokenClassification # "longformer": LongformerForTokenClassification, # "reformer": ReformerForTokenClassification, } @@ -250,10 +254,8 @@ def tokenize_and_align_labels(examples): # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: - # label_ids.append(-100) - label_ids.append( - 2 - ) # to avoid error change -100 to 'O' tag i.e. 2 class + label_ids.append(-100) + # label_ids.append(2) # to avoid error change -100 to 'O' tag i.e. 2 class # We set the label for the first token of each word. 
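            # Worked example for tokens [CLS, "deep", "learn", "##ing", SEP]:
            #   word_ids  -> [None, 0, 1, 1, None]
            #   label_ids -> [-100, id(tag_0), id(tag_1), id(tag_1) or -100, -100]
            # (specials get -100; trailing sub-tokens depend on label_all_tokens)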
elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index 0d8624a..db8c4d6 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -9,7 +9,7 @@ ) from transformers.modeling_outputs import TokenClassifierOutput import collections -from crf import ConditionalRandomField +from dlkp.models.ke.crf.crf import ConditionalRandomField from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel diff --git a/fre_usec_cmd.txt b/fre_usec_cmd.txt new file mode 100644 index 0000000..249a440 --- /dev/null +++ b/fre_usec_cmd.txt @@ -0,0 +1,2 @@ +source ../.dlkp_venv/bin/activate +conda deactivate \ No newline at end of file diff --git a/examples/ke/run_auto_token_ke.py b/run_auto_token_ke.py similarity index 63% rename from examples/ke/run_auto_token_ke.py rename to run_auto_token_ke.py index f84ff7e..cb06a20 100644 --- a/examples/ke/run_auto_token_ke.py +++ b/run_auto_token_ke.py @@ -1,38 +1,39 @@ +from statistics import mode from dlkp.models.ke.kpe import run_kpe, TrainingArguments from dlkp.models.ke.extraction_utils import DataTrainingArguments, ModelArguments training_args = TrainingArguments( - output_dir="/media/nas_mount/Debanjan/amardeep/output/longformer_medium_kp20k_try", # todo + output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_debug", # todo learning_rate=3e-5, overwrite_output_dir=True, - num_train_epochs=4, - per_device_train_batch_size=2, + num_train_epochs=5, + per_device_train_batch_size=4, per_device_eval_batch_size=16, - gradient_accumulation_steps=4, + # gradient_accumulation_steps=4, do_train=True, do_eval=True, evaluation_strategy="steps", save_steps=1000, - eval_steps=1000, + eval_steps=100, # lr_scheduler_type= 'cosine', - warmup_steps=200, + # warmup_steps=200, logging_steps=100 # weight_decay =0.001 ) mdl_args = ModelArguments( - model_family_name="auto", - model_name_or_path="roberta-base", - use_CRF=False + model_family_name="auto", model_name_or_path="roberta-base", use_CRF=False ) data_args = DataTrainingArguments( task_name="token", # train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", # validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", - dataset_name='midas/inspec', - dataset_config_name='extraction' + dataset_name="midas/inspec", + dataset_config_name="extraction", pad_to_max_length=True, overwrite_cache=True, label_all_tokens=True, preprocessing_num_workers=8, - return_entity_level_metrics=True, + # return_entity_level_metrics=True, ) + +run_kpe(mdl_args, data_args, training_args) From c022b7d70c1605559081cd6d94f04b24a19e546f Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sat, 5 Feb 2022 20:30:02 +0530 Subject: [PATCH 13/17] crf bug fixes --- dlkp/models/ke/kpe.py | 1 + dlkp/models/ke/transformer/crf_models.py | 4 ++-- fre_usec_cmd.txt | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index a137b28..06dbaeb 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -183,6 +183,7 @@ def get_label_list(labels): else datasets["validation"][label_column_name] ) label_to_id = {l: i for i, l in enumerate(label_list)} + label_to_id = {"B": 0, "I": 1, "O": 2} num_labels = len(label_list) print("label to id", label_to_id) id2tag = {} diff --git a/dlkp/models/ke/transformer/crf_models.py 
b/dlkp/models/ke/transformer/crf_models.py index db8c4d6..b0c4c68 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -17,7 +17,7 @@ class AutoCRFforTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.bert = AutoModel(config) + self.base_model = AutoModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) # self.crf= nn.Linear(config.num_labels,1) @@ -43,7 +43,7 @@ def forward( return_dict if return_dict is not None else self.config.use_return_dict ) - outputs = self.bert( + outputs = self.base_model( input_ids, position_ids=position_ids, attention_mask=attention_mask, diff --git a/fre_usec_cmd.txt b/fre_usec_cmd.txt index 249a440..21665ff 100644 --- a/fre_usec_cmd.txt +++ b/fre_usec_cmd.txt @@ -1,2 +1,3 @@ source ../.dlkp_venv/bin/activate -conda deactivate \ No newline at end of file +conda deactivate +CUDA_VISIBLE_DEVICES=1 python run_auto_token_ke.py \ No newline at end of file From d72e67fd679ce70bf7e65a5811a8567da4b29863 Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sun, 6 Feb 2022 23:17:45 +0530 Subject: [PATCH 14/17] fixed crf bugs --- dlkp/models/ke/crf/crf_trainer.py | 22 ++++++++------- dlkp/models/ke/transformer/crf_models.py | 35 ++++++++++++++---------- run_auto_token_ke.py | 10 +++---- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/dlkp/models/ke/crf/crf_trainer.py b/dlkp/models/ke/crf/crf_trainer.py index 5db3178..073f29e 100644 --- a/dlkp/models/ke/crf/crf_trainer.py +++ b/dlkp/models/ke/crf/crf_trainer.py @@ -208,24 +208,26 @@ def compute_loss(self, model, inputs, return_outputs=False): Subclass and override for custom behavior. """ - # if self.label_smoother is not None and "labels" in inputs: + # labels = inputs.pop("labels") + # assert "labels" in inputs + # if "labels" in inputs: # labels = inputs.pop("labels") # else: - labels = None + # labels = None # print(model) - # assert "labels" in inputs - # print(type(inputs),inputs) + # print(type(inputs), inputs.keys()) outputs = model(**inputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. if self.args.past_index >= 0: self._past = outputs[self.args.past_index] - if labels is not None: - loss = self.label_smoother(outputs, labels) - else: - # We don't use .loss here since the model may return tuples instead of ModelOutput. - # print(outputs) - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + # if labels is not None: + # # print("labels is not None") + # loss = self.label_smoother(outputs, labels) + # else: + # # We don't use .loss here since the model may return tuples instead of ModelOutput. 
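+        # For the CRF models, outputs["loss"] (or outputs[0]) is already the
+        # negative log-likelihood computed in forward() as
+        # loss = -self.crf(logits, labels, attention_mask), so no label
+        # smoothing is applied on top of it.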
+ # print(outputs.keys()) + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # print("loss is ", loss) return (loss, outputs) if return_outputs else loss diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index b0c4c68..5d2d4db 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -6,6 +6,8 @@ BertPreTrainedModel, LongformerModel, PreTrainedModel, + AutoModelForTokenClassification, + # PretrainedModel, ) from transformers.modeling_outputs import TokenClassifierOutput import collections @@ -13,17 +15,20 @@ from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel -class AutoCRFforTokenClassification(BertPreTrainedModel): +class AutoCRFforTokenClassification(AutoModelForTokenClassification): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.base_model = AutoModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) + # self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) # self.crf= nn.Linear(config.num_labels,1) # self.crf= ConditionalRandomField(self.num_labels) self.crf = ConditionalRandomField( - self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "0"} + self.num_labels, + label_encoding="BIO", + idx2tag={0: "B", 1: "I", 2: "0"}, + include_start_end_transitions=False, ) self.init_weights() @@ -89,7 +94,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) + # self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) # self.crf= nn.Linear(config.num_labels,1) # self.crf= ConditionalRandomField(self.num_labels) @@ -142,17 +147,17 @@ def forward( attentions=outputs.attentions, ) - def freeze_till_clf(self): - for param in self.bert.parameters(): - param.requires_grad = False - for param in self.dropout.parameters(): - param.requires_grad = False - for param in self.classifier.parameters(): - param.requires_grad = False - - def freeze_encoder_layer(self): - for param in self.bert.parameters(): - param.requires_grad = False + # def freeze_till_clf(self): + # for param in self.bert.parameters(): + # param.requires_grad = False + # for param in self.dropout.parameters(): + # param.requires_grad = False + # for param in self.classifier.parameters(): + # param.requires_grad = False + + # def freeze_encoder_layer(self): + # for param in self.bert.parameters(): + # param.requires_grad = False class Longformer_CRFforTokenClassification(LongformerPreTrainedModel): diff --git a/run_auto_token_ke.py b/run_auto_token_ke.py index cb06a20..7251c75 100644 --- a/run_auto_token_ke.py +++ b/run_auto_token_ke.py @@ -3,12 +3,12 @@ from dlkp.models.ke.extraction_utils import DataTrainingArguments, ModelArguments training_args = TrainingArguments( - output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_debug", # todo + output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_crf_debug", # todo learning_rate=3e-5, overwrite_output_dir=True, num_train_epochs=5, - per_device_train_batch_size=4, - per_device_eval_batch_size=16, + per_device_train_batch_size=2, + per_device_eval_batch_size=2, # gradient_accumulation_steps=4, do_train=True, do_eval=True, @@ -21,10 +21,10 @@ # weight_decay =0.001 ) mdl_args = ModelArguments( - 
model_family_name="auto", model_name_or_path="roberta-base", use_CRF=False + model_family_name="auto", model_name_or_path="roberta-base", use_CRF=True ) data_args = DataTrainingArguments( - task_name="token", + task_name="crf", # train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", # validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", dataset_name="midas/inspec", From f1b8dc933a0855a7bc11312d23d7ded12e2ea2b4 Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sun, 6 Feb 2022 23:43:49 +0530 Subject: [PATCH 15/17] reformatting and refactoring --- dlkp/models/ke/extraction_utils.py | 1 + dlkp/models/ke/kpe.py | 9 ++++++--- dlkp/models/ke/transformer/crf_models.py | 2 -- run_auto_token_ke.py => run_auto_ke.py | 21 ++++++++++++--------- 4 files changed, 19 insertions(+), 14 deletions(-) rename run_auto_token_ke.py => run_auto_ke.py (70%) diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py index cf913be..0901e0b 100644 --- a/dlkp/models/ke/extraction_utils.py +++ b/dlkp/models/ke/extraction_utils.py @@ -3,6 +3,7 @@ import sys from dataclasses import dataclass, field from typing import Optional +from transformers import TrainingArguments @dataclass diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 06dbaeb..a2078a9 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -211,9 +211,12 @@ def get_label_list(labels): use_fast=True, add_prefix_space=True, ) - model = MODEL_DICT[data_args.task_name][ - model_args.model_family_name - ].from_pretrained( + model = ( + AutoCRFforTokenClassification + if model_args.use_CRF + else AutoModelForTokenClassification + ) + model = model.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index 5d2d4db..7cf7810 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -22,8 +22,6 @@ def __init__(self, config): self.base_model = AutoModel(config) # self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - # self.crf= nn.Linear(config.num_labels,1) - # self.crf= ConditionalRandomField(self.num_labels) self.crf = ConditionalRandomField( self.num_labels, label_encoding="BIO", diff --git a/run_auto_token_ke.py b/run_auto_ke.py similarity index 70% rename from run_auto_token_ke.py rename to run_auto_ke.py index 7251c75..84ce2b5 100644 --- a/run_auto_token_ke.py +++ b/run_auto_ke.py @@ -1,14 +1,18 @@ from statistics import mode -from dlkp.models.ke.kpe import run_kpe, TrainingArguments -from dlkp.models.ke.extraction_utils import DataTrainingArguments, ModelArguments +from dlkp.models.ke.kpe import run_kpe +from dlkp.models.ke.extraction_utils import ( + DataTrainingArguments, + ModelArguments, + TrainingArguments, +) training_args = TrainingArguments( output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_crf_debug", # todo learning_rate=3e-5, overwrite_output_dir=True, num_train_epochs=5, - per_device_train_batch_size=2, - per_device_eval_batch_size=2, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, # gradient_accumulation_steps=4, do_train=True, do_eval=True, @@ -20,11 +24,8 @@ logging_steps=100 # weight_decay =0.001 ) -mdl_args = ModelArguments( - model_family_name="auto", model_name_or_path="roberta-base", use_CRF=True -) +model_args = 
ModelArguments(model_name_or_path="roberta-base", use_CRF=True) data_args = DataTrainingArguments( - task_name="crf", # train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", # validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", dataset_name="midas/inspec", @@ -36,4 +37,6 @@ # return_entity_level_metrics=True, ) -run_kpe(mdl_args, data_args, training_args) +run_kpe(model_args, data_args, training_args) + +# CUDA_VISIBLE_DEVICES=0 python run_auto_ke.py From e19ae34df8fdbed4f7b8c7ae1a0fd258c0ceabe9 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Thu, 10 Feb 2022 12:29:25 +0530 Subject: [PATCH 16/17] add relative path --- dlkp/models/ke/kpe.py | 8 ++++---- dlkp/models/ke/transformer/crf_models.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index a2078a9..996bfd3 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -45,17 +45,17 @@ ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from dlkp.models.ke.transformer.crf_models import ( +from .transformer.crf_models import ( BERT_CRFforTokenClassification, AutoCRFforTokenClassification, ) -from dlkp.models.ke.transformer.token_classification_models import ( +from .transformer.token_classification_models import ( LongformerForTokenClassification, ) -from dlkp.models.ke.crf.crf_trainer import CRF_Trainer +from .crf.crf_trainer import CRF_Trainer # from extraction_utils import ModelArguments, DataTrainingArguments -from dlkp.kp_metrics.metrics import compute_metrics +from ...kp_metrics.metrics import compute_metrics logger = logging.getLogger(__name__) diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index 7cf7810..75f2fff 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -11,8 +11,8 @@ ) from transformers.modeling_outputs import TokenClassifierOutput import collections -from dlkp.models.ke.crf.crf import ConditionalRandomField from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel +from ..crf.crf import ConditionalRandomField class AutoCRFforTokenClassification(AutoModelForTokenClassification): From 7eda3011be9accca0c61546a707d2d33bfef9729 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Fri, 11 Feb 2022 19:44:00 +0530 Subject: [PATCH 17/17] add setup.py for package --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8441906..f9ab705 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,8 @@ description="A deep learning library for keyphrase extraction and generation", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", - author="Debanjan Mahata", - author_email="debanjanmahata85@gmail.com", + author="Amardeep Kumar || Debanjan Mahata", + author_email="Kumaramardipsingh@gmail.com || debanjanmahata85@gmail.com", url="https://github.com/midas-research/dlkp", packages=find_packages(exclude="tests"), # same as name license="Apache License Version 2.0",
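For reference, a minimal standalone sketch of the CRF layer these patches wire in. The constructor and forward signatures follow the crf.py code in the patches above; the tag mapping assumes the conventional BIO inventory {0: "B", 1: "I", 2: "O"}, and the tensors are illustrative random data:

import torch
from dlkp.models.ke.crf.crf import ConditionalRandomField

crf = ConditionalRandomField(
    num_tags=3,
    label_encoding="BIO",
    idx2tag={0: "B", 1: "I", 2: "O"},
    include_start_end_transitions=False,
)
logits = torch.randn(2, 6, 3)              # (batch, seq_len, num_tags) emission scores
tags = torch.randint(0, 3, (2, 6))         # gold BIO tag ids
mask = torch.ones(2, 6, dtype=torch.bool)  # True at valid (non-padding) positions
nll = -crf(logits, tags, mask)             # training loss, as in the model classes above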