From 0037308174a5a7e8eeb117e7beb4f7eaa875e3c9 Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sat, 22 Jan 2022 13:29:17 +0530 Subject: [PATCH 01/17] add KPT code --- dlkp/models/ke/crf/crf.py | 283 ++++++++++++++ dlkp/models/ke/crf/crf_trainer.py | 189 ++++++++++ dlkp/models/ke/crf/crf_utils.py | 346 ++++++++++++++++++ dlkp/models/ke/extraction_utils.py | 0 dlkp/models/ke/kpe.py | 0 dlkp/models/ke/transformer/crf_models.py | 0 .../token_classification_models.py | 0 7 files changed, 818 insertions(+) create mode 100644 dlkp/models/ke/crf/crf.py create mode 100644 dlkp/models/ke/crf/crf_trainer.py create mode 100644 dlkp/models/ke/crf/crf_utils.py create mode 100644 dlkp/models/ke/extraction_utils.py create mode 100644 dlkp/models/ke/kpe.py create mode 100644 dlkp/models/ke/transformer/crf_models.py create mode 100644 dlkp/models/ke/transformer/token_classification_models.py diff --git a/dlkp/models/ke/crf/crf.py b/dlkp/models/ke/crf/crf.py new file mode 100644 index 0000000..8d5bd30 --- /dev/null +++ b/dlkp/models/ke/crf/crf.py @@ -0,0 +1,283 @@ +# add models having crf classification layer with option of bilstm layers + +from crf_utils import * +from typing import List, Tuple, Dict, Union + +import torch + +VITERBI_DECODING = Tuple[List[int], float] + +class ConditionalRandomField(torch.nn.Module): + """ + This module uses the "forward-backward" algorithm to compute + the log-likelihood of its inputs assuming a conditional random field model. + See, e.g. http://www.cs.columbia.edu/~mcollins/fb.pdf + # Parameters + num_tags : `int`, required + The number of tags. + constraints : `List[Tuple[int, int]]`, optional (default = `None`) + An optional list of allowed transitions (from_tag_id, to_tag_id). + These are applied to `viterbi_tags()` but do not affect `forward()`. + These should be derived from `allowed_transitions` so that the + start and end transitions are handled correctly for your tag type. + include_start_end_transitions : `bool`, optional (default = `True`) + Whether to include the start and end transition parameters. + """ + + + def __init__( + self, + num_tags: int, + label_encoding, + idx2tag, + include_start_end_transitions: bool = True, + ) -> None: + super().__init__() + self.num_tags = num_tags + constraints = allowed_transitions(label_encoding, idx2tag) + # transitions[i, j] is the logit for transitioning from state i to state j. + self.transitions = torch.nn.Parameter(torch.Tensor(num_tags, num_tags)) + + # _constraint_mask indicates valid transitions (based on supplied constraints). + # Include special start of sequence (num_tags + 1) and end of sequence tags (num_tags + 2) + if constraints is None: + # All transitions are valid. + constraint_mask = torch.Tensor(num_tags + 2, num_tags + 2).fill_(1.0) + else: + constraint_mask = torch.Tensor(num_tags + 2, num_tags + 2).fill_(0.0) + for i, j in constraints: + constraint_mask[i, j] = 1.0 + + self._constraint_mask = torch.nn.Parameter(constraint_mask, requires_grad=False) + + # Also need logits for transitioning from "start" state and to "end" state. 
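+        # Note (clarifying comment): throughout this class the augmented tag space
+        # indexes the synthetic START tag as num_tags and the synthetic END tag as
+        # num_tags + 1, which is why _constraint_mask above is sized
+        # (num_tags + 2, num_tags + 2).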
+        self.include_start_end_transitions = include_start_end_transitions
+        if include_start_end_transitions:
+            self.start_transitions = torch.nn.Parameter(torch.Tensor(num_tags))
+            self.end_transitions = torch.nn.Parameter(torch.Tensor(num_tags))
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.xavier_normal_(self.transitions)
+        if self.include_start_end_transitions:
+            torch.nn.init.normal_(self.start_transitions)
+            torch.nn.init.normal_(self.end_transitions)
+
+    def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
+        """
+        Computes the (batch_size,) denominator term for the log-likelihood, which is the
+        sum of the likelihoods across all possible state sequences.
+        """
+        batch_size, sequence_length, num_tags = logits.size()
+
+        # Transpose batch size and sequence dimensions
+        mask = mask.transpose(0, 1).contiguous()
+        logits = logits.transpose(0, 1).contiguous()
+
+        # Initial alpha is the (batch_size, num_tags) tensor of likelihoods combining the
+        # transitions to the initial states and the logits for the first timestep.
+        if self.include_start_end_transitions:
+            alpha = self.start_transitions.view(1, num_tags) + logits[0]
+        else:
+            alpha = logits[0]
+
+        # For each i we compute logits for the transitions from timestep i-1 to timestep i.
+        # We do so in a (batch_size, num_tags, num_tags) tensor where the axes are
+        # (instance, current_tag, next_tag)
+        for i in range(1, sequence_length):
+            # The emit scores are for time i ("next_tag") so we broadcast along the current_tag axis.
+            emit_scores = logits[i].view(batch_size, 1, num_tags)
+            # Transition scores are (current_tag, next_tag) so we broadcast along the instance axis.
+            transition_scores = self.transitions.view(1, num_tags, num_tags)
+            # Alpha is for the current_tag, so we broadcast along the next_tag axis.
+            broadcast_alpha = alpha.view(batch_size, num_tags, 1)
+
+            # Add all the scores together and logsumexp over the current_tag axis.
+            inner = broadcast_alpha + emit_scores + transition_scores
+
+            # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension
+            # of `inner`. Otherwise (mask == False) we want to retain the previous alpha.
+            alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (
+                ~mask[i]
+            ).view(batch_size, 1)
+
+        # Every sequence needs to end with a transition to the stop_tag.
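+        # (At this point alpha[b, t] is the log-sum-exp of the scores of all paths
+        # over the unmasked positions of sequence b that end in tag t; adding the
+        # end transitions and reducing over t yields the denominator term.)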
+ if self.include_start_end_transitions: + stops = alpha + self.end_transitions.view(1, num_tags) + else: + stops = alpha + + # Finally we log_sum_exp along the num_tags dim, result is (batch_size,) + return logsumexp(stops) + + def _joint_likelihood( + self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: + """ + Computes the numerator term for the log-likelihood, which is just score(inputs, tags) + """ + batch_size, sequence_length, _ = logits.data.shape + + # Transpose batch size and sequence dimensions: + logits = logits.transpose(0, 1).contiguous() + mask = mask.transpose(0, 1).contiguous() + tags = tags.transpose(0, 1).contiguous() + + # Start with the transition scores from start_tag to the first tag in each input + if self.include_start_end_transitions: + score = self.start_transitions.index_select(0, tags[0]) + else: + score = 0.0 + + # Add up the scores for the observed transitions and all the inputs but the last + # print(mask.shape, tags.shape, logits.shape, sequence_length) + for i in range(sequence_length - 1): + # Each is shape (batch_size,) + current_tag, next_tag = tags[i], tags[i + 1] + # print(current_tag, next_tag) + # print("tags printiiinggggg") + # print(current_tag, next_tag) + # The scores for transitioning from current_tag to next_tag + transition_score = self.transitions[current_tag.view(-1), next_tag.view(-1)] + + # The score for using current_tag + emit_score = logits[i].gather(1, current_tag.view(batch_size, 1)).squeeze(1) + # emit_score= 0 + # Include transition score if next element is unmasked, + # input_score if this element is unmasked. + score = score + transition_score * mask[i + 1] + emit_score * mask[i] + + # Transition from last state to "stop" state. To start with, we need to find the last tag + # for each instance. + last_tag_index = mask.sum(0).long() - 1 + last_tags = tags.gather(0, last_tag_index.view(1, batch_size)).squeeze(0) + + # Compute score of transitioning to `stop_tag` from each "last tag". + if self.include_start_end_transitions: + last_transition_score = self.end_transitions.index_select(0, last_tags) + else: + last_transition_score = 0.0 + + # Add the last input if it's not masked. + last_inputs = logits[-1] # (batch_size, num_tags) + last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1) + last_input_score = last_input_score.squeeze() # (batch_size,) + + score = score + last_transition_score + last_input_score * mask[-1] + + return score + + def forward( + self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None + ) -> torch.Tensor: + """ + Computes the log likelihood. + """ + # mask[tags==-100]=0 + if mask is None: + mask = torch.ones(*tags.size(), dtype=torch.bool) + else: + # The code below fails in weird ways if this isn't a bool tensor, so we make sure. + mask = mask.to(torch.bool) + # print("forward",inputs.shape, tags.shape, mask.shape) + + log_denominator = self._input_likelihood(inputs, mask) + # temp_tags= tags + # tags[tags==-100]=2 + # print(tags[0]) + log_numerator = self._joint_likelihood(inputs, tags, mask) + # tags[mask==0]=-100 + return torch.sum(log_numerator - log_denominator) + + def viterbi_tags( + self, logits: torch.Tensor, mask: torch.BoolTensor = None, top_k: int = None + ) -> Union[List[VITERBI_DECODING], List[List[VITERBI_DECODING]]]: + """ + Uses viterbi algorithm to find most likely tags for the given inputs. + If constraints are applied, disallows all other transitions. 
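+        (Illustrative call, names assumed: for `logits` of shape
+        (batch_size, sequence_length, num_tags) and a boolean `mask`,
+        `self.viterbi_tags(logits, mask)` gives one (tag_ids, score)
+        tuple per batch element when `top_k` is None.)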
+ Returns a list of results, of the same size as the batch (one result per batch member) + Each result is a List of length top_k, containing the top K viterbi decodings + Each decoding is a tuple (tag_sequence, viterbi_score) + For backwards compatibility, if top_k is None, then instead returns a flat list of + tag sequences (the top tag sequence for each batch item). + """ + if mask is None: + mask = torch.ones(*logits.shape[:2], dtype=torch.bool, device=logits.device) + + if top_k is None: + top_k = 1 + flatten_output = True + else: + flatten_output = False + + _, max_seq_length, num_tags = logits.size() + + # Get the tensors out of the variables + logits, mask = logits.data, mask.data + + # Augment transitions matrix with start and end transitions + start_tag = num_tags + end_tag = num_tags + 1 + transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.0) + + # Apply transition constraints + constrained_transitions = self.transitions * self._constraint_mask[ + :num_tags, :num_tags + ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags]) + transitions[:num_tags, :num_tags] = constrained_transitions.data + + if self.include_start_end_transitions: + transitions[ + start_tag, :num_tags + ] = self.start_transitions.detach() * self._constraint_mask[ + start_tag, :num_tags + ].data + -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[ + :num_tags, end_tag + ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) + else: + transitions[start_tag, :num_tags] = -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[:num_tags, end_tag] = -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) + + best_paths = [] + # Pad the max sequence length by 2 to account for start_tag + end_tag. + tag_sequence = torch.Tensor(max_seq_length + 2, num_tags + 2) + + for prediction, prediction_mask in zip(logits, mask): + mask_indices = prediction_mask.nonzero(as_tuple=False).squeeze() + masked_prediction = torch.index_select(prediction, 0, mask_indices) + sequence_length = masked_prediction.shape[0] + + # Start with everything totally unlikely + tag_sequence.fill_(-10000.0) + # At timestep 0 we must have the START_TAG + tag_sequence[0, start_tag] = 0.0 + # At steps 1, ..., sequence_length we just use the incoming prediction + tag_sequence[1 : (sequence_length + 1), :num_tags] = masked_prediction + # And at the last timestep we must have the END_TAG + tag_sequence[sequence_length + 1, end_tag] = 0.0 + + # We pass the tags and the transitions to `viterbi_decode`. + viterbi_paths, viterbi_scores = viterbi_decode( + tag_sequence=tag_sequence[: (sequence_length + 2)], + transition_matrix=transitions, + top_k=top_k, + ) + top_k_paths = [] + for viterbi_path, viterbi_score in zip(viterbi_paths, viterbi_scores): + # Get rid of START and END sentinels and append. 
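+                # (positions 0 and -1 of each decoded path hold the synthetic
+                # start_tag/end_tag added above, so they are sliced off next)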
+ viterbi_path = viterbi_path[1:-1] + top_k_paths.append((viterbi_path, viterbi_score.item())) + best_paths.append(top_k_paths) + + if flatten_output: + return [top_k_paths[0] for top_k_paths in best_paths] + + return best_paths \ No newline at end of file diff --git a/dlkp/models/ke/crf/crf_trainer.py b/dlkp/models/ke/crf/crf_trainer.py new file mode 100644 index 0000000..d9b22df --- /dev/null +++ b/dlkp/models/ke/crf/crf_trainer.py @@ -0,0 +1,189 @@ +from transformers import ( + + Trainer, + set_seed, + +) +from transformers.trainer import * +from transformers.trainer_utils import PredictionOutput +from torch import nn +from torch.utils.data.dataloader import DataLoader +# from torch.utils.data.dataset import Dataset +# from typing import Any, Callable, Dict, List, Optional, Tuple, Union +class CRF_Trainer(Trainer): + def prediction_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> PredictionOutput: + """ + Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. + + Works both with or without labels. + """ + if not isinstance(dataloader.dataset, collections.abc.Sized): + raise ValueError("dataset must implement __len__") + prediction_loss_only = ( + prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only + ) + + if self.args.deepspeed and not self.args.do_train: + # no harm, but flagging to the user that deepspeed config is ignored for eval + # flagging only for when --do_train wasn't passed as only then it's redundant + logger.info("Detected the deepspeed argument but it will not be used for evaluation") + + model = self._wrap_model(self.model, training=False) + + # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while + # ``train`` is running, half it first and then put on device + if not self.is_in_train and self.args.fp16_full_eval: + model = model.half().to(self.args.device) + + batch_size = dataloader.batch_size + num_examples = self.num_examples(dataloader) + logger.info(f"***** Running {description} *****") + logger.info(f" Num examples = {num_examples}") + logger.info(f" Batch size = {batch_size}") + losses_host: torch.Tensor = None + preds_host: Union[torch.Tensor, List[torch.Tensor]] = None + labels_host: Union[torch.Tensor, List[torch.Tensor]] = None + + world_size = max(1, self.args.world_size) + + eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) + if not prediction_loss_only: + # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass + # a batch size to the sampler) + make_multiple_of = None + if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): + make_multiple_of = dataloader.sampler.batch_size + preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + if self.args.past_index >= 0: + self._past = None + model.eval() + + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) + + + + self.callback_handler.eval_dataloader = dataloader + + for step, inputs in enumerate(dataloader): + + loss, logits, labels = self.prediction_step(model, inputs, 
prediction_loss_only, ignore_keys=ignore_keys) + + best_path= self.eval_step(model, logits, inputs['attention_mask']) + # best_path= self.eval_step(model, logits) + # print(len(best_path), best_path[0]) + # logits= torch.zeros() + + best_path= [x for x,_ in best_path] + # print(best_path) + # seq_len= labels.shape[1] + logits*=0 + for i,path in enumerate(best_path): + # print(inputs['attention_mask'][i,0], labels[i,0], inputs['attention_mask'][i,-1], labels[i,-1]) + # print(len(x)) + for j, tag in enumerate(path): + logits[i,j,int(tag)]=1 + # print(inputs['attention_mask'][i,j], labels[i,j]) + + # logits= torch.tensor(data=best_path, dtype= labels.dtype, device= labels.device) + # if(logits.shape!=labels.shape): + # print(logits.shape,labels.shape) + # assert logits.shape==labels.shape + if loss is not None: + losses = loss.repeat(batch_size) + losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) + if logits is not None: + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + if labels is not None: + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + if not prediction_loss_only: + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + + # Set back to None to begin a new accumulation + losses_host, preds_host, labels_host = None, None, None + + if self.args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + if not prediction_loss_only: + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + + eval_loss = eval_losses_gatherer.finalize() + preds = preds_gatherer.finalize() if not prediction_loss_only else None + label_ids = labels_gatherer.finalize() if not prediction_loss_only else None + + if self.compute_metrics is not None and preds is not None and label_ids is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if eval_loss is not None: + metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) + def eval_step(self, + model: nn.Module, + logits, + mask= None, + top_k= None + + ): + with torch.no_grad(): + output= model.crf.viterbi_tags(logits, mask, top_k) + + return output + + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss 
is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. + """ + # if self.label_smoother is not None and "labels" in inputs: + # labels = inputs.pop("labels") + # else: + labels = None + # print(model) + # assert "labels" in inputs + # print(type(inputs),inputs) + outputs = model(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + loss = self.label_smoother(outputs, labels) + else: + # We don't use .loss here since the model may return tuples instead of ModelOutput. + # print(outputs) + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + # print("loss is ", loss) + return (loss, outputs) if return_outputs else loss diff --git a/dlkp/models/ke/crf/crf_utils.py b/dlkp/models/ke/crf/crf_utils.py new file mode 100644 index 0000000..295aeef --- /dev/null +++ b/dlkp/models/ke/crf/crf_utils.py @@ -0,0 +1,346 @@ +""" +Conditional random field utilis file +""" +from typing import List, Tuple, Dict, Union + +import torch + +# from allennlp.common.checks import ConfigurationError +# import allennlp.nn.util as util + +VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score + + +def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]: + """ + Given labels and a constraint type, returns the allowed transitions. It will + additionally include transitions for the start and end states, which are used + by the conditional random field. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + labels : `Dict[int, str]`, required + A mapping {label_id -> label}. Most commonly this would be the value from + Vocabulary.get_index_to_token_vocabulary() + # Returns + `List[Tuple[int, int]]` + The allowed transitions (from_label_id, to_label_id). + """ + num_labels = len(labels) + start_tag = num_labels + end_tag = num_labels + 1 + labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")] + + allowed = [] + for from_label_index, from_label in labels_with_boundaries: + if from_label in ("START", "END"): + from_tag = from_label + from_entity = "" + else: + from_tag = from_label[0] + from_entity = from_label[1:] + for to_label_index, to_label in labels_with_boundaries: + if to_label in ("START", "END"): + to_tag = to_label + to_entity = "" + else: + to_tag = to_label[0] + to_entity = to_label[1:] + if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity): + allowed.append((from_label_index, to_label_index)) + return allowed + + +def is_transition_allowed( + constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str +): + """ + Given a constraint type and strings `from_tag` and `to_tag` that + represent the origin and destination of the transition, return whether + the transition is allowed under the given constraint type. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + from_tag : `str`, required + The tag that the transition originates from. For example, if the + label is `I-PER`, the `from_tag` is `I`. + from_entity : `str`, required + The entity corresponding to the `from_tag`. 
For example, if the + label is `I-PER`, the `from_entity` is `PER`. + to_tag : `str`, required + The tag that the transition leads to. For example, if the + label is `I-PER`, the `to_tag` is `I`. + to_entity : `str`, required + The entity corresponding to the `to_tag`. For example, if the + label is `I-PER`, the `to_entity` is `PER`. + # Returns + `bool` + Whether the transition is allowed under the given `constraint_type`. + """ + + if to_tag == "START" or from_tag == "END": + # Cannot transition into START or from END + return False + + if constraint_type == "BIOUL": + if from_tag == "START": + return to_tag in ("O", "B", "U") + if to_tag == "END": + return from_tag in ("O", "L", "U") + return any( + [ + # O can transition to O, B-* or U-* + # L-x can transition to O, B-*, or U-* + # U-x can transition to O, B-*, or U-* + from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"), + # B-x can only transition to I-x or L-x + # I-x can only transition to I-x or L-x + from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity, + ] + ) + elif constraint_type == "BIO": + if from_tag == "START": + return to_tag in ("O", "B") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or B-x + to_tag in ("O", "B"), + # Can only transition to I-x from B-x or I-x + to_tag == "I" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "IOB1": + if from_tag == "START": + return to_tag in ("O", "I") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or I-x + to_tag in ("O", "I"), + # Can only transition to B-x from B-x or I-x, where + # x is the same tag. + to_tag == "B" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "BMES": + if from_tag == "START": + return to_tag in ("B", "S") + if to_tag == "END": + return from_tag in ("E", "S") + return any( + [ + # Can only transition to B or S from E or S. + to_tag in ("B", "S") and from_tag in ("E", "S"), + # Can only transition to M-x from B-x, where + # x is the same tag. + to_tag == "M" and from_tag in ("B", "M") and from_entity == to_entity, + # Can only transition to E-x from B-x or M-x, where + # x is the same tag. + to_tag == "E" and from_tag in ("B", "M") and from_entity == to_entity, + ] + ) + else: + print("error in constrint type") + + +def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> torch.Tensor: + """ + A numerically stable computation of logsumexp. This is mathematically equivalent to + `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log + probabilities. + # Parameters + tensor : `torch.FloatTensor`, required. + A tensor of arbitrary size. + dim : `int`, optional (default = `-1`) + The dimension of the tensor to apply the logsumexp to. + keepdim: `bool`, optional (default = `False`) + Whether to retain a dimension of size one at the dimension we reduce over. 
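+    # Example
+        (Illustrative.) `logsumexp(torch.tensor([0.0, 0.0]))` equals
+        log(exp(0) + exp(0)) = log(2) ≈ 0.6931, matching `torch.logsumexp`.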
+ """ + max_score, _ = tensor.max(dim, keepdim=keepdim) + if keepdim: + stable_vec = tensor - max_score + else: + stable_vec = tensor - max_score.unsqueeze(dim) + return max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log() + + + + +def viterbi_decode( + tag_sequence: torch.Tensor, + transition_matrix: torch.Tensor, + tag_observations: Optional[List[int]] = None, + allowed_start_transitions: torch.Tensor = None, + allowed_end_transitions: torch.Tensor = None, + top_k: int = None, +): + """ + Perform Viterbi decoding in log space over a sequence given a transition matrix + specifying pairwise (transition) potentials between tags and a matrix of shape + (sequence_length, num_tags) specifying unary potentials for possible tags per + timestep. + # Parameters + tag_sequence : `torch.Tensor`, required. + A tensor of shape (sequence_length, num_tags) representing scores for + a set of tags over a given sequence. + transition_matrix : `torch.Tensor`, required. + A tensor of shape (num_tags, num_tags) representing the binary potentials + for transitioning between a given pair of tags. + tag_observations : `Optional[List[int]]`, optional, (default = `None`) + A list of length `sequence_length` containing the class ids of observed + elements in the sequence, with unobserved elements being set to -1. Note that + it is possible to provide evidence which results in degenerate labelings if + the sequences of tags you provide as evidence cannot transition between each + other, or those transitions are extremely unlikely. In this situation we log a + warning, but the responsibility for providing self-consistent evidence ultimately + lies with the user. + allowed_start_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags the START token + may transition *to*. If provided, additional transition constraints will be used for + determining the start element of the sequence. + allowed_end_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags may transition *to* the + end tag. If provided, additional transition constraints will be used for determining + the end element of the sequence. + top_k : `int`, optional, (default = `None`) + Optional integer specifying how many of the top paths to return. For top_k>=1, returns + a tuple of two lists: top_k_paths, top_k_scores, For top_k==None, returns a flattened + tuple with just the top path and its score (not in lists, for backwards compatibility). + # Returns + viterbi_path : `List[int]` + The tag indices of the maximum likelihood tag sequence. + viterbi_score : `torch.Tensor` + The score of the viterbi path. + """ + if top_k is None: + top_k = 1 + flatten_output = True + elif top_k >= 1: + flatten_output = False + else: + raise ValueError(f"top_k must be either None or an integer >=1. 
Instead received {top_k}") + + sequence_length, num_tags = list(tag_sequence.size()) + + has_start_end_restrictions = ( + allowed_end_transitions is not None or allowed_start_transitions is not None + ) + + if has_start_end_restrictions: + + if allowed_end_transitions is None: + allowed_end_transitions = torch.zeros(num_tags) + if allowed_start_transitions is None: + allowed_start_transitions = torch.zeros(num_tags) + + num_tags = num_tags + 2 + new_transition_matrix = torch.zeros(num_tags, num_tags) + new_transition_matrix[:-2, :-2] = transition_matrix + + # Start and end transitions are fully defined, but cannot transition between each other. + + allowed_start_transitions = torch.cat( + [allowed_start_transitions, torch.tensor([-math.inf, -math.inf])] + ) + allowed_end_transitions = torch.cat( + [allowed_end_transitions, torch.tensor([-math.inf, -math.inf])] + ) + + # First define how we may transition FROM the start and end tags. + new_transition_matrix[-2, :] = allowed_start_transitions + # We cannot transition from the end tag to any tag. + new_transition_matrix[-1, :] = -math.inf + + new_transition_matrix[:, -1] = allowed_end_transitions + # We cannot transition to the start tag from any tag. + new_transition_matrix[:, -2] = -math.inf + + transition_matrix = new_transition_matrix + + if tag_observations: + if len(tag_observations) != sequence_length: + raise ConfigurationError( + "Observations were provided, but they were not the same length " + "as the sequence. Found sequence of length: {} and evidence: {}".format( + sequence_length, tag_observations + ) + ) + else: + tag_observations = [-1 for _ in range(sequence_length)] + + if has_start_end_restrictions: + tag_observations = [num_tags - 2] + tag_observations + [num_tags - 1] + zero_sentinel = torch.zeros(1, num_tags) + extra_tags_sentinel = torch.ones(sequence_length, 2) * -math.inf + tag_sequence = torch.cat([tag_sequence, extra_tags_sentinel], -1) + tag_sequence = torch.cat([zero_sentinel, tag_sequence, zero_sentinel], 0) + sequence_length = tag_sequence.size(0) + + path_scores = [] + path_indices = [] + + if tag_observations[0] != -1: + one_hot = torch.zeros(num_tags) + one_hot[tag_observations[0]] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[0, :].unsqueeze(0)) + + # Evaluate the scores for all possible paths. + for timestep in range(1, sequence_length): + # Add pairwise potentials to current scores. + summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix + summed_potentials = summed_potentials.view(-1, num_tags) + + # Best pairwise potential path score from the previous timestep. + max_k = min(summed_potentials.size()[0], top_k) + scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) + + # If we have an observation for this timestep, use it + # instead of the distribution over tags. + observation = tag_observations[timestep] + # Warn the user if they have passed + # invalid/extremely unlikely evidence. + if tag_observations[timestep - 1] != -1 and observation != -1: + if transition_matrix[tag_observations[timestep - 1], observation] < -10000: + logger.warning( + "The pairwise potential between tags you have passed as " + "observations is extremely unlikely. Double check your evidence " + "or transition potentials!" 
+ ) + if observation != -1: + one_hot = torch.zeros(num_tags) + one_hot[observation] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[timestep, :] + scores) + path_indices.append(paths.squeeze()) + + # Construct the most likely sequence backwards. + path_scores_v = path_scores[-1].view(-1) + max_k = min(path_scores_v.size()[0], top_k) + viterbi_scores, best_paths = torch.topk(path_scores_v, k=max_k, dim=0) + viterbi_paths = [] + for i in range(max_k): + viterbi_path = [best_paths[i]] + for backward_timestep in reversed(path_indices): + viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) + # Reverse the backward path. + viterbi_path.reverse() + + if has_start_end_restrictions: + viterbi_path = viterbi_path[1:-1] + + # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. + viterbi_path = [j % num_tags for j in viterbi_path] + viterbi_paths.append(viterbi_path) + + if flatten_output: + return viterbi_paths[0], viterbi_scores[0] + + return viterbi_paths, viterbi_scores \ No newline at end of file diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/transformer/token_classification_models.py b/dlkp/models/ke/transformer/token_classification_models.py new file mode 100644 index 0000000..e69de29 From 0b9918a58d0f3c8e2b96c6090dc78db2867e74ef Mon Sep 17 00:00:00 2001 From: Amardeep Date: Sat, 22 Jan 2022 20:20:51 +0530 Subject: [PATCH 02/17] add notebooks from KPT --- notebooks/klm_preprocess.ipynb | 702 +++++++++++++++++++++++++++++ notebooks/tranKP.ipynb | 801 +++++++++++++++++++++++++++++++++ 2 files changed, 1503 insertions(+) create mode 100644 notebooks/klm_preprocess.ipynb create mode 100644 notebooks/tranKP.ipynb diff --git a/notebooks/klm_preprocess.ipynb b/notebooks/klm_preprocess.ipynb new file mode 100644 index 0000000..a8fa5e1 --- /dev/null +++ b/notebooks/klm_preprocess.ipynb @@ -0,0 +1,702 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "klm_preprocess.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LEHkPwSWLBBD", + "outputId": "2adf486e-ff7b-4750-ec88-b8d0933f951d" + }, + "source": [ + "!pip install transformers\n", + "!pip install datasets\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (4.1.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", + "Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.43)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) 
(20.8)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: tokenizers==0.9.4 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.9.4)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.19.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.12.5)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.0.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n", + "Requirement already satisfied: datasets in /usr/local/lib/python3.6/dist-packages (1.2.0)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from datasets) (0.3.3)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.6/dist-packages (from datasets) (1.19.4)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from datasets) (0.8)\n", + "Requirement already satisfied: pyarrow>=0.17.1 in /usr/local/lib/python3.6/dist-packages (from datasets) (2.0.0)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.6/dist-packages (from datasets) (0.70.11.1)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from datasets) (1.1.5)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.6/dist-packages (from datasets) (2.0.0)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.6/dist-packages (from datasets) (2.23.0)\n", + "Requirement already satisfied: tqdm<4.50.0,>=4.27 in /usr/local/lib/python3.6/dist-packages (from datasets) (4.41.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2018.9)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2.8.1)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in 
/usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2020.12.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pSvgoduMLGEp" + }, + "source": [ + "from transformers import RobertaForMaskedLM\n", + "from transformers import RobertaTokenizer, PreTrainedTokenizer\n", + "from transformers import RobertaConfig" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YoVheMQrnEmb" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "C3oHJencLGMb" + }, + "source": [ + "#config\n", + "config = RobertaConfig(\n", + " vocab_size=52_000,\n", + " max_position_embeddings=514,\n", + " num_attention_heads=12,\n", + " num_hidden_layers=6,\n", + " type_vocab_size=1,\n", + ")\n", + "\n", + "#model roberta\n", + "model = RobertaForMaskedLM(config=config)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7yxtwm9xMYN3" + }, + "source": [ + "**Code requirement**\n", + "\n", + "\n", + "1. Dataset class :\n", + "\n", + " load and tokenize dataset->> input ids\n", + "\n", + " *look if nlp dataset library could be used here easily*\n", + "\n", + " tokenize key phrase as well as text and mask key phrase in data collator\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "2. Data collator for masked LM\n", + "\n", + " takes a list of samples from a Dataset and collate them into a batch for (also masking and stuffs)\n", + "\n", + " Refrence class: DataCollatorForWholeWordMask\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7kKlX2aRM0k2" + }, + "source": [ + "**Dataset class**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9vE_U_IuLGP9" + }, + "source": [ + "from torch.utils.data.dataset import Dataset\n", + "import json, os" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "g2CG9HfpNmLs" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jWl2QW0TNmRE" + }, + "source": [ + "class KLMDataset(Dataset):\n", + "\n", + " def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):\n", + " assert os.path.isfile(file_path)\n", + "\n", + " # logger.info(\"Creating features from dataset file at %s\", file_path)\n", + " self.abst= []\n", + " self.kps= []\n", + " with open(file_path, encoding=\"utf-8\") as f:\n", + " for line in f:\n", + " d=json.loads(line)\n", + " self.abst.append(d['text'])\n", + " self.kps.append(d['kp'])\n", + "\n", + " for (i,kp) in enumerate(self.kps):\n", + " self.kps[i]= tokenizer(kp,add_special_tokens= False, truncation= False)['input_ids']\n", + " \n", + "\n", + " self.abst = tokenizer(self.abst, add_special_tokens=True, truncation=True, max_length=block_size)[\"input_ids\"]\n", + " \n", + "\n", + " def __len__(self):\n", + " return len(self.abst)\n", + "\n", + " def __getitem__(self, i):\n", + " # print(\"called {} and results{}\\n\".format(i,{'input_ids': self.abst[i], 'kp': self.kps[i]}))\n", + " return {'input_ids': self.abst[i], 'kp': self.kps[i]}\n", + "\n", + "# super daset from HF\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xs-wm7RVNmUP" + }, + 
"source": [ + "tok= RobertaTokenizer.from_pretrained(\"roberta-base\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "991fGSu8hW-n" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-gLTyA_0hvMU" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YIXBbF_ZYqCF" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "f5NOBJhfNmXQ" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x_GyWRwqM5KI" + }, + "source": [ + "**Data collator**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "W5FcKIbkLGS2" + }, + "source": [ + "from transformers import DataCollatorForLanguageModeling\n", + "from dataclasses import dataclass\n", + "from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union\n", + "import torch\n", + "from transformers.data.data_collator import _collate_batch, tolist\n", + "import random" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xZMjWP8mLGVz" + }, + "source": [ + "@dataclass\n", + "class DataCollatorForKLM(DataCollatorForLanguageModeling):\n", + " def __init__(self, \n", + " tokenizer: PreTrainedTokenizer,\n", + " mlm_probability= 0.15,\n", + " kp_mask_percentage = 0.8):\n", + " self.tokenizer= tokenizer\n", + " self.mlm_probability= mlm_probability\n", + " self.kp_mask_percentage = kp_mask_percentage\n", + "\n", + " def __call__(\n", + " self, examples\n", + " ) -> Dict[str, torch.Tensor]:\n", + " print(\"collator \",examples)\n", + " if isinstance(examples[0], dict):\n", + " print(examples[0])\n", + " input_ids = [e[\"input_ids\"] for e in examples]\n", + " key_phrases= [e[\"labels\"] for e in examples]\n", + " else:\n", + " print(\"proper inputr fromat is not found for kp input ids\")\n", + " \n", + "\n", + " batch_input = _collate_batch(input_ids, self.tokenizer)\n", + "\n", + " mask_labels = []\n", + " kp_mask_labels= []\n", + " for e in examples:\n", + " ref_tokens = []\n", + " kp_tokens_list= []\n", + " for id in tolist(e[\"input_ids\"]):\n", + " token = self.tokenizer._convert_id_to_token(id)\n", + " ref_tokens.append(token)\n", + " for kp in tolist(e[\"labels\"]):\n", + " curr_kp= []\n", + " for kp_id in kp:\n", + " tok= self.tokenizer._convert_id_to_token(kp_id)\n", + " curr_kp.append(tok)\n", + " if len(curr_kp) >0:\n", + " kp_tokens_list.append(curr_kp)\n", + " mask_res= self.kp_and_whole_word_mask(ref_tokens, kp_tokens_list) #[[\"KP1-T1\", \"KP1-T2\"], [\"KP2-T1\", \"KP2-T2\", \"KP2-T3\"]] \n", + " mask_labels.append(mask_res[0])\n", + " kp_mask_labels.append(mask_res[1])\n", + " #collate\n", + " batch_mask = _collate_batch(mask_labels, self.tokenizer)\n", + " kp_batch_mask= _collate_batch(kp_mask_labels, self.tokenizer)\n", + " #mask\n", + " inputs, labels = self.mask_tokens_and_kp(batch_input, batch_mask, kp_batch_mask)\n", + "\n", + " return {\"input_ids\": inputs, \"labels\": labels}\n", + "\n", + " def kp_and_whole_word_mask(self, input_tokens, kp_tokens_list, max_predictions=512):\n", + " \"\"\"\n", + " Get 0/1 labels for masked tokens with whole word mask proxy\n", + " \"\"\"\n", + "\n", + " cand_indexes = []\n", + " kp_indexes= []\n", + " for (i, token) in enumerate(input_tokens):\n", + " if token == \"[CLS]\" 
or token == \"[SEP]\":\n", + " continue\n", + " kp_flag = False\n", + " for kp in kp_tokens_list: # kp = [\"KP1-T1\", \"KP1-T2\"]\n", + " j= i + len(kp)\n", + " if j < len(input_tokens):\n", + " if input_tokens[i:j]== kp: # input_tokens = [\"KP1-T1\", \"KP1-T2\"]\n", + " kp_indexes.append([x for x in range(i,j)]) # kp_indexes = [\"index of KP1-T1\", \"index of KP1-T2\"]\n", + " i=j-1\n", + " kp_flag= True\n", + " break\n", + " if kp_flag: #if token is included in kp mask then don't include in random token mask\n", + " continue\n", + " if len(cand_indexes) >= 1 and token.startswith(\"##\"):\n", + " cand_indexes[-1].append(i)\n", + " else:\n", + " cand_indexes.append([i])\n", + " \n", + " tok_to_predict= min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))\n", + " kp_to_predict= min(max_predictions, max(1, int(round(len(kp_tokens_list) * self.kp_mask_percentage))))\n", + "\n", + " tok_mask_labels= self.get_mask_labels(cand_indexes=cand_indexes, len_input_tokens=len(input_tokens), num_to_predict=tok_to_predict)\n", + " kp_mask_labels= self.get_mask_labels(cand_indexes=kp_indexes, len_input_tokens=len(input_tokens), num_to_predict=kp_to_predict)\n", + " return tok_mask_labels, kp_mask_labels\n", + "\n", + "\n", + " def get_mask_labels(self, cand_indexes, len_input_tokens, num_to_predict):\n", + " random.shuffle(cand_indexes)\n", + " masked_lms = []\n", + " covered_indexes = set()\n", + " for index_set in cand_indexes:\n", + " if len(masked_lms) >= num_to_predict:\n", + " break\n", + " # If adding a whole-word mask would exceed the maximum number of\n", + " # predictions, then just skip this candidate.\n", + " if len(masked_lms) + len(index_set) > num_to_predict:\n", + " continue\n", + " is_any_index_covered = False\n", + " for index in index_set:\n", + " if index in covered_indexes:\n", + " is_any_index_covered = True\n", + " break\n", + " if is_any_index_covered:\n", + " continue\n", + " for index in index_set:\n", + " covered_indexes.add(index)\n", + " masked_lms.append(index)\n", + "\n", + " assert len(covered_indexes) == len(masked_lms)\n", + " mask_labels = [1 if i in covered_indexes else 0 for i in range(len_input_tokens)]\n", + " return mask_labels\n", + "\n", + " def mask_tokens_and_kp(self, inputs, mask_labels, kp_mask_labels): \n", + " \"\"\"\n", + " Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set\n", + " 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.\n", + " \"\"\"\n", + "\n", + " if self.tokenizer.mask_token is None:\n", + " raise ValueError(\n", + " \"This tokenizer does not have a mask token which is necessary for masked language modeling. 
Remove the --mlm flag if you want to use this tokenizer.\"\n", + " )\n", + " labels = inputs.clone()\n", + " # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)\n", + "\n", + " probability_matrix = mask_labels\n", + " kp_probability_matrix = kp_mask_labels\n", + "\n", + " special_tokens_mask = [\n", + " self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()\n", + " ]\n", + " # do zero for special tokens\n", + " probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)\n", + " kp_probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)\n", + "\n", + " # assert kp_probability_matrix & probability_matrix == 0\n", + " # do zero for padded points\n", + " if self.tokenizer._pad_token is not None:\n", + " padding_mask = labels.eq(self.tokenizer.pad_token_id)\n", + " probability_matrix.masked_fill_(padding_mask, value=0.0)\n", + " kp_probability_matrix.masked_fill_(padding_mask, value=0.0)\n", + "\n", + " masked_indices = probability_matrix.bool()\n", + " kp_masked_indices = kp_probability_matrix.bool()\n", + " # get the gold lables\n", + " labels[~(masked_indices | kp_masked_indices)] = -100 # We only compute loss on random masked tokens and kp masked token else is set to -100\n", + "\n", + " # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])\n", + " indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices\n", + " inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)\n", + " # 80 % masking for key phrases\n", + " kp_indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & kp_masked_indices\n", + " inputs[kp_indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)\n", + " # generate random tokens\n", + " random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)\n", + " # 10% of the time, we replace masked input tokens with random word\n", + " indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced\n", + " inputs[indices_random] = random_words[indices_random]\n", + "\n", + " # replace 10 # kp tokens with random idices\n", + " kp_indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & kp_masked_indices & ~kp_indices_replaced\n", + " inputs[kp_indices_random] = random_words[kp_indices_random]\n", + " # The rest of the time (10% of the time) we keep the masked input tokens unchanged\n", + " # print(\"inside mask tok functiom \\n\",inputs,\"\\n\", labels,\"\\n\")\n", + "\n", + " # generation - t1, t2, t3 (actual) - [MASK], t4 [MASK], t5, t6\n", + " # replacement - t1, t2, t3 (actual) - [MASK], t4 [MASK], t5, t6 (replace) t9\n", + " \n", + " return inputs, labels\n", + "\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "9azrJU6UuPFt" + }, + "source": [ + "from datasets import load_dataset\n", + "def load_klm_dataset(tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):\n", + " \n", + " def pre_process(d):\n", + " kp_pro= tokenizer(d['kp'],add_special_tokens= False, truncation= False)[\"input_ids\"]\n", + " d['input_ids']= tokenizer(d['text'], add_special_tokens=True, truncation=True, max_length=block_size)[\"input_ids\"]\n", + " d['labels'] = kp_pro\n", + " # print(\"inn 
inn\",d['kp'])\n", + " return d\n", + "\n", + "\n", + " dataset = load_dataset('json', data_files= file_path, split='train' )\n", + " dataset= dataset.map(pre_process)\n", + " # print(\"inn \", dataset)\n", + " dataset.set_format(columns=[ 'labels', 'input_ids'])\n", + "\n", + " return dataset\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "9d2Zn2PJvQhl" + }, + "source": [ + "# tok(['iam mam'],add_special_tokens= False, truncation= False)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qa4ArAI0kbs_", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "bcc7ec82-6a79-416b-83c4-a9bf904e2536" + }, + "source": [ + "\n", + "# data_set= KLMDataset(tokenizer=tok, file_path=\"/content/dummy.txt\", block_size= 200)\n", + "data_set = load_klm_dataset(tokenizer= tok, file_path= \"/content/train.json\", block_size= 124)\n", + "# data_set.set_format(columns=[ 'kp', 'input_ids'])\n", + "dc= DataCollatorForKLM(tokenizer= tok)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Using custom data configuration default\n", + "Reusing dataset json (/content/json/default-16dd99a81353c724/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)\n", + "Loading cached processed dataset at /content/json/default-16dd99a81353c724/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-6d89745262ae7664.arrow\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mEo-CRzkLGYQ" + }, + "source": [ + "from transformers import Trainer, TrainingArguments\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=\"/content\",\n", + " overwrite_output_dir=True,\n", + " num_train_epochs=1,\n", + " per_gpu_train_batch_size=64,\n", + " save_steps=10_000,\n", + " save_total_limit=2, # need to save all the models\n", + ")\n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "3rvp2Yk5LGa7" + }, + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=dc,\n", + " train_dataset= data_set\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Cv-vnqziLGda", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 201 + }, + "outputId": "21fccf71-22e9-4403-b8e0-2bd1f86d60fa" + }, + "source": [ + "trainer.train()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.\n", + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.\n", + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. 
Using `--per_device_train_batch_size` is preferred.\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "collator [{'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 
66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}]\n", + "{'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " \n", + " [1/1 00:00, Epoch 1/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss

" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=1, training_loss=10.954537391662598)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 159 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "DDsc-Ld-LGgM" + }, + "source": [ + "# Data format - {\"text\": ....., \"keyphrases\": [{\"surface_form\": ..., \"start\": ..., \"end\": ...}]}\n", + "# format - jsonl one json per line\n", + "# dir - 1.jsonl, 2.jsonl, 3.jsonl" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ncOg07xPLGkN" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tC3T7Y35LGnQ" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/tranKP.ipynb b/notebooks/tranKP.ipynb new file mode 100644 index 0000000..7835b24 --- /dev/null +++ b/notebooks/tranKP.ipynb @@ -0,0 +1,801 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "tranKP.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HXDqBqrdaoNw", + "outputId": "4e45f4d4-324f-44a0-f289-62479ccd56ef" + }, + "source": [ + "!pip install transformers\n", + "!pip install sentencepiece\n", + "!pip install datasets" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting transformers\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)\n", + "\u001b[K |████████████████████████████████| 1.5MB 4.2MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.19.4)\n", + "Collecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n", + "\u001b[K |████████████████████████████████| 890kB 17.1MB/s \n", + "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.8)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8)\n", + "Collecting tokenizers==0.9.4\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)\n", + "\u001b[K |████████████████████████████████| 2.9MB 21.5MB/s \n", + "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.6/dist-packages 
(from sacremoses->transformers) (1.15.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.0.0)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.12.5)\n", + "Building wheels for collected packages: sacremoses\n", + " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=5eee5bbfe2f9124d4f5d0c0332e4124d253f5989149682918253b6700d942717\n", + " Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n", + "Successfully built sacremoses\n", + "Installing collected packages: sacremoses, tokenizers, transformers\n", + "Successfully installed sacremoses-0.0.43 tokenizers-0.9.4 transformers-4.1.1\n", + "Collecting sentencepiece\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)\n", + "\u001b[K |████████████████████████████████| 1.1MB 5.9MB/s \n", + "\u001b[?25hInstalling collected packages: sentencepiece\n", + "Successfully installed sentencepiece-0.1.94\n", + "Collecting datasets\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ee/78/5873ac1e27bf25a2cbf3447d6704edd3136b1b3ff0eb3bfab38a45d2a1ff/datasets-1.2.0-py3-none-any.whl (159kB)\n", + "\u001b[K |████████████████████████████████| 163kB 4.1MB/s \n", + "\u001b[?25hRequirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from datasets) (0.8)\n", + "Requirement already satisfied: tqdm<4.50.0,>=4.27 in /usr/local/lib/python3.6/dist-packages (from datasets) (4.41.1)\n", + "Collecting xxhash\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f7/73/826b19f3594756cb1c6c23d2fbd8ca6a77a9cd3b650c9dec5acc85004c38/xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242kB)\n", + "\u001b[K |████████████████████████████████| 245kB 6.1MB/s \n", + "\u001b[?25hCollecting pyarrow>=0.17.1\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d7/e1/27958a70848f8f7089bff8d6ebe42519daf01f976d28b481e1bfd52c8097/pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7MB)\n", + "\u001b[K |████████████████████████████████| 17.7MB 1.5MB/s \n", + "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from datasets) (0.3.3)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.6/dist-packages (from datasets) (1.19.4)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.6/dist-packages (from datasets) (0.70.11.1)\n", + "Requirement already 
satisfied: requests>=2.19.0 in /usr/local/lib/python3.6/dist-packages (from datasets) (2.23.0)\n",
+        "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from datasets) (1.1.5)\n",
+        "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
+        "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2020.12.5)\n",
+        "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2.10)\n",
+        "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n",
+        "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2.8.1)\n",
+        "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2018.9)\n",
+        "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n",
+        "Installing collected packages: xxhash, pyarrow, datasets\n",
+        "  Found existing installation: pyarrow 0.14.1\n",
+        "    Uninstalling pyarrow-0.14.1:\n",
+        "      Successfully uninstalled pyarrow-0.14.1\n",
+        "Successfully installed datasets-1.2.0 pyarrow-2.0.0 xxhash-2.0.0\n"
+      ],
+      "name": "stdout"
+     }
+    ]
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "r3IcbMLKYCz7"
+    },
+    "source": [
+     "# utils\n",
+     "import os, sys\n",
+     "import argparse\n",
+     "from dataclasses import dataclass, field\n",
+     "from typing import Dict, List, Optional\n",
+     "@dataclass\n",
+     "class BasicKPArgs:\n",
+     "    model_type : Optional[str] = field(\n",
+     "        default=\"enc_dec\",\n",
+     "        metadata= {\"help\": \"encoder-decoder type, or another generative model such as BART\"}\n",
+     "    )\n",
+     "\n",
+     "    model_name_path : Optional[str] = field(\n",
+     "        default= None,\n",
+     "        metadata= {\"help\": \"path or name used to load a pretrained model, or a checkpoint\"}\n",
+     "    )\n",
+     "    decoder_model_name_path : Optional[str] = field(\n",
+     "        default= None,\n",
+     "        metadata= {\"help\": \"path or name of the decoder part of the model in the enc_dec architecture\"}\n",
+     "    )\n",
+     "    tokenizer_path : Optional[str] = field(\n",
+     "        default= None,\n",
+     "        metadata= {\"help\": \"path or name of a saved custom tokenizer; if provided this tokenizer will be loaded, else the auto tokenizer\"}\n",
+     "    )\n",
+     "    data_dir : Optional[str] = field(\n",
+     "        default= \"\",\n",
+     "        metadata= {\"help\": \"path to dir containing data\"}\n",
+     "    )\n",
+     "    kp_task_type : Optional[str] = field(\n",
+     "        default= \"one2one\",\n",
+     "        metadata= {\"help\": \"whether to use one2one or one2many\"}\n",
+     "    )\n",
+     "    max_src_len : Optional[int] = field(\n",
+     "        default= 512,\n",
+     "        metadata= {\"help\": \"maximum length of the source seq\" }\n",
+     "    )\n",
+     "    max_tar_len : Optional[int] = field(\n",
+     "        default= 64,\n",
+     "        metadata= {\"help\": \"maximum length of the target seq\" }\n",
+     "    )\n",
+     "    # this is parsed from training args\n",
+     "    # out_dir: Optional[str] = field(\n",
+     "    #     default= \"\",\n",
+     "    #     metadata= {\"help\": \"path of the dir used to save trained weights and output\"}\n",
+     "    # )\n",
+     "    from_pretrained : Optional[bool] = field(\n",
+     "        default= True,\n",
+     "        metadata= {\"help\": \"whether to load model weights from a pretrained checkpoint or to start from scratch\"}\n",
+     "    )\n",
+     "    predict_only : Optional[bool] = field(\n",
+     "        default= False,\n",
+     "        metadata= {\"help\": \"whether to only predict, or to train, validate and predict\"}\n",
+     "    )\n",
+     "    dataset_class : Optional[str] = field(\n",
+     "        default= \"single\",\n",
+     "        metadata= {\"help\": \"single | multiple : type of dataset reader to use; read train data split into multiple train files or from a single one\" }\n",
+     "    )"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
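+   {
+    "cell_type": "code",
+    "metadata": {},
+    "source": [
+     "# Hypothetical input sketch (illustration only; the values are made up):\n",
+     "# the one2many loaders below expect JSONL with a \"text\" string and a \"kp\" list\n",
+     "# per line, e.g.\n",
+     "# {\"text\": \"We study keyphrase generation with seq2seq models.\", \"kp\": [\"keyphrase generation\", \"seq2seq models\"]}\n",
+     "# KPone2manyDataset / tok_and_process join the \"kp\" entries with kp_sep_token\n",
+     "# into a single target string before tokenization."
+    ],
+    "execution_count": null,
+    "outputs": []
+   },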
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "TgUthX_SaXRY"
+    },
+    "source": [
+     "# dataset\n",
+     "import os, sys\n",
+     "import torch\n",
+     "import json\n",
+     "from torch.utils.data.dataset import Dataset\n",
+     "class KPone2manyDataset(Dataset):\n",
+     "    def __init__(self, tokenizer, file_path, max_src_len, max_tar_len, kp_sep_token = \"\"):\n",
+     "        '''\n",
+     "        file should contain one json per line with\n",
+     "        \"text\": string and \"kp\": list[str] containing all keyphrases\n",
+     "        '''\n",
+     "        assert os.path.exists(file_path)\n",
+     "        self.abst= []\n",
+     "        self.kps= []\n",
+     "        self.src_attn_mask = []\n",
+     "        self.tokenizer = tokenizer\n",
+     "        with open(file_path, encoding=\"utf-8\") as f:\n",
+     "            for line in f:\n",
+     "                d=json.loads(line)\n",
+     "                self.abst.append(d['text'])\n",
+     "                curr_kp= \"\"\n",
+     "                for (i,kp) in enumerate(d['kp']):\n",
+     "                    if i !=0:\n",
+     "                        curr_kp += \" \" + kp_sep_token +\" \"\n",
+     "                    curr_kp += kp.strip()\n",
+     "\n",
+     "                self.kps.append(curr_kp)\n",
+     "\n",
+     "        assert len(self.kps) == len(self.abst)\n",
+     "        self.ex_len= len(self.abst)\n",
+     "        self.kps= self.tokenizer.batch_encode_plus(self.kps, truncation=True, max_length= max_tar_len, pad_to_max_length= True)\n",
+     "        self.abst= self.tokenizer.batch_encode_plus(self.abst, truncation=True, max_length= max_src_len, pad_to_max_length= True)\n",
+     "\n",
+     "    def __len__(self):\n",
+     "        return self.ex_len\n",
+     "\n",
+     "    def __getitem__(self, i):\n",
+     "        return {\n",
+     "            'src_ids': torch.tensor(self.abst['input_ids'][i]),\n",
+     "            'tar_ids': torch.tensor(self.kps['input_ids'][i]),\n",
+     "            'src_attn': torch.tensor(self.abst['attention_mask'][i]),\n",
+     "            'tar_attn': torch.tensor(self.kps['attention_mask'][i])\n",
+     "        }\n",
+     "\n",
+     "# class kpone2manyMultiDataset(Dataset):\n",
+     "#     def __init__(self, tokenizer, data_dir, file_prefix, n=10000, max_src_len, max_tar_len, kp_sep_token = \"\"):\n",
+     "#         self.tokenizer = tokenizer\n",
+     "#         self.data_dir = data_dir\n",
+     "#         self.file_prefix = file_prefix\n",
+     "#         self.total_examples = n\n",
+     "#         self.max_src_len = max_src_len\n",
+     "#         self.max_tar_len = max_tar_len\n",
+     "#         self.kp_sep_token = kp_sep_token\n",
+     "\n",
+     "#         assert os.path.exists(self.data_dir)\n",
+     "\n",
+     "#         pass\n",
+     "#     def read_files(self):\n",
+     "#         pass\n",
+     "\n",
+     "#     def __len__(self):\n",
+     "#         pass\n",
+     "\n",
+     "#     def __getitem__(self,i):\n",
+     "#         pass\n",
+     "\n",
+     "\n",
+     "# super dataset class\n",
+     "def load_kp_data_and_dataset_class( tokenizer, file_path, max_src_len, max_tar_len, kp_sep_token = \"\"):\n",
+     "    from datasets import load_dataset\n",
+     "    def tok_and_process(d):\n",
+     "        curr_kp= \"\"\n",
+     "        for (i,kp) in enumerate(d['kp']):\n",
+     "            if i !=0:\n",
+     "                curr_kp += \" \" + kp_sep_token +\" \"\n",
+     "            curr_kp += kp.strip()\n",
+     "        src_encode= tokenizer(d['text'], truncation=True, max_length= max_src_len, pad_to_max_length= True)\n",
+     "        tar_encode= tokenizer(curr_kp, truncation=True, max_length= max_tar_len, pad_to_max_length= True)\n",
+     "        d['input_ids'] = src_encode['input_ids']\n",
+     "        d['decoder_input_ids']= tar_encode['input_ids']\n",
+     "        d['attention_mask']= src_encode['attention_mask']\n",
+     "        # d['tar_attn'] = tar_encode['attention_mask']\n",
+     "\n",
+     "        return d\n",
+     "\n",
+     "\n",
+     "    dataset = load_dataset('json', data_files= file_path, split='train')\n",
+     "    dataset= dataset.map(tok_and_process)\n",
+     "    dataset.set_format(type='torch', columns=['input_ids', 'decoder_input_ids', 'attention_mask'])\n",
+     "\n",
+     "    return dataset\n",
+     "\n"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "zL3jsUs2aXUI"
+    },
+    "source": [
+     "# collate\n",
+     "import os, sys\n",
+     "import torch\n",
+     "class TPBDataCollator():\n",
+     "    def __init__(self, tokenizer, need_to_shift= False, start_tok_id= None):\n",
+     "        self.tokenizer = tokenizer\n",
+     "        self.shift_right= need_to_shift\n",
+     "        self.dec_start_tok_id= self.tokenizer.pad_token_id if start_tok_id is None else start_tok_id # generally the same as the pad token id\n",
+     "\n",
+     "    def __call__(self, ex):\n",
+     "        # print(ex)\n",
+     "        src_ids= torch.stack([e['input_ids'] for e in ex])\n",
+     "        tar_ids= torch.stack([e['decoder_input_ids'] for e in ex])\n",
+     "        src_attn_mask= torch.stack([e['attention_mask'] for e in ex])\n",
+     "        # src_ids= [e['src_ids'] for e in ex]\n",
+     "        # tar_ids= [e['tar_ids'] for e in ex]\n",
+     "        # src_attn_mask= [e['src_attn'] for e in ex]\n",
+     "        # tar_attn_mask = torch.stack([e['tar_attn'] for e in ex])\n",
+     "        # create labels for the loss calculation\n",
+     "        labels= tar_ids.clone()\n",
+     "        labels[labels[:]== self.tokenizer.pad_token_id] = -100 # ignore the loss at pad token ids\n",
+     "\n",
+     "        # get decoder input ids\n",
+     "\n",
+     "        if self.shift_right: # either shift right here (t5), or pass decoder ids as None for bart/pegasus and they will create decoder ids by shifting the labels to the right\n",
+     "            decoder_ids= self.right_shift(tar_ids)\n",
+     "\n",
+     "        else:\n",
+     "            decoder_ids= tar_ids\n",
+     "\n",
+     "        batch = {\n",
+     "            \"input_ids\": src_ids,\n",
+     "            \"attention_mask\" : src_attn_mask,\n",
+     "            \"decoder_input_ids\": decoder_ids,\n",
+     "            \"labels\": labels\n",
+     "        }\n",
+     "\n",
+     "        return batch\n",
+     "\n",
+     "    def right_shift(self, input_ids):\n",
+     "        pad_token_id= self.dec_start_tok_id # same as the pad token id\n",
+     "        prev_output_tokens = input_ids.clone()\n",
+     "        assert pad_token_id is not None, \"self.model.config.pad_token_id has to be defined.\"\n",
+     "        # replace possible -100 values in labels by `pad_token_id`\n",
+     "        prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)\n",
+     "\n",
+     "        index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)\n",
+     "        decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()\n",
+     "        prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()\n",
+     "        prev_output_tokens[:, 0] = decoder_start_tokens\n",
+     "        return prev_output_tokens\n",
+     "\n"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "W8r6v-NJaXWs"
+    },
+    "source": [
+     "# main\n",
+     "import os, sys\n",
+     "# from utils import arg_parse\n",
+     "from transformers import (\n",
+     "    AutoTokenizer,\n",
+     "    EncoderDecoderModel,\n",
+     "    BartTokenizerFast,\n",
+     "    AutoModelForSeq2SeqLM,\n",
+     "    AutoConfig,\n",
+     "    Trainer,\n",
+     "    TrainingArguments,\n",
+     "    HfArgumentParser\n",
+     ")\n",
+     "\n",
+     "# from dataset_fn import *\n",
+     "# from collate_fn import *\n",
+     "\n",
+     "\n",
+     "COLLATE_DICT= {\n",
+     "    't5': TPBDataCollator\n",
+     "\n",
+     "\n",
+     "}\n",
+     "\n",
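+     "# A hypothetical usage sketch (illustration only; names come from the cells above):\n",
+     "# collate = COLLATE_DICT['t5'](tokenizer= tokenizer, need_to_shift= True)\n",
+     "# batch = collate([train_data_set[0], train_data_set[1]])  # dict with input_ids, attention_mask, decoder_input_ids, labels\n",
+     "# right_shift mirrors the older HF shift_tokens_right: it moves every target one\n",
+     "# step right and seeds position 0 with the gathered final non-pad token, so the\n",
+     "# decoder predicts token t from tokens < t.\n",
+     "\n",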
"\n", + "}\n", + "\n", + "DATASET_DICT= {\n", + " # 'one2many_single': KPone2manyDataset\n", + " 'one2many_single': load_kp_data_and_dataset_class\n", + "\n", + "}\n", + "\n", + "CONFIG_MAP = {\n", + "\n", + "}\n", + "\n", + "\n", + "TOKENIZER_MAP = {\n", + " \n", + "\n", + "}\n", + "\n", + "MODEL_MAP = {\n", + "\n", + "}\n", + "# TODO\n", + "# modify tokenizer in main function if there is requirement of special token addition and stuff\n", + "# chek if there is crosss ateention enabled in decoder part of the model and its working\n", + "# see if special token needed and shifting or other requirement->>>> one at a time\n", + "# 1. bart model\n", + "# 2. t5\n", + "# 3. pegasus \n", + "# add token in every tokenizer and keeep rest same, qg has better logic\n", + "#token shifting in bart t5 pegasus\n", + " # t5 tokenizer genrate token as required( there is need to shift right), but bart and pegasus add cls/bos and sep/eos in start and end and it also shifts automatically\n", + " # for bart and pegasus simply copying target seq as labels and target seq as decodeer ip could be tried as these model automatically shift to right\n", + " #final: bart shifts label (i.e target seq ) to right if passed decoder ip ids is none so only labels and input ids can be passed can be passed. if you want you cann remove [cls]/[sep] token as required\n", + " #pegasus; same as bart\n", + "#encode decoder: look for shifting\n", + " # cls can be use as bos and sep as eos: this is mentioned in HF blogs\n", + "# how to levare seq2seq trainer or trainer directly\n", + " # trainer and seq2seq trainer seems to be the same thing, we can try them alternative and can see which is best\n", + " # \n", + "\n", + "# add compute metrics\n", + "\n", + "# add do predict and generate function option\n", + "\n", + "def main_fn(args= None, training_args = None):\n", + " #ars parsing\n", + " # parser= HfArgumentParser((BasicKPArgs, TrainingArguments))\n", + " # args , training_args = parser.parse_args_into_dataclasses()\n", + " \n", + " #load tokenizer\n", + " if args.tokenizer_path is not None:\n", + " tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)\n", + " else:\n", + " tokenizer= AutoTokenizer.from_pretrained(args.model_name_path)\n", + " tokenizer.add_tokens([''])\n", + " # tokenizer.sep_token = \"\"\n", + " #save tokenizer\n", + " tok_path= training_args.output_dir+\"/kp_{}_tokenizer\".format(args.model_name_path )\n", + " if not os.path.exists(tok_path):\n", + " os.mkdir(tok_path)\n", + " tokenizer.save_pretrained(tok_path)\n", + "\n", + " \n", + " #load model\n", + " if args.model_type == \"enc_dec\":\n", + " model =None\n", + " else:\n", + " if args.from_pretrained:\n", + " model = AutoModelForSeq2SeqLM.from_pretrained(\n", + " args.model_name_path\n", + " )\n", + " else:\n", + " config= AutoConfig.from_pretrained(args.model_name_path) #get the config file to load weight from scratch\n", + " model= AutoModelForSeq2SeqLM.from_config(config) #load model with random weight from config\n", + "\n", + " #resize model embedding\n", + " model.resize_token_embeddings(len(tokenizer))\n", + "\n", + " #freeze model embedding\n", + "\n", + " #datset class\n", + " \n", + " train_data_set= DATASET_DICT[args.kp_task_type+\"_\"+args.dataset_class](tokenizer= tokenizer, file_path= args.data_dir + \"/train.txt\", max_src_len= args.max_src_len, max_tar_len = args.max_tar_len)\n", + "\n", + " eval_data_set= DATASET_DICT[args.kp_task_type+\"_\"+args.dataset_class](tokenizer= tokenizer, file_path= args.data_dir+\"/val.txt\", 
max_src_len= args.max_src_len, max_tar_len = args.max_tar_len)\n", + " \n", + " # print(train_data_set)\n", + "\n", + " #data collator\n", + " data_collator= COLLATE_DICT[args.model_type](tokenizer= tokenizer, need_to_shift= True)\n", + "\n", + " trainer= Trainer(model= model,\n", + " args= training_args,\n", + " data_collator= data_collator,\n", + " train_dataset = train_data_set,\n", + " eval_dataset= eval_data_set,\n", + " # compute_metrics= None, # metrics to compute scores,\n", + "\n", + "\n", + " )\n", + " \n", + " if args.predict_only:\n", + " test_data_set = DATASET_DICT[args.kp_task_type+\"_\"+args.dataset_class](tokenizer= tokenizer, file_path= args.data_dir+\"/test.txt\", max_src_len= args.max_src_len, max_tar_len = args.max_tar_len)\n", + " \n", + " \n", + " trainer.train()\n", + "\n", + "\n", + " \n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SognzQU6aXY7" + }, + "source": [ + "def runner():\n", + " args= BasicKPArgs(\n", + " model_type = 't5',\n", + " model_name_path = 't5-base', #todo\n", + " data_dir= \"/content\", #todo\n", + " kp_task_type= \"one2many\",\n", + " dataset_class= 'single'\n", + " )\n", + " training_args = TrainingArguments(\n", + " output_dir= \"/content/tk_out\", #todo\n", + " overwrite_output_dir = True,\n", + " num_train_epochs = 2,\n", + " per_device_train_batch_size = 8,\n", + " do_eval= True,\n", + " evaluation_strategy = \"epoch\",\n", + " save_steps = 1\n", + " \n", + " \n", + "\n", + " )\n", + " main_fn(args, training_args)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/", + "height": 290 + }, + "id": "EMXA4Yi6aXbV", + "outputId": "f091bbbd-d403-48a9-8089-ce746dd66cb3" + }, + "source": [ + "runner()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n", + "- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Using custom data configuration default\n", + "Reusing dataset json (/root/.cache/huggingface/datasets/json/default-0a0c845d87888c0a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)\n", + "/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils_base.py:2179: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
512 for Bert).\n", + " FutureWarning,\n", + "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-0a0c845d87888c0a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-463e16185536cbff.arrow\n", + "Using custom data configuration default\n", + "Reusing dataset json (/root/.cache/huggingface/datasets/json/default-ea1a5f71c165584a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)\n", + "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-ea1a5f71c165584a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-cea35ead7b156669.arrow\n" + ], + "name": "stderr" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " \n", + " [2/6 : < :, Epoch 0.33/2]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation Loss

" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "error", + "ename": "RuntimeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/serialization.py\u001b[0m in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization)\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_open_zipfile_writer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mopened_zipfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 372\u001b[0;31m \u001b[0m_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopened_zipfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_protocol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 373\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/serialization.py\u001b[0m in \u001b[0;36m_save\u001b[0;34m(obj, zip_file, pickle_module, pickle_protocol)\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0mnum_bytes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0melement_size\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0mzip_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_record\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_ptr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_bytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrunner\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mrunner\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m )\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mmain_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mmain_fn\u001b[0;34m(args, training_args)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m 
\u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, model_path, trial)\u001b[0m\n\u001b[1;32m 836\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_step_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 837\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 838\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_log_save_evaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtr_loss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepoch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 839\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 840\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_epoch_stop\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_training_stop\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_maybe_log_save_evaluate\u001b[0;34m(self, tr_loss, model, trial, epoch)\u001b[0m\n\u001b[1;32m 908\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 909\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshould_save\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 910\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_save_checkpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetrics\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 911\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 912\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_save_checkpoint\u001b[0;34m(self, model, trial, metrics)\u001b[0m\n\u001b[1;32m 941\u001b[0m \u001b[0mreissue_pt_warnings\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcaught_warnings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 942\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_world_process_zero\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 943\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"optimizer.pt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 944\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mwarnings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcatch_warnings\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcaught_warnings\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 945\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlr_scheduler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"scheduler.pt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/serialization.py\u001b[0m in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization)\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_open_zipfile_writer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mopened_zipfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 372\u001b[0m \u001b[0m_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopened_zipfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_protocol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 373\u001b[0;31m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 374\u001b[0m \u001b[0m_legacy_save\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopened_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpickle_protocol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
375\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/serialization.py\u001b[0m in \u001b[0;36m__exit__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__exit__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 259\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_like\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_end_of_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 260\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:274] . unexpected pos 1606179904 vs 1606179792" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5CaaHimjIfOR" + }, + "source": [ + "# 3 eval_kp.py\n", + "import os, sys\n", + "# from utils import arg_parse\n", + "from transformers import (\n", + " AutoTokenizer,\n", + " EncoderDecoderModel,\n", + " BartTokenizerFast,\n", + " AutoModelForSeq2SeqLM,\n", + " AutoConfig,\n", + " Trainer,\n", + " TrainingArguments,\n", + " HfArgumentParser\n", + ")\n", + "\n", + "import torch\n", + "\n", + "\n", + "@dataclass\n", + "class EvalArgs:\n", + " model_type : Optional[str] = field(\n", + " default=\"enc_dec\",\n", + " metadata= {\"help\": \"encoder decoder type or other generative model like Bart\"}\n", + " )\n", + "\n", + " model_name_path : Optional[str] = field(\n", + " default= None,\n", + " metadata= {\"help\": \"path or name to load pretrained model or from checkpoints\"}\n", + " )\n", + " tokenizer_path : Optional[str] = field(\n", + " default= None,\n", + " metadata= {\"help\": \"path or name of custom tokenizer saved if provided this tokenizer will be loaded else auto tokenizer\"}\n", + " )\n", + " data_dir : Optional[str] = field(\n", + " default= \"\",\n", + " metadata= {\"help\": \"path to dir containg data\"}\n", + " )\n", + " kp_task_type : Optional[str] = field(\n", + " default= \"one2one\",\n", + " metadata= {\"help\": \"wether to use one2one or one2many\"}\n", + " )\n", + " dataset_class : Optional[str] = field(\n", + " default= \"single\",\n", + " metadata= {\"help\": \"single | multiple , type of dataset reader to use, split train data into mltiple train file or from single\" }\n", + " )\n", + " beam_size : Optional[int] = field(\n", + " \n", + " default= 4,\n", + " metadata= {\"help\": \"beam_size\" }\n", + " )\n", + " max_pre_len : Optional[int] = field(\n", + " \n", + " default= 64,\n", + " metadata= {\"help\": \"length of target seq\" }\n", + " )\n", + " max_src_len : Optional[int] = field(\n", + " \n", + " default= 512,\n", + " metadata= {\"help\": \"length of source seq\" }\n", + " )\n", + "\n", + "COLLATE_DICT= {\n", + " 't5': TPBDataCollator\n", + "\n", + "\n", + "}\n", + "def main_eval(args= None):\n", + " # p = HfArgumentParser((EvalArgs,))\n", + " # args= p.parse_args_into_dataclasses()[0]\n", + " \n", + "\n", + " device= 'cuda' if 
torch.cuda.is_available() else 'cpu'\n",
+     "    print(\"device \", device)\n",
+     "    if args.tokenizer_path is not None:\n",
+     "        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path )\n",
+     "    else:\n",
+     "        tokenizer= AutoTokenizer.from_pretrained(args.model_name_path)\n",
+     "        tokenizer.add_tokens([''])\n",
+     "\n",
+     "    if args.model_type == \"enc_dec\":\n",
+     "        model = None\n",
+     "    else:\n",
+     "        model = AutoModelForSeq2SeqLM.from_pretrained(\n",
+     "            args.model_name_path\n",
+     "        )\n",
+     "    data_collator= COLLATE_DICT[args.model_type](tokenizer= tokenizer, need_to_shift= True)\n",
+     "    test_data_set = DATASET_DICT[args.kp_task_type+\"_\"+args.dataset_class](tokenizer= tokenizer, file_path= args.data_dir+\"/test.txt\", max_src_len= args.max_src_len, max_tar_len = args.max_pre_len)\n",
+     "\n",
+     "    data_loader= torch.utils.data.DataLoader(test_data_set, batch_size= 16, collate_fn= data_collator)\n",
+     "\n",
+     "    model.to(device)\n",
+     "    model.eval()\n",
+     "    out_writer= open(args.data_dir+\"prediction.txt\", 'w')\n",
+     "\n",
+     "    with torch.no_grad():\n",
+     "        for ex in data_loader:\n",
+     "            generated= model.generate(\n",
+     "                input_ids= ex['input_ids'].to(device),\n",
+     "                attention_mask= ex['attention_mask'].to(device),\n",
+     "                num_beams= args.beam_size,\n",
+     "                max_length= args.max_pre_len\n",
+     "            )\n",
+     "\n",
+     "            pre= [tokenizer.decode(op, skip_special_tokens= True) for op in generated]\n",
+     "            for p in pre:\n",
+     "                out_writer.write(p+\"\\n\")\n",
+     "\n",
+     "    print(\"predictions written in dir {} as prediction.txt\".format(args.data_dir))\n",
+     "\n",
+     "    out_writer.close()\n",
+     "\n"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "BP6VdOBZIfRi"
+    },
+    "source": [
+     "args= EvalArgs(\n",
+     "    model_type= 't5',\n",
+     "    model_name_path= \"/content/tk_out/checkpoint-6\",\n",
+     "    tokenizer_path= \"/content/tk_out/kp_t5-base_tokenizer/\",\n",
+     "    data_dir= \"/content/\", #todo\n",
+     "    kp_task_type= \"one2many\",\n",
+     "    dataset_class= 'single',\n",
+     "    beam_size= 4,\n",
+     "    max_pre_len = 64\n",
+     ")\n",
+     "main_eval(args)"
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "iU7vhi4nbnmx"
+    },
+    "source": [
+     ""
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "joA6Zo0SIfUf"
+    },
+    "source": [
+     ""
+    ],
+    "execution_count": null,
+    "outputs": []
+   },
+   {
+    "cell_type": "code",
+    "metadata": {
+     "id": "iVByVZnaaXdl"
+    },
+    "source": [
+     ""
+    ],
+    "execution_count": null,
+    "outputs": []
+   }
+  ]
+}
\ No newline at end of file

From 000b0375984252f09517a10b1ca13b7fce701b28 Mon Sep 17 00:00:00 2001
From: Amardeep 
Date: Sun, 23 Jan 2022 21:17:37 +0530
Subject: [PATCH 03/17] beautify

---
 dlkp/models/ke/crf/crf.py                     |  24 +-
 dlkp/models/ke/crf/crf_trainer.py             | 140 ++--
 dlkp/models/ke/crf/crf_utils.py               |  29 +-
 dlkp/models/ke/kpe.py                         | 631 ++++++++++++++++++
 dlkp/models/ke/transformer/crf_models.py      | 152 +++++
 .../token_classification_models.py            |  94 +++
 notebooks/tranKP.ipynb                        | 189 ++----
 7 files changed, 1060 insertions(+), 199 deletions(-)

diff --git a/dlkp/models/ke/crf/crf.py b/dlkp/models/ke/crf/crf.py
index 8d5bd30..2ab2181 100644
--- a/dlkp/models/ke/crf/crf.py
+++ b/dlkp/models/ke/crf/crf.py
@@ -5,7 +5,8 @@
 
 import torch
 
-VITERBI_DECODING = Tuple[List[int], float] 
+VITERBI_DECODING = Tuple[List[int], float]
+
 
 class ConditionalRandomField(torch.nn.Module):
     """
@@ -24,7 +25,6 @@ class 
ConditionalRandomField(torch.nn.Module): Whether to include the start and end transition parameters. """ - def __init__( self, num_tags: int, @@ -64,7 +64,9 @@ def reset_parameters(self): torch.nn.init.normal_(self.start_transitions) torch.nn.init.normal_(self.end_transitions) - def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: + def _input_likelihood( + self, logits: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: """ Computes the (batch_size,) denominator term for the log-likelihood, which is the sum of the likelihoods across all possible state sequences. @@ -161,7 +163,9 @@ def _joint_likelihood( # Add the last input if it's not masked. last_inputs = logits[-1] # (batch_size, num_tags) - last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1) + last_input_score = last_inputs.gather( + 1, last_tags.view(-1, 1) + ) # (batch_size, 1) last_input_score = last_input_score.squeeze() # (batch_size,) score = score + last_transition_score + last_input_score * mask[-1] @@ -181,7 +185,7 @@ def forward( # The code below fails in weird ways if this isn't a bool tensor, so we make sure. mask = mask.to(torch.bool) # print("forward",inputs.shape, tags.shape, mask.shape) - + log_denominator = self._input_likelihood(inputs, mask) # temp_tags= tags # tags[tags==-100]=2 @@ -235,9 +239,13 @@ def viterbi_tags( ].data + -10000.0 * ( 1 - self._constraint_mask[start_tag, :num_tags].detach() ) - transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[ + transitions[ :num_tags, end_tag - ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) + ] = self.end_transitions.detach() * self._constraint_mask[ + :num_tags, end_tag + ].data + -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) else: transitions[start_tag, :num_tags] = -10000.0 * ( 1 - self._constraint_mask[start_tag, :num_tags].detach() @@ -280,4 +288,4 @@ def viterbi_tags( if flatten_output: return [top_k_paths[0] for top_k_paths in best_paths] - return best_paths \ No newline at end of file + return best_paths diff --git a/dlkp/models/ke/crf/crf_trainer.py b/dlkp/models/ke/crf/crf_trainer.py index d9b22df..5db3178 100644 --- a/dlkp/models/ke/crf/crf_trainer.py +++ b/dlkp/models/ke/crf/crf_trainer.py @@ -1,17 +1,16 @@ from transformers import ( - Trainer, set_seed, - ) -from transformers.trainer import * +from transformers.trainer import * from transformers.trainer_utils import PredictionOutput from torch import nn from torch.utils.data.dataloader import DataLoader + # from torch.utils.data.dataset import Dataset # from typing import Any, Callable, Dict, List, Optional, Tuple, Union class CRF_Trainer(Trainer): - def prediction_loop( + def prediction_loop( self, dataloader: DataLoader, description: str, @@ -27,13 +26,17 @@ def prediction_loop( if not isinstance(dataloader.dataset, collections.abc.Sized): raise ValueError("dataset must implement __len__") prediction_loss_only = ( - prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only + prediction_loss_only + if prediction_loss_only is not None + else self.args.prediction_loss_only ) if self.args.deepspeed and not self.args.do_train: # no harm, but flagging to the user that deepspeed config is ignored for eval # flagging only for when --do_train wasn't passed as only then it's redundant - logger.info("Detected the deepspeed argument but it will not be used for evaluation") + logger.info( + "Detected the deepspeed 
argument but it will not be used for evaluation" + ) model = self._wrap_model(self.model, training=False) @@ -53,65 +56,98 @@ def prediction_loop( world_size = max(1, self.args.world_size) - eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) + eval_losses_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=batch_size + ) if not prediction_loss_only: # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass # a batch size to the sampler) make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): + if hasattr(dataloader, "sampler") and isinstance( + dataloader.sampler, SequentialDistributedSampler + ): make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) - labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + preds_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=make_multiple_of + ) + labels_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=make_multiple_of + ) if self.args.past_index >= 0: self._past = None model.eval() - - if is_torch_tpu_available(): - dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) - + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader( + dataloader, [self.args.device] + ).per_device_loader(self.args.device) self.callback_handler.eval_dataloader = dataloader - + for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - - best_path= self.eval_step(model, logits, inputs['attention_mask']) + loss, logits, labels = self.prediction_step( + model, inputs, prediction_loss_only, ignore_keys=ignore_keys + ) + + best_path = self.eval_step(model, logits, inputs["attention_mask"]) # best_path= self.eval_step(model, logits) # print(len(best_path), best_path[0]) # logits= torch.zeros() - best_path= [x for x,_ in best_path] + best_path = [x for x, _ in best_path] # print(best_path) # seq_len= labels.shape[1] - logits*=0 - for i,path in enumerate(best_path): - # print(inputs['attention_mask'][i,0], labels[i,0], inputs['attention_mask'][i,-1], labels[i,-1]) - # print(len(x)) - for j, tag in enumerate(path): - logits[i,j,int(tag)]=1 - # print(inputs['attention_mask'][i,j], labels[i,j]) - + logits *= 0 + for i, path in enumerate(best_path): + # print(inputs['attention_mask'][i,0], labels[i,0], inputs['attention_mask'][i,-1], labels[i,-1]) + # print(len(x)) + for j, tag in enumerate(path): + logits[i, j, int(tag)] = 1 + # print(inputs['attention_mask'][i,j], labels[i,j]) + # logits= torch.tensor(data=best_path, dtype= labels.dtype, device= labels.device) # if(logits.shape!=labels.shape): # print(logits.shape,labels.shape) # assert logits.shape==labels.shape if loss is not None: losses = loss.repeat(batch_size) - losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) + losses_host = ( + losses + if losses_host is None + else torch.cat((losses_host, losses), dim=0) + ) if logits is not None: - preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + preds_host = ( + logits + if preds_host is None + else nested_concat(preds_host, logits, 
padding_index=-100) + ) if labels is not None: - labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) + labels_host = ( + labels + if labels_host is None + else nested_concat(labels_host, labels, padding_index=-100) + ) + self.control = self.callback_handler.on_prediction_step( + self.args, self.state, self.control + ) # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + if ( + self.args.eval_accumulation_steps is not None + and (step + 1) % self.args.eval_accumulation_steps == 0 + ): + eval_losses_gatherer.add_arrays( + self._gather_and_numpify(losses_host, "eval_losses") + ) if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + preds_gatherer.add_arrays( + self._gather_and_numpify(preds_host, "eval_preds") + ) + labels_gatherer.add_arrays( + self._gather_and_numpify(labels_host, "eval_label_ids") + ) # Set back to None to begin a new accumulation losses_host, preds_host, labels_host = None, None, None @@ -121,17 +157,29 @@ def prediction_loop( delattr(self, "_past") # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + eval_losses_gatherer.add_arrays( + self._gather_and_numpify(losses_host, "eval_losses") + ) if not prediction_loss_only: - preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) - labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + preds_gatherer.add_arrays( + self._gather_and_numpify(preds_host, "eval_preds") + ) + labels_gatherer.add_arrays( + self._gather_and_numpify(labels_host, "eval_label_ids") + ) eval_loss = eval_losses_gatherer.finalize() preds = preds_gatherer.finalize() if not prediction_loss_only else None label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - if self.compute_metrics is not None and preds is not None and label_ids is not None: - metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + if ( + self.compute_metrics is not None + and preds is not None + and label_ids is not None + ): + metrics = self.compute_metrics( + EvalPrediction(predictions=preds, label_ids=label_ids) + ) else: metrics = {} @@ -147,20 +195,14 @@ def prediction_loop( metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) - def eval_step(self, - model: nn.Module, - logits, - mask= None, - top_k= None - ): + def eval_step(self, model: nn.Module, logits, mask=None, top_k=None): with torch.no_grad(): - output= model.crf.viterbi_tags(logits, mask, top_k) + output = model.crf.viterbi_tags(logits, mask, top_k) return output - - def compute_loss(self, model, inputs, return_outputs=False): + def compute_loss(self, model, inputs, return_outputs=False): """ How the loss is computed by Trainer. By default, all models return the loss in the first element. 
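
(A note on the decode trick in prediction_loop above, illustrated under hypothetical
shapes, not part of the patch itself: the Viterbi paths are written back into the
`logits` tensor as one-hot rows, so the standard argmax-based metrics path recovers
the CRF decoding unchanged.)

    # logits: (batch, seq_len, num_tags); best_path: one list of tag ids per example
    logits *= 0
    for i, path in enumerate(best_path):
        for j, tag in enumerate(path):
            logits[i, j, int(tag)] = 1
    pred_tags = logits.argmax(-1)  # equals the Viterbi path at each decoded position
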
diff --git a/dlkp/models/ke/crf/crf_utils.py b/dlkp/models/ke/crf/crf_utils.py
index 295aeef..1d1c240 100644
--- a/dlkp/models/ke/crf/crf_utils.py
+++ b/dlkp/models/ke/crf/crf_utils.py
@@ -11,7 +11,9 @@
 VITERBI_DECODING = Tuple[List[int], float]  # a list of tags, and a viterbi score
 
 
-def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]:
+def allowed_transitions(
+    constraint_type: str, labels: Dict[int, str]
+) -> List[Tuple[int, int]]:
     """
     Given labels and a constraint type, returns the allowed transitions. It will
     additionally include transitions for the start and end states, which are used
@@ -30,7 +32,10 @@ def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tu
     num_labels = len(labels)
     start_tag = num_labels
     end_tag = num_labels + 1
-    labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")]
+    labels_with_boundaries = list(labels.items()) + [
+        (start_tag, "START"),
+        (end_tag, "END"),
+    ]
 
     allowed = []
     for from_label_index, from_label in labels_with_boundaries:
@@ -47,7 +52,9 @@ def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tu
             else:
                 to_tag = to_label[0]
                 to_entity = to_label[1:]
-            if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity):
+            if is_transition_allowed(
+                constraint_type, from_tag, from_entity, to_tag, to_entity
+            ):
                 allowed.append((from_label_index, to_label_index))
     return allowed
 
@@ -97,7 +104,9 @@ def is_transition_allowed(
                 from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"),
                 # B-x can only transition to I-x or L-x
                 # I-x can only transition to I-x or L-x
-                from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity,
+                from_tag in ("B", "I")
+                and to_tag in ("I", "L")
+                and from_entity == to_entity,
             ]
         )
     elif constraint_type == "BIO":
@@ -148,7 +157,9 @@ def is_transition_allowed(
         print("error in constraint type")
 
 
-def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> torch.Tensor:
+def logsumexp(
+    tensor: torch.Tensor, dim: int = -1, keepdim: bool = False
+) -> torch.Tensor:
     """
     A numerically stable computation of logsumexp. This is mathematically equivalent to
     `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log
@@ -169,8 +180,6 @@ def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> tor
     return max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log()
 
 
-
-
 def viterbi_decode(
     tag_sequence: torch.Tensor,
     transition_matrix: torch.Tensor,
@@ -223,7 +232,9 @@ def viterbi_decode(
     elif top_k >= 1:
         flatten_output = False
     else:
-        raise ValueError(f"top_k must be either None or an integer >=1. Instead received {top_k}")
+        raise ValueError(
+            f"top_k must be either None or an integer >=1. Instead received {top_k}"
+        )
 
     sequence_length, num_tags = list(tag_sequence.size())
 
@@ -343,4 +354,4 @@ def viterbi_decode(
 
     if flatten_output:
         return viterbi_paths[0], viterbi_scores[0]
-    return viterbi_paths, viterbi_scores
\ No newline at end of file
+    return viterbi_paths, viterbi_scores
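
(A quick sanity-check sketch for the helper above, using a hypothetical BIO tag map;
pairs like these feed the CRF's transition-constraint mask:)

    idx2tag = {0: "O", 1: "B-KP", 2: "I-KP"}
    pairs = allowed_transitions("BIO", idx2tag)
    # pairs holds the permitted (from_tag, to_tag) index pairs, including the
    # START (num_labels) and END (num_labels + 1) pseudo-tags
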
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for token classification. +""" +# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as +# comments. + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric + +import transformers +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + DataCollatorForTokenClassification, + HfArgumentParser, + PreTrainedTokenizerFast, + Trainer, + TrainingArguments, + set_seed, + BertForTokenClassification, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process + + +logger = logging.getLogger(__name__) +# from models.long_doc_kp_models import LONG_DOC_KP_MODELS + +# KPE_MODELS_DICT={ +# 'others': AutoModelForTokenClassification, +# "longformer": +# 'reformer' : +# 'crf_longformer' : +# 'crf_bert': BERT_CRFforTokenClassification +# } + +CRF_MODEL_DICT = { + "bert": BERT_CRFforTokenClassification, + "longformer": Longformer_CRFforTokenClassification, +} +TOKEN_MODEL_DICT = { + "bert": BertForTokenClassification, + "longformer": LongformerForTokenClassification, + "reformer": ReformerForTokenClassification, +} + +MODEL_DICT = {"crf": CRF_MODEL_DICT, "simple": TOKEN_MODEL_DICT} + +# KPE_MODELS_DICT = KPE_MODELS_DICT | LONG_DOC_KP_MODELS + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_family_name: str = field( + metadata={ + "help": "name of the family of model, bert, longformer, reformer etc." + } + ) + model_name_or_path: str = field( + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models" + } + ) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Where do you want to store the pretrained models downloaded from huggingface.co" + }, + ) + model_revision: str = field( + default="main", + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)." + }, + ) + use_CRF: bool = field( + default=False, + metadata={"help": "wether to use CRF on top of the classifier"}, + ) + use_BiLSTM: bool = field( + default=False, + metadata={"help": "use BiLSTM in sequence classification"}, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + + task_name: Optional[str] = field( + default="simple", metadata={"help": "The name of the task simple, crf"} + ) + + train_file: Optional[str] = field( + default=None, + metadata={"help": "The input training data file (a csv or JSON file)."}, + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to predict on (a csv or JSON file)." + }, + ) + overwrite_cache: bool = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + label_all_tokens: bool = field( + default=False, + metadata={ + "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " + "one (in which case the other tokens will have a padding index)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, + metadata={ + "help": "Whether to return all the entity levels during evaluation or just the overall ones." + }, + ) + dataset_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, + ) + dataset_config_name: Optional[str] = field( + default=None, + metadata={ + "help": "The configuration name of the dataset to use (via the datasets library)." + }, + ) + cache_file_name: Optional[str] = field( + default=None, + metadata={ + "help": "Provide the name of a path for the cache file. It is used to store the results of the computation instead of the automatically generated cache file name." + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + ): + raise ValueError( + "Need either a dataset name or a training/validation file." + ) + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in [ + "csv", + "json", + ], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in [ + "csv", + "json", + ], "`validation_file` should be a csv or a json file." + self.task_name = self.task_name.lower() + + +# def main(): +TRAINER_DICT = { + "crf": CRF_Trainer, + "simple": Trainer, +} + + +def main_run_kpe(model_args, data_args, training_args): + + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + # parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + # if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # # If we pass only one argument to the script and it's the path to a json file, + # # let's parse it to get our arguments. 
+ # model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + # else: + # model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Detecting last checkpoint. + last_checkpoint = None + if ( + os.path.isdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel( + logging.INFO if is_main_process(training_args.local_rank) else logging.WARN + ) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + logger.info("Training/evaluation parameters %s", training_args) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + ## get dataset in here + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.train_file.split(".")[-1] + datasets = load_dataset( + extension, data_files=data_files + ) ##CR get dataset in here + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. 
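+    # Illustration (not part of the training flow): a record in the CSV/JSON
+    # files above is expected to provide pre-tokenized text plus word-level
+    # BIO tags, e.g. one JSON line like
+    #   {"document": ["A", "keyphrase", "extraction", "corpus"],
+    #    "BIO_tags": ["O", "B", "I", "O"]}
+    # The field names here are assumptions; the code below falls back to the
+    # first column for text and looks for a "BIO_tags" column for labels.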
+ + if training_args.do_train: + column_names = datasets["train"].column_names + features = datasets["train"].features + else: + column_names = datasets["validation"].column_names + features = datasets["validation"].features + text_column_name = "text" if "text" in column_names else column_names[0] + label_column_name = "BIO_tags" if "BIO_tags" in column_names else column_names[1] + + # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the + # unique labels. + def get_label_list(labels): + unique_labels = set() + for label in labels: + unique_labels = unique_labels | set(label) + label_list = list(unique_labels) + label_list.sort() + return label_list + + if isinstance(features[label_column_name].feature, ClassLabel): + label_list = features[label_column_name].feature.names + # No need to convert the labels since they are already ints. + label_to_id = {i: i for i in range(len(label_list))} + else: + label_list = get_label_list(datasets["train"][label_column_name]) + label_to_id = {l: i for i, l in enumerate(label_list)} + num_labels = len(label_list) + print(label_to_id) + id2tag = {} + for k in label_to_id.keys(): + id2tag[label_to_id[k]] = k + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config = AutoConfig.from_pretrained( + model_args.config_name + if model_args.config_name + else model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=model_args.cache_dir, + ) + config.use_CRF = model_args.use_CRF ##CR replace from arguments + config.use_BiLSTM = False + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name + if model_args.tokenizer_name + else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + add_prefix_space=True, + ) + model = MODEL_DICT[data_args.task_name][ + model_args.model_family_name + ].from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=model_args.cache_dir, + ) + model.freeze_encoder_layer() + print("model") + print(model) + if tokenizer.pad_token is None: + + tokenizer.pad_token = tokenizer.eos_token + config.pad_token_id = config.eos_token_id + + # Tokenizer check: this script requires a fast tokenizer. + # if not isinstance(tokenizer, PreTrainedTokenizerFast): + # raise ValueError( + # "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + # "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " + # "requirement" + # ) + + # Preprocessing the dataset + # Padding strategy + padding = "max_length" if data_args.pad_to_max_length else False + + # Tokenize all texts and align the labels with them. + def tokenize_and_align_labels(examples): + tokenized_inputs = tokenizer( + examples[text_column_name], + padding=padding, + truncation=True, + # We use this argument because the texts in our dataset are lists of words (with a label for each word). + is_split_into_words=True, + ) + labels = [] + for i, label in enumerate(examples[label_column_name]): + word_ids = tokenized_inputs.word_ids(batch_index=i) + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + # Special tokens have a word id that is None. We set the label to -100 so they are automatically + # ignored in the loss function. 
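+                # Illustrative example: tokenizing the words
+                # ["deep", "keyphrase"] with a WordPiece tokenizer might give
+                # word_ids of [None, 0, 1, 1, None] -- special tokens map to
+                # None and both sub-tokens of "keyphrase" map to word 1, so
+                # only its first sub-token keeps the true label below unless
+                # label_all_tokens is set.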
+ if word_idx is None: + # label_ids.append(-100) + label_ids.append( + 2 + ) # to avoid error change -100 to 'O' tag i.e. 2 class + # We set the label for the first token of each word. + elif word_idx != previous_word_idx: + label_ids.append(label_to_id[label[word_idx]]) + # For the other tokens in a word, we set the label to either the current label or -100, depending on + # the label_all_tokens flag. + else: + label_ids.append( + label_to_id[label[word_idx]] + if data_args.label_all_tokens + else -100 + ) + # to avoid error change -100 to 'O' tag i.e. 2 class + # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2) + previous_word_idx = word_idx + + labels.append(label_ids) + tokenized_inputs["labels"] = labels + return tokenized_inputs + + tokenized_datasets = datasets.map( + tokenize_and_align_labels, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + # cache_file_name= data_args.cache_file_name + ) + + # Data collator + data_collator = DataCollatorForTokenClassification( + tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None + ) + + from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score + + def compute_metrics(p): + predictions, labels = p + # print(predictions.shape, labels.shape) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + results = {} + results["overall_precision"] = precision_score(true_labels, true_predictions) + results["overall_recall"] = recall_score(true_labels, true_predictions) + results["overall_f1"] = f1_score(true_labels, true_predictions) + results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) + if data_args.return_entity_level_metrics: + # Unpack nested dictionaries + final_results = {} + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f"{key}_{n}"] = v + else: + final_results[key] = value + return final_results + else: + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + + # Initialize our Trainer + + trainer = TRAINER_DICT[data_args.task_name]( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"] if training_args.do_train else None, + eval_dataset=tokenized_datasets["validation"] + if training_args.do_eval + else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + # Training + if training_args.do_train: + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif os.path.isdir(model_args.model_name_or_path): + checkpoint = model_args.model_name_or_path + else: + checkpoint = None + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + output_train_file = os.path.join(training_args.output_dir, "train_results.txt") + if trainer.is_world_process_zero(): + with open(output_train_file, "w") as writer: + 
logger.info("***** Train results *****") + for key, value in sorted(train_result.metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Need to save the state, since Trainer.save_model saves only the tokenizer with the model + trainer.state.save_to_json( + os.path.join(training_args.output_dir, "trainer_state.json") + ) + + # Evaluation + results = {} + if training_args.do_eval: + + logger.info("*** Evaluate ***") + + results = trainer.evaluate() + + output_eval_file = os.path.join( + training_args.output_dir, "eval_results_KPE.txt" + ) + if trainer.is_world_process_zero(): + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key, value in results.items(): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + def get_kp_from_BIO(examples, prediction): + kps = [] + for i in range(len(prediction)): + ids = examples["input_ids"][i] + + tags = prediction[i] + # print(tags) + current_kps = [] + ckp = [] + for i, tag in enumerate(tags): + id = ids[i] + + if tag == "O" and len(ckp) > 0: + + current_kps.append(ckp) + ckp = [] + elif tag == "B": + # print(ckp, tag) + if tokenizer.convert_ids_to_tokens(id).startswith("##"): + ckp.append(id) + else: + if len(ckp) > 0: + current_kps.append(ckp) + ckp = [] + + ckp.append(id) + # print(ckp, id) + + elif tag == "I" and len(ckp) > 0: + ckp.append(id) + decoded_kps = [] + if len(ckp) > 0: + current_kps.append(ckp) + if len(current_kps) > 0: + decoded_kps = tokenizer.batch_decode( + current_kps, + skip_special_tokens=True, + clean_up_tokenization_spaces=True, + ) + # print(decoded_kps) + kps.append(decoded_kps) + + # examples['predicted_kp']= kps + return kps + + # Predict + if training_args.do_predict: + logger.info("*** Predict ***") + + test_dataset = tokenized_datasets["test"] + predictions, labels, metrics = trainer.predict(test_dataset) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + output_test_results_file = os.path.join( + training_args.output_dir, "test_results.txt" + ) + if trainer.is_world_process_zero(): + with open(output_test_results_file, "w") as writer: + for key, value in sorted(metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Save predictions + output_test_predictions_file = os.path.join( + training_args.output_dir, "test_predictions.txt" + ) + if trainer.is_world_process_zero(): + # # test_dataset['predicted_tags']= true_predictions + # # test_dataset=test_dataset.map(get_kp_from_BIO,batched=True, + # num_proc=data_args.preprocessing_num_workers, + # load_from_cache_file=not data_args.overwrite_cache, + # ) + gen_kps = get_kp_from_BIO(test_dataset, true_predictions) + with open(output_test_predictions_file, "w") as writer: + for prediction in gen_kps: + writer.write(" ".join(prediction) + "\n") + + return results diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index e69de29..da26337 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -0,0 +1,152 @@ +# all token classification model with crf head +from transformers import ( + AutoModelForPreTraining, + AutoModel, + BertModel, + BertPreTrainedModel, + LongformerModel, + PreTrainedModel, +) +from 
transformers.modeling_outputs import TokenClassifierOutput
+import collections
+import torch
+from torch import nn
+from crf import ConditionalRandomField
+from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel
+
+
+class BERT_CRFforTokenClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # self.crf= nn.Linear(config.num_labels,1)
+        # self.crf= ConditionalRandomField(self.num_labels)
+        # Index 2 is the letter "O" (outside tag), matching the BIO scheme.
+        self.crf = ConditionalRandomField(
+            self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "O"}
+        )
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=None,
+        output_attentions=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.bert(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+        loss = None
+        if labels is not None:
+            # The CRF returns a log-likelihood; negate it to get a loss.
+            loss = -self.crf(logits, labels, attention_mask)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        # print(self.crf.transitions)
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def freeze_till_clf(self):
+        for param in self.bert.parameters():
+            param.requires_grad = False
+        for param in self.dropout.parameters():
+            param.requires_grad = False
+        for param in self.classifier.parameters():
+            param.requires_grad = False
+
+    def freeze_encoder_layer(self):
+        for param in self.bert.parameters():
+            param.requires_grad = False
+
+
+class Longformer_CRFforTokenClassification(LongformerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.longformer = LongformerModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # self.crf= nn.Linear(config.num_labels,1)
+        # ConditionalRandomField requires the label scheme; mirror the BERT
+        # variant above rather than passing num_labels alone.
+        self.crf = ConditionalRandomField(
+            self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "O"}
+        )
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=None,
+        output_attentions=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.longformer(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+        loss = None
+        if labels is not None:
+            loss = -self.crf(logits, labels, attention_mask)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+ loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def freeze_till_clf(self): + for param in self.longformer.parameters(): + param.requires_grad = False + for param in self.dropout.parameters(): + param.requires_grad = False + for param in self.classifier.parameters(): + param.requires_grad = False + + def freeze_encoder_layer(self): + for param in self.longformer.parameters(): + param.requires_grad = False diff --git a/dlkp/models/ke/transformer/token_classification_models.py b/dlkp/models/ke/transformer/token_classification_models.py index e69de29..d3b628c 100644 --- a/dlkp/models/ke/transformer/token_classification_models.py +++ b/dlkp/models/ke/transformer/token_classification_models.py @@ -0,0 +1,94 @@ +# all models with token classification only +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric + +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + AutoModel, + DataCollatorForTokenClassification, + HfArgumentParser, + PreTrainedTokenizerFast, + Trainer, + TrainingArguments, + set_seed, + LongformerForTokenClassification, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process + + +logger = logging.getLogger(__name__) + +from transformers.models.reformer.modeling_reformer import * + + +class ReformerForTokenClassification(ReformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.reformer = ReformerModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + num_hashes=None, + labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.reformer( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + num_hashes=num_hashes, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + # if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + +s diff --git a/notebooks/tranKP.ipynb b/notebooks/tranKP.ipynb index 7835b24..a189ed5 100644 --- a/notebooks/tranKP.ipynb +++ b/notebooks/tranKP.ipynb @@ -1,20 +1,8 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "tranKP.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, "cells": 
[ { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -22,90 +10,20 @@ "id": "HXDqBqrdaoNw", "outputId": "4e45f4d4-324f-44a0-f289-62479ccd56ef" }, + "outputs": [], "source": [ "!pip install transformers\n", "!pip install sentencepiece\n", "!pip install datasets" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collecting transformers\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)\n", - "\u001b[K |████████████████████████████████| 1.5MB 4.2MB/s \n", - "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.19.4)\n", - "Collecting sacremoses\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n", - "\u001b[K |████████████████████████████████| 890kB 17.1MB/s \n", - "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.8)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", - "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8)\n", - "Collecting tokenizers==0.9.4\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)\n", - "\u001b[K |████████████████████████████████| 2.9MB 21.5MB/s \n", - "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.0.0)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.12.5)\n", - "Building wheels for collected packages: sacremoses\n", - " Building wheel for sacremoses (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=5eee5bbfe2f9124d4f5d0c0332e4124d253f5989149682918253b6700d942717\n", - " Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n", - "Successfully built sacremoses\n", - "Installing collected packages: sacremoses, tokenizers, transformers\n", - "Successfully installed sacremoses-0.0.43 tokenizers-0.9.4 transformers-4.1.1\n", - "Collecting sentencepiece\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)\n", - "\u001b[K |████████████████████████████████| 1.1MB 5.9MB/s \n", - "\u001b[?25hInstalling collected packages: sentencepiece\n", - "Successfully installed sentencepiece-0.1.94\n", - "Collecting datasets\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ee/78/5873ac1e27bf25a2cbf3447d6704edd3136b1b3ff0eb3bfab38a45d2a1ff/datasets-1.2.0-py3-none-any.whl (159kB)\n", - "\u001b[K |████████████████████████████████| 163kB 4.1MB/s \n", - "\u001b[?25hRequirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from datasets) (0.8)\n", - "Requirement already satisfied: tqdm<4.50.0,>=4.27 in /usr/local/lib/python3.6/dist-packages (from datasets) (4.41.1)\n", - "Collecting xxhash\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f7/73/826b19f3594756cb1c6c23d2fbd8ca6a77a9cd3b650c9dec5acc85004c38/xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242kB)\n", - "\u001b[K |████████████████████████████████| 245kB 6.1MB/s \n", - "\u001b[?25hCollecting pyarrow>=0.17.1\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d7/e1/27958a70848f8f7089bff8d6ebe42519daf01f976d28b481e1bfd52c8097/pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7MB)\n", - "\u001b[K |████████████████████████████████| 17.7MB 1.5MB/s \n", - "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from datasets) (0.3.3)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.6/dist-packages (from datasets) (1.19.4)\n", - "Requirement already satisfied: multiprocess in /usr/local/lib/python3.6/dist-packages (from datasets) (0.70.11.1)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.6/dist-packages (from datasets) (2.23.0)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from datasets) (1.1.5)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2020.12.5)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->datasets) (2018.9)\n", - "Requirement already satisfied: six>=1.5 in 
/usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", - "Installing collected packages: xxhash, pyarrow, datasets\n", - " Found existing installation: pyarrow 0.14.1\n", - " Uninstalling pyarrow-0.14.1:\n", - " Successfully uninstalled pyarrow-0.14.1\n", - "Successfully installed datasets-1.2.0 pyarrow-2.0.0 xxhash-2.0.0\n" - ], - "name": "stdout" - } ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "r3IcbMLKYCz7" }, + "outputs": [], "source": [ "# utils\n", "import os, sys\n", @@ -166,15 +84,15 @@ " default= \"single\",\n", " metadata= {\"help\": \"single | multiple , type of dataset reader to use, split train data into mltiple train file or from single\" }\n", " )" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "TgUthX_SaXRY" }, + "outputs": [], "source": [ "#datset\n", "import os, sys\n", @@ -275,15 +193,15 @@ "\n", "\n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "zL3jsUs2aXUI" }, + "outputs": [], "source": [ "#collate\n", "import os, sys\n", @@ -340,15 +258,15 @@ " prev_output_tokens[:, 0] = decoder_start_tokens\n", " return prev_output_tokens\n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "W8r6v-NJaXWs" }, + "outputs": [], "source": [ "#main\n", "import os, sys\n", @@ -482,15 +400,15 @@ "\n", " \n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "SognzQU6aXY7" }, + "outputs": [], "source": [ "def runner():\n", " args= BasicKPArgs(\n", @@ -513,12 +431,11 @@ "\n", " )\n", " main_fn(args, training_args)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true, @@ -528,12 +445,9 @@ "id": "EMXA4Yi6aXbV", "outputId": "f091bbbd-d403-48a9-8089-ce746dd66cb3" }, - "source": [ - "runner()" - ], - "execution_count": null, "outputs": [ { + "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n", @@ -547,11 +461,9 @@ "Using custom data configuration default\n", "Reusing dataset json (/root/.cache/huggingface/datasets/json/default-ea1a5f71c165584a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)\n", "Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-ea1a5f71c165584a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-cea35ead7b156669.arrow\n" - ], - "name": "stderr" + ] }, { - "output_type": "display_data", "data": { "text/html": [ "\n", @@ -587,12 +499,13 @@ }, "metadata": { "tags": [] - } + }, + "output_type": "display_data" }, { - "output_type": "error", "ename": "RuntimeError", "evalue": "ignored", + "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", @@ -612,13 +525,18 @@ "\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:274] . 
unexpected pos 1606179904 vs 1606179792" ] } + ], + "source": [ + "runner()" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "5CaaHimjIfOR" }, + "outputs": [], "source": [ "# 3 eval_kp.py\n", "import os, sys\n", @@ -737,15 +655,15 @@ "\n", "\n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "BP6VdOBZIfRi" }, + "outputs": [], "source": [ "args= EvalArgs(\n", " model_type= 't5',\n", @@ -760,42 +678,47 @@ "\n", " )\n", "main_eval(args)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "iU7vhi4nbnmx" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "joA6Zo0SIfUf" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "iVByVZnaaXdl" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "tranKP.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 41a723c79494274593f28d2e5c08951787c8f9a2 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Tue, 25 Jan 2022 00:22:25 +0530 Subject: [PATCH 04/17] add utils --- dlkp/models/ke/crf/__init__.py | 0 dlkp/models/ke/extraction_utils.py | 152 ++++++++++++++++++++++++++ dlkp/models/ke/kpe.py | 168 ++--------------------------- 3 files changed, 159 insertions(+), 161 deletions(-) create mode 100644 dlkp/models/ke/crf/__init__.py diff --git a/dlkp/models/ke/crf/__init__.py b/dlkp/models/ke/crf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py index e69de29..7f34719 100644 --- a/dlkp/models/ke/extraction_utils.py +++ b/dlkp/models/ke/extraction_utils.py @@ -0,0 +1,152 @@ +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_family_name: str = field( + metadata={ + "help": "name of the family of model, bert, longformer, reformer etc." + } + ) + model_name_or_path: str = field( + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models" + } + ) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Where do you want to store the pretrained models downloaded from huggingface.co" + }, + ) + model_revision: str = field( + default="main", + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)." 
+ }, + ) + use_CRF: bool = field( + default=False, + metadata={"help": "wether to use CRF on top of the classifier"}, + ) + use_BiLSTM: bool = field( + default=False, + metadata={"help": "use BiLSTM in sequence classification"}, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + task_name: Optional[str] = field( + default="simple", metadata={"help": "The name of the task simple, crf"} + ) + + train_file: Optional[str] = field( + default=None, + metadata={"help": "The input training data file (a csv or JSON file)."}, + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to predict on (a csv or JSON file)." + }, + ) + overwrite_cache: bool = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + label_all_tokens: bool = field( + default=False, + metadata={ + "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " + "one (in which case the other tokens will have a padding index)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, + metadata={ + "help": "Whether to return all the entity levels during evaluation or just the overall ones." + }, + ) + dataset_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, + ) + dataset_config_name: Optional[str] = field( + default=None, + metadata={ + "help": "The configuration name of the dataset to use (via the datasets library)." + }, + ) + cache_file_name: Optional[str] = field( + default=None, + metadata={ + "help": "Provide the name of a path for the cache file. It is used to store the results of the computation instead of the automatically generated cache file name." + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + ): + raise ValueError( + "Need either a dataset name or a training/validation file." + ) + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in [ + "csv", + "json", + ], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in [ + "csv", + "json", + ], "`validation_file` should be a csv or a json file." 
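+        # Lowercasing keeps the MODEL_DICT / TRAINER_DICT lookups in kpe.py
+        # case-insensitive (e.g. "CRF" and "crf" pick the same trainer).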
+ self.task_name = self.task_name.lower() diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 29e4287..7991062 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -45,181 +45,27 @@ ) from transformers.trainer_utils import get_last_checkpoint, is_main_process +from transformer.crf_models import BERT_CRFforTokenClassification +from transformer.token_classification_models import LongformerForTokenClassification +from crf.crf_trainer import CRF_Trainer +from extraction_utils import ModelArguments, DataTrainingArguments logger = logging.getLogger(__name__) -# from models.long_doc_kp_models import LONG_DOC_KP_MODELS -# KPE_MODELS_DICT={ -# 'others': AutoModelForTokenClassification, -# "longformer": -# 'reformer' : -# 'crf_longformer' : -# 'crf_bert': BERT_CRFforTokenClassification -# } CRF_MODEL_DICT = { "bert": BERT_CRFforTokenClassification, - "longformer": Longformer_CRFforTokenClassification, + # "longformer": Longformer_CRFforTokenClassification, } TOKEN_MODEL_DICT = { "bert": BertForTokenClassification, - "longformer": LongformerForTokenClassification, - "reformer": ReformerForTokenClassification, + # "longformer": LongformerForTokenClassification, + # "reformer": ReformerForTokenClassification, } MODEL_DICT = {"crf": CRF_MODEL_DICT, "simple": TOKEN_MODEL_DICT} -# KPE_MODELS_DICT = KPE_MODELS_DICT | LONG_DOC_KP_MODELS - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_family_name: str = field( - metadata={ - "help": "name of the family of model, bert, longformer, reformer etc." - } - ) - model_name_or_path: str = field( - metadata={ - "help": "Path to pretrained model or model identifier from huggingface.co/models" - } - ) - config_name: Optional[str] = field( - default=None, - metadata={ - "help": "Pretrained config name or path if not the same as model_name" - }, - ) - tokenizer_name: Optional[str] = field( - default=None, - metadata={ - "help": "Pretrained tokenizer name or path if not the same as model_name" - }, - ) - cache_dir: Optional[str] = field( - default=None, - metadata={ - "help": "Where do you want to store the pretrained models downloaded from huggingface.co" - }, - ) - model_revision: str = field( - default="main", - metadata={ - "help": "The specific model version to use (can be a branch name, tag name or commit id)." - }, - ) - use_CRF: bool = field( - default=False, - metadata={"help": "wether to use CRF on top of the classifier"}, - ) - use_BiLSTM: bool = field( - default=False, - metadata={"help": "use BiLSTM in sequence classification"}, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - task_name: Optional[str] = field( - default="simple", metadata={"help": "The name of the task simple, crf"} - ) - - train_file: Optional[str] = field( - default=None, - metadata={"help": "The input training data file (a csv or JSON file)."}, - ) - validation_file: Optional[str] = field( - default=None, - metadata={ - "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." - }, - ) - test_file: Optional[str] = field( - default=None, - metadata={ - "help": "An optional input test data file to predict on (a csv or JSON file)." 
- }, - ) - overwrite_cache: bool = field( - default=False, - metadata={"help": "Overwrite the cached training and evaluation sets"}, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to model maximum sentence length. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " - "efficient on GPU but very bad for TPU." - }, - ) - label_all_tokens: bool = field( - default=False, - metadata={ - "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " - "one (in which case the other tokens will have a padding index)." - }, - ) - return_entity_level_metrics: bool = field( - default=False, - metadata={ - "help": "Whether to return all the entity levels during evaluation or just the overall ones." - }, - ) - dataset_name: Optional[str] = field( - default=None, - metadata={"help": "The name of the dataset to use (via the datasets library)."}, - ) - dataset_config_name: Optional[str] = field( - default=None, - metadata={ - "help": "The configuration name of the dataset to use (via the datasets library)." - }, - ) - cache_file_name: Optional[str] = field( - default=None, - metadata={ - "help": "Provide the name of a path for the cache file. It is used to store the results of the computation instead of the automatically generated cache file name." - }, - ) - - def __post_init__(self): - if ( - self.dataset_name is None - and self.train_file is None - and self.validation_file is None - ): - raise ValueError( - "Need either a dataset name or a training/validation file." - ) - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in [ - "csv", - "json", - ], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in [ - "csv", - "json", - ], "`validation_file` should be a csv or a json file." - self.task_name = self.task_name.lower() - - -# def main(): TRAINER_DICT = { "crf": CRF_Trainer, "simple": Trainer, From 3bf5026462728e60544791973f894b177d47414c Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Wed, 26 Jan 2022 21:51:26 +0530 Subject: [PATCH 05/17] f formatting and re arch --- dlkp/kp_metrics/__init__.py | 0 dlkp/models/ke/kpe.py | 250 +- examples/dataset/hf_data_script.py | 80 +- examples/dataset/hf_data_script_long_docs.py | 116 +- examples/ke/ke_sequence_tagging.py | 156 +- ldkp_amardeep.py | 2919 ++++++++++++++++++ 6 files changed, 3300 insertions(+), 221 deletions(-) create mode 100644 dlkp/kp_metrics/__init__.py create mode 100644 ldkp_amardeep.py diff --git a/dlkp/kp_metrics/__init__.py b/dlkp/kp_metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 7991062..556048b 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -75,8 +75,7 @@ def main_run_kpe(model_args, data_args, training_args): # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. 
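+    # For reference, a typical way (an assumption, mirroring the commented-out
+    # parser below) to build the three argument objects this function expects:
+    #   parser = HfArgumentParser(
+    #       (ModelArguments, DataTrainingArguments, TrainingArguments)
+    #   )
+    #   model_args, data_args, training_args = parser.parse_args_into_dataclasses()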
+ # parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) # if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): @@ -112,8 +111,9 @@ def main_run_kpe(model_args, data_args, training_args): handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel( - logging.INFO if is_main_process(training_args.local_rank) else logging.WARN + logging.INFO if is_main_process(training_args.local_rank) else logging.INFO ) + # logger.set_global_logging_level(logging.INFO) # Log on each process the small summary: logger.warning( @@ -147,11 +147,13 @@ def main_run_kpe(model_args, data_args, training_args): data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file - extension = data_args.train_file.split(".")[-1] + extension = data_args.test_file.split(".")[-1] datasets = load_dataset( extension, data_files=data_files ) ##CR get dataset in here @@ -182,7 +184,11 @@ def get_label_list(labels): # No need to convert the labels since they are already ints. label_to_id = {i: i for i in range(len(label_list))} else: - label_list = get_label_list(datasets["train"][label_column_name]) + label_list = get_label_list( + datasets["train"][label_column_name] + if training_args.do_train + else datasets["validation"][label_column_name] + ) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) print(label_to_id) @@ -218,9 +224,9 @@ def get_label_list(labels): config=config, cache_dir=model_args.cache_dir, ) - model.freeze_encoder_layer() + # model.freeze_encoder_layer() print("model") - print(model) + # print(model) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -276,7 +282,12 @@ def tokenize_and_align_labels(examples): previous_word_idx = word_idx labels.append(label_ids) + if data_args.task_name == "guided": + tokenized_inputs["guide_embed"] = examples["guide_embed"] tokenized_inputs["labels"] = labels + # tokenized_inputs['paper_id']= examples['paper_id'] + # tokenized_inputs['extractive_keyphrases']= examples['extractive_keyphrases'] + return tokenized_inputs tokenized_datasets = datasets.map( @@ -293,6 +304,7 @@ def tokenize_and_align_labels(examples): ) from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score + from seqeval.scheme import IOB2, IOB1 def compute_metrics(p): predictions, labels = p @@ -312,13 +324,22 @@ def compute_metrics(p): # results = metric.compute(predictions=true_predictions, references=true_labels) results = {} - results["overall_precision"] = precision_score(true_labels, true_predictions) - results["overall_recall"] = recall_score(true_labels, true_predictions) - results["overall_f1"] = f1_score(true_labels, true_predictions) + # print("cal precisi") + results["overall_precision"] = precision_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_recall"] = recall_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + # print("cal f1") + results["overall_f1"] = f1_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) if data_args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} + # print("cal 
entity level mat") for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): @@ -335,6 +356,40 @@ def compute_metrics(p): } # Initialize our Trainer + # metric = load_metric("seqeval") + + # def compute_metrics(p): + # predictions, labels = p + # predictions = np.argmax(predictions, axis=2) + + # # Remove ignored index (special tokens) + # true_predictions = [ + # [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + # for prediction, label in zip(predictions, labels) + # ] + # true_labels = [ + # [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + # for prediction, label in zip(predictions, labels) + # ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + # if data_args.return_entity_level_metrics: + # # Unpack nested dictionaries + # final_results = {} + # for key, value in results.items(): + # if isinstance(value, dict): + # for n, v in value.items(): + # final_results[f"{key}_{n}"] = v + # else: + # final_results[key] = value + # return final_results + # else: + # return { + # "precision": results["overall_precision"], + # "recall": results["overall_recall"], + # "f1": results["overall_f1"], + # "accuracy": results["overall_accuracy"], + # } trainer = TRAINER_DICT[data_args.task_name]( model=model, @@ -374,66 +429,19 @@ def compute_metrics(p): # Evaluation results = {} - if training_args.do_eval: + # if training_args.do_eval: - logger.info("*** Evaluate ***") + # logger.info("*** Evaluate ***") - results = trainer.evaluate() - - output_eval_file = os.path.join( - training_args.output_dir, "eval_results_KPE.txt" - ) - if trainer.is_world_process_zero(): - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key, value in results.items(): - logger.info(f" {key} = {value}") - writer.write(f"{key} = {value}\n") + # results = trainer.evaluate() - def get_kp_from_BIO(examples, prediction): - kps = [] - for i in range(len(prediction)): - ids = examples["input_ids"][i] - - tags = prediction[i] - # print(tags) - current_kps = [] - ckp = [] - for i, tag in enumerate(tags): - id = ids[i] - - if tag == "O" and len(ckp) > 0: - - current_kps.append(ckp) - ckp = [] - elif tag == "B": - # print(ckp, tag) - if tokenizer.convert_ids_to_tokens(id).startswith("##"): - ckp.append(id) - else: - if len(ckp) > 0: - current_kps.append(ckp) - ckp = [] - - ckp.append(id) - # print(ckp, id) - - elif tag == "I" and len(ckp) > 0: - ckp.append(id) - decoded_kps = [] - if len(ckp) > 0: - current_kps.append(ckp) - if len(current_kps) > 0: - decoded_kps = tokenizer.batch_decode( - current_kps, - skip_special_tokens=True, - clean_up_tokenization_spaces=True, - ) - # print(decoded_kps) - kps.append(decoded_kps) - - # examples['predicted_kp']= kps - return kps + # output_eval_file = os.path.join(training_args.output_dir, "eval_results_KPE.txt") + # if trainer.is_world_process_zero(): + # with open(output_eval_file, "w") as writer: + # logger.info("***** Eval results *****") + # for key, value in results.items(): + # logger.info(f" {key} = {value}") + # writer.write(f"{key} = {value}\n") # Predict if training_args.do_predict: @@ -449,6 +457,10 @@ def get_kp_from_BIO(examples, prediction): [label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels) ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] output_test_results_file = os.path.join( 
training_args.output_dir, "test_results.txt" @@ -460,18 +472,102 @@ def get_kp_from_BIO(examples, prediction): writer.write(f"{key} = {value}\n") # Save predictions + def get_kp_from_BIO(examples, i): + # kps= [] + # for i in range(len(prediction)): + ids = examples["input_ids"] + # print(examples.keys()) + + # print(tags) + def mmkp(tag_): + current_kps = [] + ckp = [] + prev_tag = None + for j, tag in enumerate(tag_): + id = ids[j] + + if tag == "O" and len(ckp) > 0: + + current_kps.append(ckp) + ckp = [] + elif tag == "B": + # print(ckp, tag) + if ( + tokenizer.convert_ids_to_tokens(id).startswith("##") + or prev_tag == "B" + ): + ckp.append(id) + else: + if len(ckp) > 0: + current_kps.append(ckp) + ckp = [] + + ckp.append(id) + # print(ckp, id) + + elif tag == "I" and len(ckp) > 0: + ckp.append(id) + prev_tag = tag + decoded_kps = [] + if len(ckp) > 0: + current_kps.append(ckp) + if len(current_kps) > 0: + decoded_kps = tokenizer.batch_decode( + current_kps, + skip_special_tokens=True, + clean_up_tokenization_spaces=True, + ) + # print(decoded_kps) + return decoded_kps + + tags = true_predictions[i] + decoded_kps = mmkp(tags) + + ttgs = true_labels[i] + eekp = mmkp(ttgs) + + # examples['kp_predicted']= decoded_kps + examples["kp_predicted"] = list(dict.fromkeys(decoded_kps)) + examples["eekp"] = list(dict.fromkeys(eekp)) + # examples['eekp']= eekp + # else: + # examples['kp_predicted']= [''] + examples["id"] = i + return examples + + import pandas as pd + output_test_predictions_file = os.path.join( - training_args.output_dir, "test_predictions.txt" + training_args.output_dir, "test_predictions.csv" + ) + output_test_predictions_BIO_file = os.path.join( + training_args.output_dir, "test_predictions_BIO.txt" ) if trainer.is_world_process_zero(): - # # test_dataset['predicted_tags']= true_predictions - # # test_dataset=test_dataset.map(get_kp_from_BIO,batched=True, - # num_proc=data_args.preprocessing_num_workers, - # load_from_cache_file=not data_args.overwrite_cache, - # ) - gen_kps = get_kp_from_BIO(test_dataset, true_predictions) - with open(output_test_predictions_file, "w") as writer: - for prediction in gen_kps: - writer.write(" ".join(prediction) + "\n") + print(test_dataset, len(test_dataset["paper_id"])) + ppid = test_dataset["paper_id"] + # ekp= test_dataset['extractive_keyphrases'] + + test_dataset = test_dataset.map( + get_kp_from_BIO, + num_proc=data_args.preprocessing_num_workers, + with_indices=True, + ) + # input_columns= ['paper_id','input_ids','extractive_keyphrases'] + print(test_dataset, " agian") + df = pd.DataFrame.from_dict( + { + "id": ppid, + "extractive_keyphrase": test_dataset["eekp"], + "keyphrases": test_dataset["kp_predicted"], + } + ) + df.to_csv(output_test_predictions_file, index=False) + + # get BIO tag files + + with open(output_test_predictions_BIO_file, "w") as writer: + for prediction in true_predictions: + writer.write(" ".join(prediction) + "\n") return results diff --git a/examples/dataset/hf_data_script.py b/examples/dataset/hf_data_script.py index 4355a78..a9404c2 100644 --- a/examples/dataset/hf_data_script.py +++ b/examples/dataset/hf_data_script.py @@ -27,11 +27,7 @@ # TODO: Add link to the official dataset URLs here -_URLS = { - "test": "test.jsonl", - "train": "train.jsonl", - "valid": "valid.jsonl" -} +_URLS = {"test": "test.jsonl", "train": "train.jsonl", "valid": "valid.jsonl"} # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case @@ -41,23 +37,36 @@ class KPTimes(datasets.GeneratorBasedBuilder): 
VERSION = datasets.Version("0.0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="extraction", version=VERSION, - description="This part of my dataset covers extraction"), - datasets.BuilderConfig(name="generation", version=VERSION, - description="This part of my dataset covers generation"), - datasets.BuilderConfig(name="raw", version=VERSION, description="This part of my dataset covers the raw dataset"), + datasets.BuilderConfig( + name="extraction", + version=VERSION, + description="This part of my dataset covers extraction", + ), + datasets.BuilderConfig( + name="generation", + version=VERSION, + description="This part of my dataset covers generation", + ), + datasets.BuilderConfig( + name="raw", + version=VERSION, + description="This part of my dataset covers the raw dataset", + ), ] DEFAULT_CONFIG_NAME = "extraction" def _info(self): - if self.config.name == "extraction": # This is the name of the configuration selected in BUILDER_CONFIGS above + if ( + self.config.name == "extraction" + ): # This is the name of the configuration selected in BUILDER_CONFIGS above features = datasets.Features( { "id": datasets.Value("string"), "document": datasets.features.Sequence(datasets.Value("string")), - "doc_bio_tags": datasets.features.Sequence(datasets.Value("string")) - + "doc_bio_tags": datasets.features.Sequence( + datasets.Value("string") + ), } ) elif self.config.name == "generation": @@ -65,9 +74,12 @@ def _info(self): { "id": datasets.Value("string"), "document": datasets.features.Sequence(datasets.Value("string")), - "extractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), - "abstractive_keyphrases": datasets.features.Sequence(datasets.Value("string")) - + "extractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), + "abstractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), } ) else: @@ -75,9 +87,15 @@ def _info(self): { "id": datasets.Value("string"), "document": datasets.features.Sequence(datasets.Value("string")), - "doc_bio_tags": datasets.features.Sequence(datasets.Value("string")), - "extractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), - "abstractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), + "doc_bio_tags": datasets.features.Sequence( + datasets.Value("string") + ), + "extractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), + "abstractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), "other_metadata": datasets.features.Sequence( { "id": datasets.Value("string"), @@ -87,8 +105,7 @@ def _info(self): "abstract": datasets.Value("string"), "keyword": datasets.Value("string"), } - ) - + ), } ) return datasets.DatasetInfo( @@ -111,23 +128,20 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": data_dir['train'], + "filepath": data_dir["train"], "split": "train", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": data_dir['test'], - "split": "test" - }, + gen_kwargs={"filepath": data_dir["test"], "split": "test"}, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": data_dir['valid'], + "filepath": data_dir["valid"], "split": "valid", }, ), @@ -141,23 +155,23 @@ def _generate_examples(self, filepath, split): if self.config.name == 
"extraction": # Yields examples as (key, example) tuples yield key, { - "id": data.get('paper_id'), + "id": data.get("paper_id"), "document": data["document"], - "doc_bio_tags": data.get("doc_bio_tags") + "doc_bio_tags": data.get("doc_bio_tags"), } elif self.config.name == "generation": yield key, { - "id": data.get('paper_id'), + "id": data.get("paper_id"), "document": data["document"], "extractive_keyphrases": data.get("extractive_keyphrases"), - "abstractive_keyphrases": data.get("abstractive_keyphrases") + "abstractive_keyphrases": data.get("abstractive_keyphrases"), } else: yield key, { - "id": data.get('paper_id'), + "id": data.get("paper_id"), "document": data["document"], "doc_bio_tags": data.get("doc_bio_tags"), "extractive_keyphrases": data.get("extractive_keyphrases"), "abstractive_keyphrases": data.get("abstractive_keyphrases"), - "other_metadata": data["other_metadata"] + "other_metadata": data["other_metadata"], } diff --git a/examples/dataset/hf_data_script_long_docs.py b/examples/dataset/hf_data_script_long_docs.py index cee1e22..d9ff5cc 100644 --- a/examples/dataset/hf_data_script_long_docs.py +++ b/examples/dataset/hf_data_script_long_docs.py @@ -22,11 +22,7 @@ # TODO: Add link to the official dataset URLs here -_URLS = { - "test": "test.jsonl", - "train": "train.jsonl", - "valid": "valid.jsonl" -} +_URLS = {"test": "test.jsonl", "train": "train.jsonl", "valid": "valid.jsonl"} # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case @@ -36,28 +32,46 @@ class TestLDKP(datasets.GeneratorBasedBuilder): VERSION = datasets.Version("0.1") BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="extraction", version=VERSION, - description="This part of my dataset covers extraction"), - datasets.BuilderConfig(name="generation", version=VERSION, - description="This part of my dataset covers generation"), - datasets.BuilderConfig(name="raw", version=VERSION, description="This part of my dataset covers the raw dataset"), - datasets.BuilderConfig(name="ldkp_generation", version=VERSION, - description="This part of my dataset covers abstract only"), - datasets.BuilderConfig(name="ldkp_extraction", version=VERSION, - description="This part of my dataset covers abstract only"), - + datasets.BuilderConfig( + name="extraction", + version=VERSION, + description="This part of my dataset covers extraction", + ), + datasets.BuilderConfig( + name="generation", + version=VERSION, + description="This part of my dataset covers generation", + ), + datasets.BuilderConfig( + name="raw", + version=VERSION, + description="This part of my dataset covers the raw dataset", + ), + datasets.BuilderConfig( + name="ldkp_generation", + version=VERSION, + description="This part of my dataset covers abstract only", + ), + datasets.BuilderConfig( + name="ldkp_extraction", + version=VERSION, + description="This part of my dataset covers abstract only", + ), ] DEFAULT_CONFIG_NAME = "extraction" def _info(self): - if self.config.name == "extraction" or self.config.name == "ldkp_extraction": # This is the name of the configuration selected in BUILDER_CONFIGS above + if ( + self.config.name == "extraction" or self.config.name == "ldkp_extraction" + ): # This is the name of the configuration selected in BUILDER_CONFIGS above features = datasets.Features( { "id": datasets.Value("int64"), "document": datasets.features.Sequence(datasets.Value("string")), - "doc_bio_tags": datasets.features.Sequence(datasets.Value("string")) - + "doc_bio_tags": datasets.features.Sequence( + 
datasets.Value("string") + ), } ) elif self.config.name == "generation" or self.config.name == "ldkp_generation": @@ -65,9 +79,12 @@ def _info(self): { "id": datasets.Value("int64"), "document": datasets.features.Sequence(datasets.Value("string")), - "extractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), - "abstractive_keyphrases": datasets.features.Sequence(datasets.Value("string")) - + "extractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), + "abstractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), } ) else: @@ -75,16 +92,25 @@ def _info(self): { "id": datasets.Value("int64"), "document": datasets.features.Sequence(datasets.Value("string")), - "doc_bio_tags": datasets.features.Sequence(datasets.Value("string")), - "extractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), - "abstractive_keyphrases": datasets.features.Sequence(datasets.Value("string")), + "doc_bio_tags": datasets.features.Sequence( + datasets.Value("string") + ), + "extractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), + "abstractive_keyphrases": datasets.features.Sequence( + datasets.Value("string") + ), "other_metadata": datasets.features.Sequence( { - "text": datasets.features.Sequence(datasets.Value("string")), - "bio_tags": datasets.features.Sequence(datasets.Value("string")) + "text": datasets.features.Sequence( + datasets.Value("string") + ), + "bio_tags": datasets.features.Sequence( + datasets.Value("string") + ), } - ) - + ), } ) return datasets.DatasetInfo( @@ -107,23 +133,20 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": data_dir['train'], + "filepath": data_dir["train"], "split": "train", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": data_dir['test'], - "split": "test" - }, + gen_kwargs={"filepath": data_dir["test"], "split": "test"}, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": data_dir['valid'], + "filepath": data_dir["valid"], "split": "valid", }, ), @@ -137,36 +160,37 @@ def _generate_examples(self, filepath, split): if self.config.name == "extraction": # Yields examples as (key, example) tuples yield key, { - "id": data['paper_id'], + "id": data["paper_id"], "document": data["document"], - "doc_bio_tags": data["doc_bio_tags"] + "doc_bio_tags": data["doc_bio_tags"], } elif self.config.name == "ldkp_extraction": yield key, { - "id": data['paper_id'], - "document": data["document"] + data["other_metadata"]['text'], - "doc_bio_tags": data["document_tags"] + data["other_metadata"]['bio_tags'] + "id": data["paper_id"], + "document": data["document"] + data["other_metadata"]["text"], + "doc_bio_tags": data["document_tags"] + + data["other_metadata"]["bio_tags"], } elif self.config.name == "ldkp_generation": yield key, { - "id": data['paper_id'], - "document": data["document"] + data["other_metadata"]['text'], + "id": data["paper_id"], + "document": data["document"] + data["other_metadata"]["text"], "extractive_keyphrases": data["extractive_keyphrases"], - "abstractive_keyphrases": data["abstractive_keyphrases"] + "abstractive_keyphrases": data["abstractive_keyphrases"], } elif self.config.name == "generation": yield key, { - "id": data['paper_id'], + "id": data["paper_id"], "document": 
data["document"], "extractive_keyphrases": data["extractive_keyphrases"], - "abstractive_keyphrases": data["abstractive_keyphrases"] + "abstractive_keyphrases": data["abstractive_keyphrases"], } else: yield key, { - "id": data['paper_id'], + "id": data["paper_id"], "document": data["document"], "doc_bio_tags": data["doc_bio_tags"], "extractive_keyphrases": data["extractive_keyphrases"], "abstractive_keyphrases": data["abstractive_keyphrases"], - "other_metadata": data["other_metadata"] + "other_metadata": data["other_metadata"], } diff --git a/examples/ke/ke_sequence_tagging.py b/examples/ke/ke_sequence_tagging.py index 8cdbdc0..01f68ac 100644 --- a/examples/ke/ke_sequence_tagging.py +++ b/examples/ke/ke_sequence_tagging.py @@ -16,7 +16,7 @@ from transformers import get_linear_schedule_with_warmup # We'll need the BertTokenizer for doing sequence tagging with Bert -tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") def get_device(): @@ -26,22 +26,22 @@ def get_device(): # Tell PyTorch to use the GPU. device = torch.device("cuda") - print('There are %d GPU(s) available.' % torch.cuda.device_count()) + print("There are %d GPU(s) available." % torch.cuda.device_count()) - print('We will use the GPU:', torch.cuda.get_device_name(0)) + print("We will use the GPU:", torch.cuda.get_device_name(0)) # If not... else: - print('No GPU available, using the CPU instead.') + print("No GPU available, using the CPU instead.") device = torch.device("cpu") return device def format_time(elapsed): - ''' + """ Takes a time in seconds and returns a string hh:mm:ss - ''' + """ # Round to the nearest second. elapsed_rounded = int(round((elapsed))) @@ -118,7 +118,7 @@ def prepare_tokenized_input(sentences): for sent in sentences: # Reconstruct the sentence--otherwise `tokenizer` will interpret the list # of string tokens as having already been tokenized by BERT. - sent_str = ' '.join(sent) + sent_str = " ".join(sent) # `encode_plus` will: # (1) Tokenize the sentence. @@ -133,19 +133,19 @@ def prepare_tokenized_input(sentences): max_length=50, # Pad & truncate all sentences. pad_to_max_length=True, return_attention_mask=True, # Construct attn. masks. - return_tensors='pt', # Return pytorch tensors. + return_tensors="pt", # Return pytorch tensors. ) # Add the encoded sentence to the list. - input_ids.append(encoded_dict['input_ids'][0]) + input_ids.append(encoded_dict["input_ids"][0]) # And its attention mask (simply differentiates padding from non-padding). - attention_masks.append(encoded_dict['attention_mask'][0]) + attention_masks.append(encoded_dict["attention_mask"][0]) # Print sentence 0, now as a list of IDs. - print('Original: ', sentences[0]) - print('Token IDs:', input_ids[0]) - print('Masks:', attention_masks[0]) + print("Original: ", sentences[0]) + print("Token IDs:", input_ids[0]) + print("Masks:", attention_masks[0]) return input_ids, attention_masks @@ -187,15 +187,17 @@ def add_null_labels(input_ids, labels, label_map): token_id = token_id.numpy().item() # If `[PAD]`, `[CLS]`, or `[SEP]`... - if (token_id == tokenizer.pad_token_id) or \ - (token_id == tokenizer.cls_token_id) or \ - (token_id == tokenizer.sep_token_id): + if ( + (token_id == tokenizer.pad_token_id) + or (token_id == tokenizer.cls_token_id) + or (token_id == tokenizer.sep_token_id) + ): # Assign it the null label. padded_labels.append(null_label_id) # If the token string starts with "##"... 
- elif tokenizer.ids_to_tokens[token_id][0:2] == '##': + elif tokenizer.ids_to_tokens[token_id][0:2] == "##": # It's a subword token, and not part of the original dataset, so # assign it the null label. @@ -218,7 +220,7 @@ def add_null_labels(input_ids, labels, label_map): # If we did this right, then the new `padded_labels` list should match # the length of the tokenized sentence. - assert (len(sen) == len(padded_labels)) + assert len(sen) == len(padded_labels) # Store the updated labels list for this sentence. new_labels.append(padded_labels) @@ -271,8 +273,8 @@ def train_model(train_data, valid_data, device): # Perform one full pass over the training set. print("") - print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) - print('Training...') + print("======== Epoch {:} / {:} ========".format(epoch_i + 1, epochs)) + print("Training...") # Measure how long the training epoch takes. t0 = time.time() @@ -294,7 +296,11 @@ def train_model(train_data, valid_data, device): elapsed = format_time(time.time() - t0) # Report progress. - print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_data), elapsed)) + print( + " Batch {:>5,} of {:>5,}. Elapsed: {:}.".format( + step, len(train_data), elapsed + ) + ) # Unpack this training batch from our dataloader. # @@ -321,10 +327,12 @@ def train_model(train_data, valid_data, device): # https://huggingface.co/transformers/model_doc/bert.html#bertfortokenclassification # The results are returned in a results object, documented here: # https://huggingface.co/transformers/main_classes/output.html#transformers.modeling_outputs.TokenClassifierOutput - result = model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask, - labels=b_labels) + result = model( + b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask, + labels=b_labels, + ) loss = result.loss @@ -363,14 +371,14 @@ def train_model(train_data, valid_data, device): print("Training complete!") # Use plot styling from seaborn. - sns.set(style='darkgrid') + sns.set(style="darkgrid") # Increase the plot size and font size. sns.set(font_scale=1.5) - plt.rcParams["figure.figsize"] = (12,6) + plt.rcParams["figure.figsize"] = (12, 6) # Plot the learning curve. - plt.plot(loss_values, 'b-o') + plt.plot(loss_values, "b-o") # Label the plot. 
plt.title("Training loss") @@ -393,9 +401,9 @@ def train_model(train_data, valid_data, device): # add the null labels for special tokens like [SEP], [CLS], etc final_train_labels = add_null_labels(train_token_ids, train_labels, label_mapping) # convert the processed dataset to tensors - pt_train_token_ids, pt_train_attention_masks, pt_train_labels = convert_to_tensors(train_token_ids, - train_attention_masks, - final_train_labels) + pt_train_token_ids, pt_train_attention_masks, pt_train_labels = convert_to_tensors( + train_token_ids, train_attention_masks, final_train_labels + ) # process the validation dataset # parse the conll format @@ -405,9 +413,9 @@ def train_model(train_data, valid_data, device): # add the null labels for special tokens like [SEP], [CLS], etc final_valid_labels = add_null_labels(valid_token_ids, valid_labels, label_mapping) # convert the processed dataset to tensors - pt_valid_token_ids, pt_valid_attention_masks, pt_valid_labels = convert_to_tensors(valid_token_ids, - valid_attention_masks, - final_valid_labels) + pt_valid_token_ids, pt_valid_attention_masks, pt_valid_labels = convert_to_tensors( + valid_token_ids, valid_attention_masks, final_valid_labels + ) # process the test dataset # parse the conll format test_sentences, test_labels, _ = parse_conll("test.txt") @@ -416,16 +424,22 @@ def train_model(train_data, valid_data, device): # add the null labels for special tokens like [SEP], [CLS], etc final_test_labels = add_null_labels(test_token_ids, test_labels, label_mapping) # convert the processed dataset to tensors - pt_test_token_ids, pt_test_attention_masks, pt_test_labels = convert_to_tensors(test_token_ids, - test_attention_masks, - final_test_labels) + pt_test_token_ids, pt_test_attention_masks, pt_test_labels = convert_to_tensors( + test_token_ids, test_attention_masks, final_test_labels + ) # Convert the training inputs into a TensorDataset. - train_dataset = TensorDataset(pt_train_token_ids, pt_train_attention_masks, pt_train_labels) + train_dataset = TensorDataset( + pt_train_token_ids, pt_train_attention_masks, pt_train_labels + ) # Convert the validation inputs into a TensorDataset. - valid_dataset = TensorDataset(pt_valid_token_ids, pt_valid_attention_masks, pt_valid_labels) + valid_dataset = TensorDataset( + pt_valid_token_ids, pt_valid_attention_masks, pt_valid_labels + ) # Convert the test inputs into a TensorDataset. - test_dataset = TensorDataset(pt_test_token_ids, pt_test_attention_masks, pt_test_labels) + test_dataset = TensorDataset( + pt_test_token_ids, pt_test_attention_masks, pt_test_labels + ) # The DataLoader needs to know our batch size for training, so we specify it # here. For fine-tuning BERT on a specific task, the authors recommend a batch @@ -437,20 +451,21 @@ def train_model(train_data, valid_data, device): train_dataloader = DataLoader( train_dataset, # The training samples. sampler=RandomSampler(train_dataset), # Select batches randomly - batch_size=batch_size # Trains with this batch size. + batch_size=batch_size, # Trains with this batch size. ) # For validation the order doesn't matter, so we'll just read them sequentially. validation_dataloader = DataLoader( valid_dataset, # The validation samples. sampler=SequentialSampler(valid_dataset), # Pull out batches sequentially. - batch_size=batch_size # Evaluate with this batch size. + batch_size=batch_size, # Evaluate with this batch size. 
) # Load BertForTokenClassification model = BertForTokenClassification.from_pretrained( "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab. - num_labels=len(label_mapping) + 1, # The number of output labels--18 for our NER dataset + num_labels=len(label_mapping) + + 1, # The number of output labels--18 for our NER dataset output_attentions=False, # Whether the model returns attentions weights. output_hidden_states=False, # Whether the model returns all hidden-states. ) @@ -459,10 +474,9 @@ def train_model(train_data, valid_data, device): model.cuda() # Load the AdamW optimizer - optimizer = AdamW(model.parameters(), - lr=5e-5, # args.learning_rate - eps=1e-8 # args.adam_epsilon - ) + optimizer = AdamW( + model.parameters(), lr=5e-5, eps=1e-8 # args.learning_rate # args.adam_epsilon + ) # Number of training epochs epochs = 4 @@ -471,26 +485,30 @@ def train_model(train_data, valid_data, device): total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. - scheduler = get_linear_schedule_with_warmup(optimizer, - num_warmup_steps=0, - num_training_steps=total_steps) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=0, num_training_steps=total_steps + ) device = get_device() - train_model(train_data=train_dataloader, - valid_data=validation_dataloader, - device=device) + train_model( + train_data=train_dataloader, valid_data=validation_dataloader, device=device + ) # Prediction on test set # Set the batch size. batch_size = 32 # Create the DataLoader. - prediction_data = TensorDataset(pt_test_token_ids, pt_test_attention_masks, pt_test_labels) + prediction_data = TensorDataset( + pt_test_token_ids, pt_test_attention_masks, pt_test_labels + ) prediction_sampler = SequentialSampler(prediction_data) - prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size) + prediction_dataloader = DataLoader( + prediction_data, sampler=prediction_sampler, batch_size=batch_size + ) - print('Predicting labels for {:,} test sentences...'.format(len(pt_test_token_ids))) + print("Predicting labels for {:,} test sentences...".format(len(pt_test_token_ids))) # Put model in evaluation mode model.eval() @@ -510,22 +528,24 @@ def train_model(train_data, valid_data, device): # speeding up prediction with torch.no_grad(): # Forward pass, calculate logit predictions - result = model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask, - return_dict=True) + result = model( + b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask, + return_dict=True, + ) logits = result.logits # Move logits and labels to CPU logits = logits.detach().cpu().numpy() - label_ids = b_labels.to('cpu').numpy() + label_ids = b_labels.to("cpu").numpy() # Store predictions and true labels predictions.append(logits) true_labels.append(label_ids) - print(' DONE.') + print(" DONE.") # First, combine the results across the batches. 
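    # `predictions` is a list of (batch_size, seq_len, num_labels) logit
    # arrays and `true_labels` a list of matching (batch_size, seq_len)
    # label-id arrays, so after concatenating, an argmax over the last axis
    # yields one predicted label id per token.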
all_predictions = np.concatenate(predictions, axis=0)

@@ -569,11 +589,17 @@ def train_model(train_data, valid_data, device):
        real_token_predictions.append(predicted_label_ids[i])
        real_token_labels.append(all_true_labels[i])

-    print("Before filtering out `null` tokens, length = {:,}".format(len(all_true_labels)))
-    print("  After filtering out `null` tokens, length = {:,}".format(len(real_token_labels)))
+    print(
+        "Before filtering out `null` tokens, length = {:,}".format(len(all_true_labels))
+    )
+    print(
+        "  After filtering out `null` tokens, length = {:,}".format(
+            len(real_token_labels)
+        )
+    )

    # Calculate the F1 score. Because this is a multi-class problem, we have
    # to set the `average` parameter. `micro` pools true/false positives and
    # false negatives over all tokens before computing one global score,
    # instead of averaging per-class scores.
-    f1 = f1_score(real_token_labels, real_token_predictions, average='micro')
+    f1 = f1_score(real_token_labels, real_token_predictions, average="micro")

    print("F1 score: {:.2%}".format(f1))
diff --git a/ldkp_amardeep.py b/ldkp_amardeep.py
new file mode 100644
index 0000000..5ba5b61
--- /dev/null
+++ b/ldkp_amardeep.py
@@ -0,0 +1,2919 @@
+# -*- coding: utf-8 -*-
+"""long-document-kp Amardeep.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1GNzTFF75dQUrgXOiSteZ59S5tt6odWgZ
+
+# **this notebook is under development; don't run or change anything**
+"""
+
+# !pip install transformers
+# !pip install datasets
+# !pip install seqeval
+# !pip install flair
+
+"""# trim dataset function
+
+"""
+
+
+def trim_file(fin, fout=None, n=10000):
+    import json
+
+    if fout is None:
+        fout = fin[:-5] + str(n) + ".json"
+    with open(fin, "r") as fi:
+        with open(fout, "w") as fo:
+            for x in fi:
+                if n > 0:
+                    fo.write(x)
+                    n -= 1
+
+
+# trim_file("/content/drive/MyDrive/long_document_ke/train.json", n=5000)
+
+# trim_file("/content/drive/MyDrive/long_document_ke/text_rank_conll_kp20_proc.json", n=500)
+
+"""# Reformer for token classification"""
+
+# long_doc_kp_models.py
+# all long-document models related to KP
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for token classification.
+"""
+# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as
+# comments.
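+# A minimal usage sketch for the LONG_DOC_KP_MODELS registry defined further
+# below (the checkpoint name and num_labels are illustrative, not fixed here):
+#
+#   config = AutoConfig.from_pretrained("allenai/longformer-base-4096", num_labels=3)
+#   tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
+#   model = LONG_DOC_KP_MODELS["longformer"].from_pretrained(
+#       "allenai/longformer-base-4096", config=config
+#   )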
+ +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +# logging.set_global_logging_level(logging.INFO) +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric + +import transformers +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + AutoModel, + DataCollatorForTokenClassification, + HfArgumentParser, + PreTrainedTokenizerFast, + Trainer, + TrainingArguments, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process + + +logger = logging.getLogger(__name__) + +from transformers.models.reformer.modeling_reformer import * +from transformers import ( + LongformerForTokenClassification, + # BigBirdForTokenClassification +) + + +class ReformerForTokenClassification(ReformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.reformer = ReformerModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + num_hashes=None, + labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.reformer( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + num_hashes=num_hashes, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + # if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + +LONG_DOC_KP_MODELS = { + "longformer": LongformerForTokenClassification, + "reformer": ReformerForTokenClassification, +} + + +"""#CRF module + +## crf algo utils +""" + +""" +Conditional random field +""" +from typing import List, Tuple, Dict, Union + +import torch + +# from allennlp.common.checks import ConfigurationError +# import allennlp.nn.util as util + +VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score + + +def allowed_transitions( + constraint_type: str, labels: Dict[int, str] +) -> List[Tuple[int, int]]: + """ + Given labels and a constraint type, returns the allowed transitions. It will + additionally include transitions for the start and end states, which are used + by the conditional random field. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + labels : `Dict[int, str]`, required + A mapping {label_id -> label}. 
Most commonly this would be the value from + Vocabulary.get_index_to_token_vocabulary() + # Returns + `List[Tuple[int, int]]` + The allowed transitions (from_label_id, to_label_id). + """ + num_labels = len(labels) + start_tag = num_labels + end_tag = num_labels + 1 + labels_with_boundaries = list(labels.items()) + [ + (start_tag, "START"), + (end_tag, "END"), + ] + + allowed = [] + for from_label_index, from_label in labels_with_boundaries: + if from_label in ("START", "END"): + from_tag = from_label + from_entity = "" + else: + from_tag = from_label[0] + from_entity = from_label[1:] + for to_label_index, to_label in labels_with_boundaries: + if to_label in ("START", "END"): + to_tag = to_label + to_entity = "" + else: + to_tag = to_label[0] + to_entity = to_label[1:] + if is_transition_allowed( + constraint_type, from_tag, from_entity, to_tag, to_entity + ): + allowed.append((from_label_index, to_label_index)) + return allowed + + +def is_transition_allowed( + constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str +): + """ + Given a constraint type and strings `from_tag` and `to_tag` that + represent the origin and destination of the transition, return whether + the transition is allowed under the given constraint type. + # Parameters + constraint_type : `str`, required + Indicates which constraint to apply. Current choices are + "BIO", "IOB1", "BIOUL", and "BMES". + from_tag : `str`, required + The tag that the transition originates from. For example, if the + label is `I-PER`, the `from_tag` is `I`. + from_entity : `str`, required + The entity corresponding to the `from_tag`. For example, if the + label is `I-PER`, the `from_entity` is `PER`. + to_tag : `str`, required + The tag that the transition leads to. For example, if the + label is `I-PER`, the `to_tag` is `I`. + to_entity : `str`, required + The entity corresponding to the `to_tag`. For example, if the + label is `I-PER`, the `to_entity` is `PER`. + # Returns + `bool` + Whether the transition is allowed under the given `constraint_type`. + """ + + if to_tag == "START" or from_tag == "END": + # Cannot transition into START or from END + return False + + if constraint_type == "BIOUL": + if from_tag == "START": + return to_tag in ("O", "B", "U") + if to_tag == "END": + return from_tag in ("O", "L", "U") + return any( + [ + # O can transition to O, B-* or U-* + # L-x can transition to O, B-*, or U-* + # U-x can transition to O, B-*, or U-* + from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"), + # B-x can only transition to I-x or L-x + # I-x can only transition to I-x or L-x + from_tag in ("B", "I") + and to_tag in ("I", "L") + and from_entity == to_entity, + ] + ) + elif constraint_type == "BIO": + if from_tag == "START": + return to_tag in ("O", "B") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or B-x + to_tag in ("O", "B"), + # Can only transition to I-x from B-x or I-x + to_tag == "I" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "IOB1": + if from_tag == "START": + return to_tag in ("O", "I") + if to_tag == "END": + return from_tag in ("O", "B", "I") + return any( + [ + # Can always transition to O or I-x + to_tag in ("O", "I"), + # Can only transition to B-x from B-x or I-x, where + # x is the same tag. 
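+    # Worked example for the "BIO" case above, with the tag set this repo
+    # uses ({0: "B", 1: "I", 2: "O"}, START=3, END=4):
+    #   allowed_transitions("BIO", {0: "B", 1: "I", 2: "O"}) permits e.g.
+    #   START->B (3, 0), B->I (0, 1), I->I (1, 1), B->B (0, 0), O->O (2, 2),
+    #   but not O->I (2, 1), since an I tag must follow a B or I tag.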
+ to_tag == "B" and from_tag in ("B", "I") and from_entity == to_entity, + ] + ) + elif constraint_type == "BMES": + if from_tag == "START": + return to_tag in ("B", "S") + if to_tag == "END": + return from_tag in ("E", "S") + return any( + [ + # Can only transition to B or S from E or S. + to_tag in ("B", "S") and from_tag in ("E", "S"), + # Can only transition to M-x from B-x, where + # x is the same tag. + to_tag == "M" and from_tag in ("B", "M") and from_entity == to_entity, + # Can only transition to E-x from B-x or M-x, where + # x is the same tag. + to_tag == "E" and from_tag in ("B", "M") and from_entity == to_entity, + ] + ) + else: + print("error in constrint type") + + +def logsumexp( + tensor: torch.Tensor, dim: int = -1, keepdim: bool = False +) -> torch.Tensor: + """ + A numerically stable computation of logsumexp. This is mathematically equivalent to + `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log + probabilities. + # Parameters + tensor : `torch.FloatTensor`, required. + A tensor of arbitrary size. + dim : `int`, optional (default = `-1`) + The dimension of the tensor to apply the logsumexp to. + keepdim: `bool`, optional (default = `False`) + Whether to retain a dimension of size one at the dimension we reduce over. + """ + max_score, _ = tensor.max(dim, keepdim=keepdim) + if keepdim: + stable_vec = tensor - max_score + else: + stable_vec = tensor - max_score.unsqueeze(dim) + return max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log() + + +"""## vertibe decode""" + + +def viterbi_decode( + tag_sequence: torch.Tensor, + transition_matrix: torch.Tensor, + tag_observations: Optional[List[int]] = None, + allowed_start_transitions: torch.Tensor = None, + allowed_end_transitions: torch.Tensor = None, + top_k: int = None, +): + """ + Perform Viterbi decoding in log space over a sequence given a transition matrix + specifying pairwise (transition) potentials between tags and a matrix of shape + (sequence_length, num_tags) specifying unary potentials for possible tags per + timestep. + # Parameters + tag_sequence : `torch.Tensor`, required. + A tensor of shape (sequence_length, num_tags) representing scores for + a set of tags over a given sequence. + transition_matrix : `torch.Tensor`, required. + A tensor of shape (num_tags, num_tags) representing the binary potentials + for transitioning between a given pair of tags. + tag_observations : `Optional[List[int]]`, optional, (default = `None`) + A list of length `sequence_length` containing the class ids of observed + elements in the sequence, with unobserved elements being set to -1. Note that + it is possible to provide evidence which results in degenerate labelings if + the sequences of tags you provide as evidence cannot transition between each + other, or those transitions are extremely unlikely. In this situation we log a + warning, but the responsibility for providing self-consistent evidence ultimately + lies with the user. + allowed_start_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags the START token + may transition *to*. If provided, additional transition constraints will be used for + determining the start element of the sequence. + allowed_end_transitions : `torch.Tensor`, optional, (default = `None`) + An optional tensor of shape (num_tags,) describing which tags may transition *to* the + end tag. 
If provided, additional transition constraints will be used for determining + the end element of the sequence. + top_k : `int`, optional, (default = `None`) + Optional integer specifying how many of the top paths to return. For top_k>=1, returns + a tuple of two lists: top_k_paths, top_k_scores, For top_k==None, returns a flattened + tuple with just the top path and its score (not in lists, for backwards compatibility). + # Returns + viterbi_path : `List[int]` + The tag indices of the maximum likelihood tag sequence. + viterbi_score : `torch.Tensor` + The score of the viterbi path. + """ + if top_k is None: + top_k = 1 + flatten_output = True + elif top_k >= 1: + flatten_output = False + else: + raise ValueError( + f"top_k must be either None or an integer >=1. Instead received {top_k}" + ) + + sequence_length, num_tags = list(tag_sequence.size()) + + has_start_end_restrictions = ( + allowed_end_transitions is not None or allowed_start_transitions is not None + ) + + if has_start_end_restrictions: + + if allowed_end_transitions is None: + allowed_end_transitions = torch.zeros(num_tags) + if allowed_start_transitions is None: + allowed_start_transitions = torch.zeros(num_tags) + + num_tags = num_tags + 2 + new_transition_matrix = torch.zeros(num_tags, num_tags) + new_transition_matrix[:-2, :-2] = transition_matrix + + # Start and end transitions are fully defined, but cannot transition between each other. + + allowed_start_transitions = torch.cat( + [allowed_start_transitions, torch.tensor([-math.inf, -math.inf])] + ) + allowed_end_transitions = torch.cat( + [allowed_end_transitions, torch.tensor([-math.inf, -math.inf])] + ) + + # First define how we may transition FROM the start and end tags. + new_transition_matrix[-2, :] = allowed_start_transitions + # We cannot transition from the end tag to any tag. + new_transition_matrix[-1, :] = -math.inf + + new_transition_matrix[:, -1] = allowed_end_transitions + # We cannot transition to the start tag from any tag. + new_transition_matrix[:, -2] = -math.inf + + transition_matrix = new_transition_matrix + + if tag_observations: + if len(tag_observations) != sequence_length: + raise ConfigurationError( + "Observations were provided, but they were not the same length " + "as the sequence. Found sequence of length: {} and evidence: {}".format( + sequence_length, tag_observations + ) + ) + else: + tag_observations = [-1 for _ in range(sequence_length)] + + if has_start_end_restrictions: + tag_observations = [num_tags - 2] + tag_observations + [num_tags - 1] + zero_sentinel = torch.zeros(1, num_tags) + extra_tags_sentinel = torch.ones(sequence_length, 2) * -math.inf + tag_sequence = torch.cat([tag_sequence, extra_tags_sentinel], -1) + tag_sequence = torch.cat([zero_sentinel, tag_sequence, zero_sentinel], 0) + sequence_length = tag_sequence.size(0) + + path_scores = [] + path_indices = [] + + if tag_observations[0] != -1: + one_hot = torch.zeros(num_tags) + one_hot[tag_observations[0]] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[0, :].unsqueeze(0)) + + # Evaluate the scores for all possible paths. + for timestep in range(1, sequence_length): + # Add pairwise potentials to current scores. + summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix + summed_potentials = summed_potentials.view(-1, num_tags) + + # Best pairwise potential path score from the previous timestep. 
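+        # path_scores[timestep - 1] has shape (k, num_tags); unsqueeze(2)
+        # broadcasts it against the (num_tags, num_tags) transition matrix to
+        # (k, num_tags, num_tags), where entry [i, from, to] is the score of
+        # the i-th best path ending in `from` plus the from->to transition.
+        # Flattening to (k * num_tags, num_tags) lets topk pick, for every
+        # destination tag, the k best (path, predecessor) combinations.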
+ max_k = min(summed_potentials.size()[0], top_k) + scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) + + # If we have an observation for this timestep, use it + # instead of the distribution over tags. + observation = tag_observations[timestep] + # Warn the user if they have passed + # invalid/extremely unlikely evidence. + if tag_observations[timestep - 1] != -1 and observation != -1: + if transition_matrix[tag_observations[timestep - 1], observation] < -10000: + logger.warning( + "The pairwise potential between tags you have passed as " + "observations is extremely unlikely. Double check your evidence " + "or transition potentials!" + ) + if observation != -1: + one_hot = torch.zeros(num_tags) + one_hot[observation] = 100000.0 + path_scores.append(one_hot.unsqueeze(0)) + else: + path_scores.append(tag_sequence[timestep, :] + scores) + path_indices.append(paths.squeeze()) + + # Construct the most likely sequence backwards. + path_scores_v = path_scores[-1].view(-1) + max_k = min(path_scores_v.size()[0], top_k) + viterbi_scores, best_paths = torch.topk(path_scores_v, k=max_k, dim=0) + viterbi_paths = [] + for i in range(max_k): + viterbi_path = [best_paths[i]] + for backward_timestep in reversed(path_indices): + viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) + # Reverse the backward path. + viterbi_path.reverse() + + if has_start_end_restrictions: + viterbi_path = viterbi_path[1:-1] + + # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. + viterbi_path = [j % num_tags for j in viterbi_path] + viterbi_paths.append(viterbi_path) + + if flatten_output: + return viterbi_paths[0], viterbi_scores[0] + + return viterbi_paths, viterbi_scores + + +"""## crf algorithm""" + + +class ConditionalRandomField(torch.nn.Module): + """ + This module uses the "forward-backward" algorithm to compute + the log-likelihood of its inputs assuming a conditional random field model. + See, e.g. http://www.cs.columbia.edu/~mcollins/fb.pdf + # Parameters + num_tags : `int`, required + The number of tags. + constraints : `List[Tuple[int, int]]`, optional (default = `None`) + An optional list of allowed transitions (from_tag_id, to_tag_id). + These are applied to `viterbi_tags()` but do not affect `forward()`. + These should be derived from `allowed_transitions` so that the + start and end transitions are handled correctly for your tag type. + include_start_end_transitions : `bool`, optional (default = `True`) + Whether to include the start and end transition parameters. + """ + + # def __init__( + # self, + # num_tags: int, + # constraints: List[Tuple[int, int]] = None, + # include_start_end_transitions: bool = True, + # ) -> None: + def __init__( + self, + num_tags: int, + label_encoding, + idx2tag, + include_start_end_transitions: bool = True, + ) -> None: + super().__init__() + self.num_tags = num_tags + constraints = allowed_transitions(label_encoding, idx2tag) + # transitions[i, j] is the logit for transitioning from state i to state j. + self.transitions = torch.nn.Parameter(torch.Tensor(num_tags, num_tags)) + + # _constraint_mask indicates valid transitions (based on supplied constraints). + # Include special start of sequence (num_tags + 1) and end of sequence tags (num_tags + 2) + if constraints is None: + # All transitions are valid. 
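+            # The mask is (num_tags + 2) x (num_tags + 2): the two extra
+            # rows/columns are the implicit START (index num_tags) and END
+            # (index num_tags + 1) states, so with 3 tags entry [3, 0] says
+            # whether START -> tag 0 is a legal transition.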
+ constraint_mask = torch.Tensor(num_tags + 2, num_tags + 2).fill_(1.0) + else: + constraint_mask = torch.Tensor(num_tags + 2, num_tags + 2).fill_(0.0) + for i, j in constraints: + constraint_mask[i, j] = 1.0 + + self._constraint_mask = torch.nn.Parameter(constraint_mask, requires_grad=False) + + # Also need logits for transitioning from "start" state and to "end" state. + self.include_start_end_transitions = include_start_end_transitions + if include_start_end_transitions: + self.start_transitions = torch.nn.Parameter(torch.Tensor(num_tags)) + self.end_transitions = torch.nn.Parameter(torch.Tensor(num_tags)) + + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.xavier_normal_(self.transitions) + if self.include_start_end_transitions: + torch.nn.init.normal_(self.start_transitions) + torch.nn.init.normal_(self.end_transitions) + + def _input_likelihood( + self, logits: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: + """ + Computes the (batch_size,) denominator term for the log-likelihood, which is the + sum of the likelihoods across all possible state sequences. + """ + batch_size, sequence_length, num_tags = logits.size() + + # Transpose batch size and sequence dimensions + mask = mask.transpose(0, 1).contiguous() + logits = logits.transpose(0, 1).contiguous() + + # Initial alpha is the (batch_size, num_tags) tensor of likelihoods combining the + # transitions to the initial states and the logits for the first timestep. + if self.include_start_end_transitions: + alpha = self.start_transitions.view(1, num_tags) + logits[0] + else: + alpha = logits[0] + + # For each i we compute logits for the transitions from timestep i-1 to timestep i. + # We do so in a (batch_size, num_tags, num_tags) tensor where the axes are + # (instance, current_tag, next_tag) + for i in range(1, sequence_length): + # The emit scores are for time i ("next_tag") so we broadcast along the current_tag axis. + emit_scores = logits[i].view(batch_size, 1, num_tags) + # Transition scores are (current_tag, next_tag) so we broadcast along the instance axis. + transition_scores = self.transitions.view(1, num_tags, num_tags) + # Alpha is for the current_tag, so we broadcast along the next_tag axis. + broadcast_alpha = alpha.view(batch_size, num_tags, 1) + + # Add all the scores together and logexp over the current_tag axis. + inner = broadcast_alpha + emit_scores + transition_scores + + # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension + # of `inner`. Otherwise (mask == False) we want to retain the previous alpha. + alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * ( + ~mask[i] + ).view(batch_size, 1) + + # Every sequence needs to end with a transition to the stop_tag. 
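+        # `alpha` is now (batch_size, num_tags): the total score of all paths
+        # ending in each tag. Adding the end transitions and logsumexp-ing
+        # over the tag axis yields the (batch_size,) log partition function,
+        # the denominator of the CRF log-likelihood.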
+ if self.include_start_end_transitions: + stops = alpha + self.end_transitions.view(1, num_tags) + else: + stops = alpha + + # Finally we log_sum_exp along the num_tags dim, result is (batch_size,) + return logsumexp(stops) + + def _joint_likelihood( + self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor + ) -> torch.Tensor: + """ + Computes the numerator term for the log-likelihood, which is just score(inputs, tags) + """ + batch_size, sequence_length, _ = logits.data.shape + + # Transpose batch size and sequence dimensions: + logits = logits.transpose(0, 1).contiguous() + mask = mask.transpose(0, 1).contiguous() + tags = tags.transpose(0, 1).contiguous() + + # Start with the transition scores from start_tag to the first tag in each input + if self.include_start_end_transitions: + score = self.start_transitions.index_select(0, tags[0]) + else: + score = 0.0 + + # Add up the scores for the observed transitions and all the inputs but the last + # print(mask.shape, tags.shape, logits.shape, sequence_length) + for i in range(sequence_length - 1): + # Each is shape (batch_size,) + current_tag, next_tag = tags[i], tags[i + 1] + # print(current_tag, next_tag) + # print("tags printiiinggggg") + # print(current_tag, next_tag) + # The scores for transitioning from current_tag to next_tag + transition_score = self.transitions[current_tag.view(-1), next_tag.view(-1)] + + # The score for using current_tag + emit_score = logits[i].gather(1, current_tag.view(batch_size, 1)).squeeze(1) + # emit_score= 0 + # Include transition score if next element is unmasked, + # input_score if this element is unmasked. + score = score + transition_score * mask[i + 1] + emit_score * mask[i] + + # Transition from last state to "stop" state. To start with, we need to find the last tag + # for each instance. + last_tag_index = mask.sum(0).long() - 1 + last_tags = tags.gather(0, last_tag_index.view(1, batch_size)).squeeze(0) + + # Compute score of transitioning to `stop_tag` from each "last tag". + if self.include_start_end_transitions: + last_transition_score = self.end_transitions.index_select(0, last_tags) + else: + last_transition_score = 0.0 + + # Add the last input if it's not masked. + last_inputs = logits[-1] # (batch_size, num_tags) + last_input_score = last_inputs.gather( + 1, last_tags.view(-1, 1) + ) # (batch_size, 1) + last_input_score = last_input_score.squeeze() # (batch_size,) + + score = score + last_transition_score + last_input_score * mask[-1] + + return score + + def forward( + self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None + ) -> torch.Tensor: + """ + Computes the log likelihood. + """ + # mask[tags==-100]=0 + if mask is None: + mask = torch.ones(*tags.size(), dtype=torch.bool) + else: + # The code below fails in weird ways if this isn't a bool tensor, so we make sure. + mask = mask.to(torch.bool) + # print("forward",inputs.shape, tags.shape, mask.shape) + + log_denominator = self._input_likelihood(inputs, mask) + # temp_tags= tags + # tags[tags==-100]=2 + # print(tags[0]) + log_numerator = self._joint_likelihood(inputs, tags, mask) + # tags[mask==0]=-100 + return torch.sum(log_numerator - log_denominator) + + def viterbi_tags( + self, logits: torch.Tensor, mask: torch.BoolTensor = None, top_k: int = None + ) -> Union[List[VITERBI_DECODING], List[List[VITERBI_DECODING]]]: + """ + Uses viterbi algorithm to find most likely tags for the given inputs. + If constraints are applied, disallows all other transitions. 
+ Returns a list of results, of the same size as the batch (one result per batch member) + Each result is a List of length top_k, containing the top K viterbi decodings + Each decoding is a tuple (tag_sequence, viterbi_score) + For backwards compatibility, if top_k is None, then instead returns a flat list of + tag sequences (the top tag sequence for each batch item). + """ + if mask is None: + mask = torch.ones(*logits.shape[:2], dtype=torch.bool, device=logits.device) + + if top_k is None: + top_k = 1 + flatten_output = True + else: + flatten_output = False + + _, max_seq_length, num_tags = logits.size() + + # Get the tensors out of the variables + logits, mask = logits.data, mask.data + + # Augment transitions matrix with start and end transitions + start_tag = num_tags + end_tag = num_tags + 1 + transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.0) + + # Apply transition constraints + constrained_transitions = self.transitions * self._constraint_mask[ + :num_tags, :num_tags + ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags]) + transitions[:num_tags, :num_tags] = constrained_transitions.data + + if self.include_start_end_transitions: + transitions[ + start_tag, :num_tags + ] = self.start_transitions.detach() * self._constraint_mask[ + start_tag, :num_tags + ].data + -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[ + :num_tags, end_tag + ] = self.end_transitions.detach() * self._constraint_mask[ + :num_tags, end_tag + ].data + -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) + else: + transitions[start_tag, :num_tags] = -10000.0 * ( + 1 - self._constraint_mask[start_tag, :num_tags].detach() + ) + transitions[:num_tags, end_tag] = -10000.0 * ( + 1 - self._constraint_mask[:num_tags, end_tag].detach() + ) + + best_paths = [] + # Pad the max sequence length by 2 to account for start_tag + end_tag. + tag_sequence = torch.Tensor(max_seq_length + 2, num_tags + 2) + + for prediction, prediction_mask in zip(logits, mask): + mask_indices = prediction_mask.nonzero(as_tuple=False).squeeze() + masked_prediction = torch.index_select(prediction, 0, mask_indices) + sequence_length = masked_prediction.shape[0] + + # Start with everything totally unlikely + tag_sequence.fill_(-10000.0) + # At timestep 0 we must have the START_TAG + tag_sequence[0, start_tag] = 0.0 + # At steps 1, ..., sequence_length we just use the incoming prediction + tag_sequence[1 : (sequence_length + 1), :num_tags] = masked_prediction + # And at the last timestep we must have the END_TAG + tag_sequence[sequence_length + 1, end_tag] = 0.0 + + # We pass the tags and the transitions to `viterbi_decode`. + viterbi_paths, viterbi_scores = viterbi_decode( + tag_sequence=tag_sequence[: (sequence_length + 2)], + transition_matrix=transitions, + top_k=top_k, + ) + top_k_paths = [] + for viterbi_path, viterbi_score in zip(viterbi_paths, viterbi_scores): + # Get rid of START and END sentinels and append. 
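+                # Each decoding is a (tag_indices, viterbi_score) pair; with
+                # top_k left as None the caller gets one such pair per batch
+                # element, and the indices map back to tag strings (e.g.
+                # "B"/"I"/"O") through the idx2tag passed to the constructor.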
+                    viterbi_path = viterbi_path[1:-1]
+                top_k_paths.append((viterbi_path, viterbi_score.item()))
+            best_paths.append(top_k_paths)
+
+        if flatten_output:
+            return [top_k_paths[0] for top_k_paths in best_paths]
+
+        return best_paths
+
+
+"""# CRF Models for Token Classification
+
+## Bert CRF
+"""
+
+from transformers import (
+    AutoModelForPreTraining,
+    AutoModel,
+    BertModel,
+    BertPreTrainedModel,
+    LongformerModel,
+    PreTrainedModel,
+)
+from transformers.modeling_outputs import TokenClassifierOutput
+import collections
+
+
+class BERT_CRFforTokenClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # self.crf= nn.Linear(config.num_labels,1)
+        # self.crf= ConditionalRandomField(self.num_labels)
+        self.crf = ConditionalRandomField(
+            self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "O"}
+        )
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=None,
+        output_attentions=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.bert(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+        loss = None
+        if labels is not None:
+            loss = -self.crf(logits, labels, attention_mask)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        # print(self.crf.transitions)
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def freeze_till_clf(self):
+        for param in self.bert.parameters():
+            param.requires_grad = False
+        for param in self.dropout.parameters():
+            param.requires_grad = False
+        for param in self.classifier.parameters():
+            param.requires_grad = False
+
+    def freeze_encoder_layer(self):
+        for param in self.bert.parameters():
+            param.requires_grad = False
+
+    # return ((loss,) + output) if loss is not None else output
+
+
+"""## longformer CRF"""
+
+from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel
+
+
+class Longformer_CRFforTokenClassification(LongformerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.longformer = LongformerModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # self.crf= nn.Linear(config.num_labels,1)
+        self.crf = ConditionalRandomField(
+            self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "O"}
+        )
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=None,
+        output_attentions=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        outputs = self.longformer(
+            input_ids,
+            position_ids=position_ids,
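+            # no global_attention_mask is passed here, so every token attends
+            # through Longformer's local sliding window only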
attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + loss = None + if labels is not None: + loss = -self.crf(logits, labels, attention_mask) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def freeze_till_clf(self): + for param in self.longformer.parameters(): + param.requires_grad = False + for param in self.dropout.parameters(): + param.requires_grad = False + for param in self.classifier.parameters(): + param.requires_grad = False + + # def freeze_encoder_layer(self): + # for param in self.longformer.parameters(): + # param.requires_grad = Falsefreeze_till_clfr=True): + # super().__init__(config) + # self.config = config + + # self.embeddings = BertEmbeddings(config) + # self.encoder = BertEncoder(config) + + # self.pooler = BertPooler(config) if add_pooling_layer else None + + # self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + guide_embed=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
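+        guide_embed (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`, `optional`):
+            If provided, the embedding of the first ([CLS]) token is overwritten with this vector before the
+            encoder runs (see the assignment to ``embedding_output[:, 0, :]`` below), so the document is encoded
+            conditioned on an externally supplied guidance vector. A minimal, illustrative call, where the
+            ``guide`` tensor is a stand-in for a real guidance embedding::
+
+                guide = torch.randn(batch_size, config.hidden_size)
+                outputs = model(input_ids=input_ids, attention_mask=attention_mask, guide_embed=guide)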
+ """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + # assert guide_embed is not None + if guide_embed is not None: + embedding_output[:, 0, :] = guide_embed + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = ( + self.pooler(sequence_output) if self.pooler is 
not None else None + ) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +"""##modi bert token clf + +""" + + +class ModiBertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = ModiBertModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + guide_embed=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + guide_embed=guide_embed, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # loss_fct = CrossEntropyLoss(weight=torch.tensor([0.4,0.35,0.25], device= labels.device)) + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +"""#Modi Longformer + +## modi base long +""" + +from transformers.models.longformer.modeling_longformer import * + + +class ModiLongformerModel(LongformerPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + if isinstance(config.attention_window, int): + assert ( + config.attention_window % 2 == 0 + ), "`config.attention_window` has to be an even value" + assert ( + config.attention_window > 0 + ), "`config.attention_window` has to be positive" + config.attention_window = [ + config.attention_window + ] * config.num_hidden_layers # one value per layer + else: + assert len(config.attention_window) == config.num_hidden_layers, ( + "`len(config.attention_window)` should equal 
`config.num_hidden_layers`. " + f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}" + ) + + self.embeddings = LongformerEmbeddings(config) + self.encoder = LongformerEncoder(config) + self.pooler = LongformerPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def _pad_to_window_size( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + token_type_ids: torch.Tensor, + position_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + pad_token_id: int, + ): + """A helper function to pad tokens and mask to work with implementation of Longformer self-attention.""" + # padding + attention_window = ( + self.config.attention_window + if isinstance(self.config.attention_window, int) + else max(self.config.attention_window) + ) + + assert ( + attention_window % 2 == 0 + ), f"`attention_window` should be an even value. Given {attention_window}" + input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape + batch_size, seq_len = input_shape[:2] + + padding_len = (attention_window - seq_len % attention_window) % attention_window + if padding_len > 0: + logger.info( + f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " + f"`config.attention_window`: {attention_window}" + ) + if input_ids is not None: + input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) + if position_ids is not None: + # pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings + position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id) + if inputs_embeds is not None: + input_ids_padding = inputs_embeds.new_full( + (batch_size, padding_len), + self.config.pad_token_id, + dtype=torch.long, + ) + inputs_embeds_padding = self.embeddings(input_ids_padding) + inputs_embeds = torch.cat( + [inputs_embeds, inputs_embeds_padding], dim=-2 + ) + + attention_mask = F.pad( + attention_mask, (0, padding_len), value=False + ) # no attention on the padding tokens + token_type_ids = F.pad( + token_type_ids, (0, padding_len), value=0 + ) # pad with token_type_id = 0 + + return ( + padding_len, + input_ids, + attention_mask, + token_type_ids, + position_ids, + inputs_embeds, + ) + + def _merge_to_attention_mask( + self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor + ): + # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) + # (global_attention_mask + 1) => 1 for local attention, 2 for global attention + # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention + if attention_mask is not None: + attention_mask = attention_mask * (global_attention_mask + 1) + else: + # simply use `global_attention_mask` as `attention_mask` + # if no `attention_mask` is given + attention_mask = global_attention_mask + 1 + return attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + global_attention_mask=None, + head_mask=None, + token_type_ids=None, + guide_embed=None, + position_ids=None, + inputs_embeds=None, + 
output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # merge `global_attention_mask` and `attention_mask` + if global_attention_mask is not None: + attention_mask = self._merge_to_attention_mask( + attention_mask, global_attention_mask + ) + + ( + padding_len, + input_ids, + attention_mask, + token_type_ids, + position_ids, + inputs_embeds, + ) = self._pad_to_window_size( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + pad_token_id=self.config.pad_token_id, + ) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
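+        # Sketch of the mask arithmetic that follows, assuming the usual
+        # (1.0 - mask) * -10000.0 formulation of `get_extended_attention_mask`:
+        #   merged mask 0 (padding)     -> -10000.0 (masked out)
+        #   merged mask 1 (local attn)  ->      0.0
+        #   merged mask 2 (global attn) ->  10000.0
+        # The [:, 0, 0, :] slice below then drops the two broadcast axes so the
+        # Longformer encoder receives a (batch_size, seq_len) tensor again.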
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device + )[:, 0, 0, :] + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + if guide_embed is not None: + embedding_output[:, 0, :] = guide_embed + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = ( + self.pooler(sequence_output) if self.pooler is not None else None + ) + + # undo padding + if padding_len > 0: + # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1) + sequence_output = sequence_output[:, :-padding_len] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return LongformerBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + global_attentions=encoder_outputs.global_attentions, + ) + + +"""## modi long for token""" + + +class ModiLongformerForTokenClassification(LongformerPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.longformer = ModiLongformerModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + global_attention_mask=None, + head_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + guide_embed=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.longformer( + input_ids, + attention_mask=attention_mask, + global_attention_mask=global_attention_mask, + head_mask=head_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # loss_fct = CrossEntropyLoss(weight=torch.tensor([0.4,0.35,0.25],device= labels.device)) + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return LongformerTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + global_attentions=outputs.global_attentions, + ) + + +"""#CRF 
trainer""" + +from transformers.trainer import * +from transformers import ( + Trainer, + set_seed, +) + +# from Trainer import * +from transformers.trainer_utils import PredictionOutput +from torch import nn +from torch.utils.data.dataloader import DataLoader +from torch.utils.data.dataset import Dataset +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + + +class CRF_Trainer(Trainer): + def prediction_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> PredictionOutput: + """ + Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. + + Works both with or without labels. + """ + if not isinstance(dataloader.dataset, collections.abc.Sized): + raise ValueError("dataset must implement __len__") + prediction_loss_only = ( + prediction_loss_only + if prediction_loss_only is not None + else self.args.prediction_loss_only + ) + + if self.args.deepspeed and not self.args.do_train: + # no harm, but flagging to the user that deepspeed config is ignored for eval + # flagging only for when --do_train wasn't passed as only then it's redundant + logger.info( + "Detected the deepspeed argument but it will not be used for evaluation" + ) + + model = self._wrap_model(self.model, training=False) + + # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while + # ``train`` is running, half it first and then put on device + if not self.is_in_train and self.args.fp16_full_eval: + model = model.half().to(self.args.device) + + batch_size = dataloader.batch_size + num_examples = self.num_examples(dataloader) + logger.info(f"***** Running {description} *****") + logger.info(f" Num examples = {num_examples}") + logger.info(f" Batch size = {batch_size}") + losses_host: torch.Tensor = None + preds_host: Union[torch.Tensor, List[torch.Tensor]] = None + labels_host: Union[torch.Tensor, List[torch.Tensor]] = None + + world_size = max(1, self.args.world_size) + + eval_losses_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=batch_size + ) + if not prediction_loss_only: + # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass + # a batch size to the sampler) + make_multiple_of = None + if hasattr(dataloader, "sampler") and isinstance( + dataloader.sampler, SequentialDistributedSampler + ): + make_multiple_of = dataloader.sampler.batch_size + preds_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=make_multiple_of + ) + labels_gatherer = DistributedTensorGatherer( + world_size, num_examples, make_multiple_of=make_multiple_of + ) + if self.args.past_index >= 0: + self._past = None + model.eval() + + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader( + dataloader, [self.args.device] + ).per_device_loader(self.args.device) + + self.callback_handler.eval_dataloader = dataloader + + for step, inputs in enumerate(dataloader): + + loss, logits, labels = self.prediction_step( + model, inputs, prediction_loss_only, ignore_keys=ignore_keys + ) + + best_path = self.eval_step(model, logits, inputs["attention_mask"]) + # best_path= self.eval_step(model, logits) + # print(len(best_path), best_path[0]) + # logits= torch.zeros() + + best_path = [x for x, _ in best_path] + # print(best_path) + # seq_len= labels.shape[1] + logits *= 0 + for i, path in enumerate(best_path): + # 
print(inputs['attention_mask'][i,0], labels[i,0], inputs['attention_mask'][i,-1], labels[i,-1]) + # print(len(x)) + for j, tag in enumerate(path): + logits[i, j, int(tag)] = 1 + # print(inputs['attention_mask'][i,j], labels[i,j]) + + # logits= torch.tensor(data=best_path, dtype= labels.dtype, device= labels.device) + # if(logits.shape!=labels.shape): + # print(logits.shape,labels.shape) + # assert logits.shape==labels.shape + if loss is not None: + losses = loss.repeat(batch_size) + losses_host = ( + losses + if losses_host is None + else torch.cat((losses_host, losses), dim=0) + ) + if logits is not None: + preds_host = ( + logits + if preds_host is None + else nested_concat(preds_host, logits, padding_index=-100) + ) + if labels is not None: + labels_host = ( + labels + if labels_host is None + else nested_concat(labels_host, labels, padding_index=-100) + ) + self.control = self.callback_handler.on_prediction_step( + self.args, self.state, self.control + ) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if ( + self.args.eval_accumulation_steps is not None + and (step + 1) % self.args.eval_accumulation_steps == 0 + ): + eval_losses_gatherer.add_arrays( + self._gather_and_numpify(losses_host, "eval_losses") + ) + if not prediction_loss_only: + preds_gatherer.add_arrays( + self._gather_and_numpify(preds_host, "eval_preds") + ) + labels_gatherer.add_arrays( + self._gather_and_numpify(labels_host, "eval_label_ids") + ) + + # Set back to None to begin a new accumulation + losses_host, preds_host, labels_host = None, None, None + + if self.args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + eval_losses_gatherer.add_arrays( + self._gather_and_numpify(losses_host, "eval_losses") + ) + if not prediction_loss_only: + preds_gatherer.add_arrays( + self._gather_and_numpify(preds_host, "eval_preds") + ) + labels_gatherer.add_arrays( + self._gather_and_numpify(labels_host, "eval_label_ids") + ) + + eval_loss = eval_losses_gatherer.finalize() + preds = preds_gatherer.finalize() if not prediction_loss_only else None + label_ids = labels_gatherer.finalize() if not prediction_loss_only else None + + if ( + self.compute_metrics is not None + and preds is not None + and label_ids is not None + ): + metrics = self.compute_metrics( + EvalPrediction(predictions=preds, label_ids=label_ids) + ) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if eval_loss is not None: + metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) + + def eval_step(self, model: nn.Module, logits, mask=None, top_k=None): + with torch.no_grad(): + output = model.crf.viterbi_tags(logits, mask, top_k) + + return output + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. 
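+
+        For the CRF models the labels are intentionally left inside ``inputs``: the model itself returns the
+        negative CRF log-likelihood (``-self.crf(logits, labels, attention_mask)``) as its first output, so no
+        label smoothing or external loss function is applied here.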
+ """ + # if self.label_smoother is not None and "labels" in inputs: + # labels = inputs.pop("labels") + # else: + labels = None + # print(model) + # assert "labels" in inputs + # print(type(inputs),inputs) + outputs = model(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + loss = self.label_smoother(outputs, labels) + else: + # We don't use .loss here since the model may return tuples instead of ModelOutput. + # print(outputs) + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + # print("loss is ", loss) + return (loss, outputs) if return_outputs else loss + + +[4, 5] + [5, 6, 7] + +"""# KP ectraction main code + +## model and data argument +""" + +# run_kpe.py +# all long docu,emt modesl realted to KP +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for token classification. +""" +# You can also adapt this script on your own token classification task and datasets. Pointers for this are left as +# comments. + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric + +import transformers +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + DataCollatorForTokenClassification, + HfArgumentParser, + PreTrainedTokenizerFast, + Trainer, + TrainingArguments, + set_seed, + BertForTokenClassification, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process + + +logger = logging.getLogger(__name__) +# from models.long_doc_kp_models import LONG_DOC_KP_MODELS + +# KPE_MODELS_DICT={ +# 'others': AutoModelForTokenClassification, +# "longformer": +# 'reformer' : +# 'crf_longformer' : +# 'crf_bert': BERT_CRFforTokenClassification +# } + +CRF_MODEL_DICT = { + "bert": BERT_CRFforTokenClassification, + "longformer": Longformer_CRFforTokenClassification, +} +TOKEN_MODEL_DICT = { + "bert": BertForTokenClassification, + "longformer": LongformerForTokenClassification, + "reformer": ReformerForTokenClassification, + # 'bigbird':BigBirdForTokenClassification +} +GUIDED_MODEL_DICT = { + "bert": ModiBertForTokenClassification, + "longformer": ModiLongformerForTokenClassification, +} +MODEL_DICT = { + "crf": CRF_MODEL_DICT, + "simple": TOKEN_MODEL_DICT, + "guided": GUIDED_MODEL_DICT, +} + +# KPE_MODELS_DICT = KPE_MODELS_DICT | LONG_DOC_KP_MODELS + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_family_name: str = field( + metadata={ + "help": "name of the family of model, bert, longformer, reformer etc." 
+ } + ) + model_name_or_path: str = field( + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models" + } + ) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Where do you want to store the pretrained models downloaded from huggingface.co" + }, + ) + model_revision: str = field( + default="main", + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)." + }, + ) + use_CRF: bool = field( + default=False, + metadata={"help": "wether to use CRF on top of the classifier"}, + ) + use_BiLSTM: bool = field( + default=False, + metadata={"help": "use BiLSTM in sequence classification"}, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + task_name: Optional[str] = field( + default="simple", metadata={"help": "The name of the task simple, crf"} + ) + + train_file: Optional[str] = field( + default=None, + metadata={"help": "The input training data file (a csv or JSON file)."}, + ) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to predict on (a csv or JSON file)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, metadata={"help": "calculate entity level metric"} + ) + overwrite_cache: bool = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + label_all_tokens: bool = field( + default=False, + metadata={ + "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " + "one (in which case the other tokens will have a padding index)." + }, + ) + return_entity_level_metrics: bool = field( + default=False, + metadata={ + "help": "Whether to return all the entity levels during evaluation or just the overall ones." + }, + ) + dataset_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the dataset to use (via the datasets library)."}, + ) + dataset_config_name: Optional[str] = field( + default=None, + metadata={ + "help": "The configuration name of the dataset to use (via the datasets library)." + }, + ) + cache_file_name: Optional[str] = field( + default=None, + metadata={ + "help": "Provide the name of a path for the cache file. It is used to store the results of the computation instead of the automatically generated cache file name." 
+        },
+    )
+
+    def __post_init__(self):
+        if (
+            self.dataset_name is None
+            and self.train_file is None
+            and self.validation_file is None
+        ):
+            raise ValueError(
+                "Need either a dataset name or a training/validation file."
+            )
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in [
+                    "csv",
+                    "json",
+                ], "`train_file` should be a csv or a json file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in [
+                    "csv",
+                    "json",
+                ], "`validation_file` should be a csv or a json file."
+        self.task_name = self.task_name.lower()
+
+
+"""## main trainer function"""
+
+TRAINER_DICT = {"crf": CRF_Trainer, "simple": Trainer, "guided": Trainer}
+
+
+def main_run_kpe(model_args, data_args, training_args):
+
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We keep distinct sets of args, for a cleaner separation of concerns.
+
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if (
+        os.path.isdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(
+        logging.INFO if is_main_process(training_args.local_rank) else logging.WARN
+    )
+
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_rank):
+        transformers.utils.logging.set_verbosity_info()
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed before initializing model.
+ set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + ## get dataset in here + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + datasets = load_dataset( + extension, data_files=data_files + ) ##CR get dataset in here + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + if training_args.do_train: + column_names = datasets["train"].column_names + features = datasets["train"].features + else: + column_names = datasets["validation"].column_names + features = datasets["validation"].features + text_column_name = "text" if "text" in column_names else column_names[0] + label_column_name = "BIO_tags" if "BIO_tags" in column_names else column_names[1] + + # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the + # unique labels. + def get_label_list(labels): + unique_labels = set() + for label in labels: + unique_labels = unique_labels | set(label) + label_list = list(unique_labels) + label_list.sort() + return label_list + + if isinstance(features[label_column_name].feature, ClassLabel): + label_list = features[label_column_name].feature.names + # No need to convert the labels since they are already ints. + label_to_id = {i: i for i in range(len(label_list))} + else: + label_list = get_label_list( + datasets["train"][label_column_name] + if training_args.do_train + else datasets["validation"][label_column_name] + ) + label_to_id = {l: i for i, l in enumerate(label_list)} + num_labels = len(label_list) + print(label_to_id) + id2tag = {} + for k in label_to_id.keys(): + id2tag[label_to_id[k]] = k + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
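+    # The concrete model class is resolved by a two-level lookup on the task and
+    # the model family, e.g. (sketch):
+    #   MODEL_DICT["crf"]["bert"]          -> BERT_CRFforTokenClassification
+    #   MODEL_DICT["simple"]["longformer"] -> LongformerForTokenClassification
+    #   MODEL_DICT["guided"]["bert"]       -> ModiBertForTokenClassification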
+ config = AutoConfig.from_pretrained( + model_args.config_name + if model_args.config_name + else model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=model_args.cache_dir, + ) + config.use_CRF = model_args.use_CRF ##CR replace from arguments + config.use_BiLSTM = False + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name + if model_args.tokenizer_name + else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + add_prefix_space=True, + ) + model = MODEL_DICT[data_args.task_name][ + model_args.model_family_name + ].from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=model_args.cache_dir, + ) + # model.freeze_encoder_layer() + print("model") + # print(model) + if tokenizer.pad_token is None: + + tokenizer.pad_token = tokenizer.eos_token + config.pad_token_id = config.eos_token_id + + # Tokenizer check: this script requires a fast tokenizer. + # if not isinstance(tokenizer, PreTrainedTokenizerFast): + # raise ValueError( + # "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + # "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " + # "requirement" + # ) + + # Preprocessing the dataset + # Padding strategy + padding = "max_length" if data_args.pad_to_max_length else False + + # Tokenize all texts and align the labels with them. + def tokenize_and_align_labels(examples): + tokenized_inputs = tokenizer( + examples[text_column_name], + padding=padding, + truncation=True, + # We use this argument because the texts in our dataset are lists of words (with a label for each word). + is_split_into_words=True, + ) + labels = [] + for i, label in enumerate(examples[label_column_name]): + word_ids = tokenized_inputs.word_ids(batch_index=i) + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + # Special tokens have a word id that is None. We set the label to -100 so they are automatically + # ignored in the loss function. + if word_idx is None: + # label_ids.append(-100) + label_ids.append( + 2 + ) # to avoid error change -100 to 'O' tag i.e. 2 class + # We set the label for the first token of each word. + elif word_idx != previous_word_idx: + label_ids.append(label_to_id[label[word_idx]]) + # For the other tokens in a word, we set the label to either the current label or -100, depending on + # the label_all_tokens flag. + else: + label_ids.append( + label_to_id[label[word_idx]] + if data_args.label_all_tokens + else -100 + ) + # to avoid error change -100 to 'O' tag i.e. 
2 class + # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2) + previous_word_idx = word_idx + + labels.append(label_ids) + if data_args.task_name == "guided": + tokenized_inputs["guide_embed"] = examples["guide_embed"] + tokenized_inputs["labels"] = labels + # tokenized_inputs['paper_id']= examples['paper_id'] + # tokenized_inputs['extractive_keyphrases']= examples['extractive_keyphrases'] + + return tokenized_inputs + + tokenized_datasets = datasets.map( + tokenize_and_align_labels, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + # cache_file_name= data_args.cache_file_name + ) + + # Data collator + data_collator = DataCollatorForTokenClassification( + tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None + ) + + from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score + from seqeval.scheme import IOB2, IOB1 + + def compute_metrics(p): + predictions, labels = p + # print(predictions.shape, labels.shape) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + results = {} + # print("cal precisi") + results["overall_precision"] = precision_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_recall"] = recall_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + # print("cal f1") + results["overall_f1"] = f1_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) + if data_args.return_entity_level_metrics: + # Unpack nested dictionaries + final_results = {} + # print("cal entity level mat") + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f"{key}_{n}"] = v + else: + final_results[key] = value + return final_results + else: + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + + # Initialize our Trainer + # metric = load_metric("seqeval") + + # def compute_metrics(p): + # predictions, labels = p + # predictions = np.argmax(predictions, axis=2) + + # # Remove ignored index (special tokens) + # true_predictions = [ + # [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + # for prediction, label in zip(predictions, labels) + # ] + # true_labels = [ + # [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + # for prediction, label in zip(predictions, labels) + # ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + # if data_args.return_entity_level_metrics: + # # Unpack nested dictionaries + # final_results = {} + # for key, value in results.items(): + # if isinstance(value, dict): + # for n, v in value.items(): + # final_results[f"{key}_{n}"] = v + # else: + # final_results[key] = value + # return final_results + # else: + # return { + # "precision": results["overall_precision"], + # "recall": results["overall_recall"], + # 
"f1": results["overall_f1"], + # "accuracy": results["overall_accuracy"], + # } + + trainer = TRAINER_DICT[data_args.task_name]( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"] if training_args.do_train else None, + eval_dataset=tokenized_datasets["validation"] + if training_args.do_eval + else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + # Training + if training_args.do_train: + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif os.path.isdir(model_args.model_name_or_path): + checkpoint = model_args.model_name_or_path + else: + checkpoint = None + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + output_train_file = os.path.join(training_args.output_dir, "train_results.txt") + if trainer.is_world_process_zero(): + with open(output_train_file, "w") as writer: + logger.info("***** Train results *****") + for key, value in sorted(train_result.metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Need to save the state, since Trainer.save_model saves only the tokenizer with the model + trainer.state.save_to_json( + os.path.join(training_args.output_dir, "trainer_state.json") + ) + + # Evaluation + results = {} + # if training_args.do_eval: + + # logger.info("*** Evaluate ***") + + # results = trainer.evaluate() + + # output_eval_file = os.path.join(training_args.output_dir, "eval_results_KPE.txt") + # if trainer.is_world_process_zero(): + # with open(output_eval_file, "w") as writer: + # logger.info("***** Eval results *****") + # for key, value in results.items(): + # logger.info(f" {key} = {value}") + # writer.write(f"{key} = {value}\n") + + # Predict + if training_args.do_predict: + logger.info("*** Predict ***") + + test_dataset = tokenized_datasets["test"] + predictions, labels, metrics = trainer.predict(test_dataset) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + output_test_results_file = os.path.join( + training_args.output_dir, "test_results.txt" + ) + if trainer.is_world_process_zero(): + with open(output_test_results_file, "w") as writer: + for key, value in sorted(metrics.items()): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Save predictions + def get_kp_from_BIO(examples, i): + # kps= [] + # for i in range(len(prediction)): + ids = examples["input_ids"] + # print(examples.keys()) + + # print(tags) + def mmkp(tag_): + current_kps = [] + ckp = [] + prev_tag = None + for j, tag in enumerate(tag_): + id = ids[j] + + if tag == "O" and len(ckp) > 0: + + current_kps.append(ckp) + ckp = [] + elif tag == "B": + # print(ckp, tag) + if ( + tokenizer.convert_ids_to_tokens(id).startswith("##") + or prev_tag == "B" + ): + ckp.append(id) + else: + if len(ckp) > 0: + current_kps.append(ckp) + ckp = [] + + ckp.append(id) + # print(ckp, id) + + elif tag == "I" and len(ckp) > 0: + ckp.append(id) + prev_tag = tag + decoded_kps = [] + if len(ckp) > 0: + current_kps.append(ckp) + if len(current_kps) > 0: + decoded_kps = tokenizer.batch_decode( + current_kps, + 
skip_special_tokens=True, + clean_up_tokenization_spaces=True, + ) + # print(decoded_kps) + return decoded_kps + + tags = true_predictions[i] + decoded_kps = mmkp(tags) + + ttgs = true_labels[i] + eekp = mmkp(ttgs) + + # examples['kp_predicted']= decoded_kps + examples["kp_predicted"] = list(dict.fromkeys(decoded_kps)) + examples["eekp"] = list(dict.fromkeys(eekp)) + # examples['eekp']= eekp + # else: + # examples['kp_predicted']= [''] + examples["id"] = i + return examples + + import pandas as pd + + output_test_predictions_file = os.path.join( + training_args.output_dir, "test_predictions.csv" + ) + output_test_predictions_BIO_file = os.path.join( + training_args.output_dir, "test_predictions_BIO.txt" + ) + if trainer.is_world_process_zero(): + print(test_dataset, len(test_dataset["paper_id"])) + ppid = test_dataset["paper_id"] + # ekp= test_dataset['extractive_keyphrases'] + + test_dataset = test_dataset.map( + get_kp_from_BIO, + num_proc=data_args.preprocessing_num_workers, + with_indices=True, + ) + # input_columns= ['paper_id','input_ids','extractive_keyphrases'] + print(test_dataset, " agian") + df = pd.DataFrame.from_dict( + { + "id": ppid, + "extractive_keyphrase": test_dataset["eekp"], + "keyphrases": test_dataset["kp_predicted"], + } + ) + df.to_csv(output_test_predictions_file, index=False) + + # get BIO tag files + + with open(output_test_predictions_BIO_file, "w") as writer: + for prediction in true_predictions: + writer.write(" ".join(prediction) + "\n") + + return results + + +"""# guided long former""" +# please select one of ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup'] +def longformer_guided_runner(): + training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/long_sml_oakgx_abs_guided", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=2, + per_device_eval_batch_size=16, + gradient_accumulation_steps=4, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=1000, + eval_steps=1000, + # lr_scheduler_type= 'cosine', + warmup_steps=100, + logging_steps=100 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="longformer", + model_name_or_path="allenai/longformer-base-4096", + use_CRF=False, + ) + data_args = DataTrainingArguments( + task_name="guided", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/oagkx/small_abs_guided/sml_train_abs_oagkx.json", + validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/oagkx/small_abs_guided/sml_test_abs_oagkx.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +def longformer_runner(): + training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/longformer_medium_kp20k_try", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=2, + per_device_eval_batch_size=16, + gradient_accumulation_steps=4, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=1000, + eval_steps=1000, + # lr_scheduler_type= 'cosine', + warmup_steps=200, + logging_steps=100 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="longformer", + model_name_or_path="allenai/longformer-base-4096", + use_CRF=False, + ) + data_args = 
DataTrainingArguments( + task_name="simple", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", + validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +# longformer_modi_runner() +import os + + +def longformer_guided_predict(ckpt, val, test, out=None): + if out is None: + out = ckpt + training_args = TrainingArguments( + output_dir=os.path.join(out, "predict/"), # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=3, + per_device_train_batch_size=2, + per_device_eval_batch_size=16, + gradient_accumulation_steps=2, + do_train=False, + do_eval=False, + do_predict=True, + evaluation_strategy="steps", + save_steps=750, + eval_steps=750, + # lr_scheduler_type= 'cosine', + warmup_steps=50, + logging_steps=50 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="longformer", model_name_or_path=ckpt, use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="guided", + # train_file= "/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll_bert_text_rank/train_m_TR_B.json", + validation_file=val, + test_file=test, + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=5, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +def longformer_predict(ckpt, val, test, out=None): + if out is None: + out = ckpt + training_args = TrainingArguments( + output_dir=os.path.join(out, "predict/"), # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=3, + per_device_train_batch_size=1, + per_device_eval_batch_size=8, + gradient_accumulation_steps=1, + do_train=False, + do_eval=False, + do_predict=True, + evaluation_strategy="steps", + save_steps=750, + eval_steps=750, + # lr_scheduler_type= 'cosine', + warmup_steps=50, + logging_steps=50 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="longformer", model_name_or_path=ckpt, use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="simple", + # train_file= "/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll_bert_text_rank/train_m_TR_B.json", + validation_file=val, + test_file=test, + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +"""# guided BERT""" + + +def bert_guided_runner(): + training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/bert_inspec_text_rank_bert", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=4, + per_device_eval_batch_size=16, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=200, + eval_steps=200, + logging_steps=50, + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="bert", model_name_or_path="bert-base-uncased", use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="guided", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/inspec/conll_train_textrank_inspec.json", + 
validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/inspec/conll_valid_textrank_inspec.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=5, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +def bert_guided_predict(ckpt, val, test, out=None): + if out is None: + out = ckpt + training_args = TrainingArguments( + output_dir=os.path.join(out, "predict/"), # todo + learning_rate=3e-5, + overwrite_output_dir=True, + per_device_eval_batch_size=64, + # gradient_accumulation_steps=2, + do_train=False, + do_eval=False, + do_predict=True, + ) + mdl_args = ModelArguments( + model_family_name="bert", model_name_or_path=ckpt, use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="guided", + # train_file= "/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll_bert_text_rank/train_m_TR_B.json", + validation_file=val, + test_file=test, + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=5, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +# bert_modi_runner() + +"""#BERT [for testing PoV] + +## bert plane +""" + + +def bert_runner(): + training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/bert_small_oagkx", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=8, + per_device_eval_batch_size=32, + # gradient_accumulation_steps=2, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=750, + eval_steps=750, + # lr_scheduler_type= 'cosine', + warmup_steps=80, + logging_steps=100 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="bert", model_name_or_path="bert-base-uncased", use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="simple", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/oagkx/conll_small_train_oagkx.json", + validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/oagkx/conll_small_valid_oagkx.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +def bert_predict(ckpt, val, test, out=None): + if out is None: + out = ckpt + training_args = TrainingArguments( + output_dir=os.path.join(out, "predict/"), # todo + learning_rate=3e-5, + overwrite_output_dir=True, + per_device_eval_batch_size=16, + # gradient_accumulation_steps=2, + do_train=False, + do_eval=False, + do_predict=True, + ) + mdl_args = ModelArguments( + model_family_name="bert", model_name_or_path=ckpt, use_CRF=False + ) + data_args = DataTrainingArguments( + task_name="simple", + # train_file= "/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll_bert_text_rank/train_m_TR_B.json", + validation_file=val, + test_file=test, + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=5, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +for i in range(torch.cuda.device_count()): + print("davailabel gpus are") + print(torch.cuda.get_device_name(i)) + + +def bigbird_runner(): + training_args = TrainingArguments( + 
output_dir="/media/nas_mount/Debanjan/amardeep/output/bigbird_medium_kp20k", # todo + learning_rate=3e-5, + overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=2, + per_device_eval_batch_size=2, + gradient_accumulation_steps=4, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=10, + eval_steps=10, + # lr_scheduler_type= 'cosine', + warmup_steps=200, + logging_steps=100 + # weight_decay =0.001 + ) + mdl_args = ModelArguments( + model_family_name="bigbird", + model_name_or_path="google/bigbird-roberta-base", + use_CRF=False, + ) + + data_args = DataTrainingArguments( + task_name="simple", + train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", + validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, + ) + + main_run_kpe(model_args=mdl_args, data_args=data_args, training_args=training_args) + + +# bigbird_runner() + +# longformer_guided_predict() +# bert_guided_predict() +# bert_modi_runner() +longformer_runner() +# bert_guided_runner() +# bert_crf_runner() +# bert_runner() +# longformer_guided_runner() From 2e04bfe1807b359e5000b06de525885e42f2c9a6 Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Wed, 26 Jan 2022 22:17:55 +0530 Subject: [PATCH 06/17] f formatting and re arch --- dlkp/kp_metrics/metrics.py | 52 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 dlkp/kp_metrics/metrics.py diff --git a/dlkp/kp_metrics/metrics.py b/dlkp/kp_metrics/metrics.py new file mode 100644 index 0000000..73328d2 --- /dev/null +++ b/dlkp/kp_metrics/metrics.py @@ -0,0 +1,52 @@ +from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score +from seqeval.scheme import IOB2, IOB1 + + +def compute_metrics(p): + predictions, labels = p + # print(predictions.shape, labels.shape) + # if model_args.use_CRF is False: + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + # results = metric.compute(predictions=true_predictions, references=true_labels) + results = {} + # print("cal precisi") + results["overall_precision"] = precision_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_recall"] = recall_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + # print("cal f1") + results["overall_f1"] = f1_score( + true_labels, true_predictions, mode="strict", scheme=IOB2 + ) + results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) + if data_args.return_entity_level_metrics: + # Unpack nested dictionaries + final_results = {} + # print("cal entity level mat") + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f"{key}_{n}"] = v + else: + final_results[key] = value + return final_results + else: + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } From 6c8fda818f9e86dec592da56988e5015529fc1dd Mon Sep 17 00:00:00 2001 From: 
Amardeep Kumar Date: Wed, 26 Jan 2022 22:28:18 +0530 Subject: [PATCH 07/17] f formatting and re arch --- dlkp/models/ke/kpe.py | 74 ++----------------------------------------- 1 file changed, 2 insertions(+), 72 deletions(-) diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 556048b..721d284 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -75,7 +75,6 @@ def main_run_kpe(model_args, data_args, training_args): # See all possible arguments in src/transformers/training_args.py - # parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) # if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): @@ -110,9 +109,7 @@ def main_run_kpe(model_args, data_args, training_args): datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) - logger.setLevel( - logging.INFO if is_main_process(training_args.local_rank) else logging.INFO - ) + logger.setLevel(logging.INFO) # logger.set_global_logging_level(logging.INFO) # Log on each process the small summary: @@ -130,16 +127,6 @@ def main_run_kpe(model_args, data_args, training_args): # Set seed before initializing model. set_seed(training_args.seed) - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - ## get dataset in here if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) @@ -154,12 +141,7 @@ def main_run_kpe(model_args, data_args, training_args): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - datasets = load_dataset( - extension, data_files=data_files - ) ##CR get dataset in here - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. 
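+    # For local files, the call below resolves the dataset builder from the
+    # file extension; a minimal sketch, assuming JSON inputs:
+    #   load_dataset("json", data_files={"train": "train.json", "validation": "test.json"})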
- + datasets = load_dataset(extension, data_files=data_files) if training_args.do_train: column_names = datasets["train"].column_names features = datasets["train"].features @@ -303,58 +285,6 @@ def tokenize_and_align_labels(examples): tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None ) - from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score - from seqeval.scheme import IOB2, IOB1 - - def compute_metrics(p): - predictions, labels = p - # print(predictions.shape, labels.shape) - # if model_args.use_CRF is False: - predictions = np.argmax(predictions, axis=2) - - # Remove ignored index (special tokens) - true_predictions = [ - [label_list[p] for (p, l) in zip(prediction, label) if l != -100] - for prediction, label in zip(predictions, labels) - ] - true_labels = [ - [label_list[l] for (p, l) in zip(prediction, label) if l != -100] - for prediction, label in zip(predictions, labels) - ] - - # results = metric.compute(predictions=true_predictions, references=true_labels) - results = {} - # print("cal precisi") - results["overall_precision"] = precision_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) - results["overall_recall"] = recall_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) - # print("cal f1") - results["overall_f1"] = f1_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) - results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) - if data_args.return_entity_level_metrics: - # Unpack nested dictionaries - final_results = {} - # print("cal entity level mat") - for key, value in results.items(): - if isinstance(value, dict): - for n, v in value.items(): - final_results[f"{key}_{n}"] = v - else: - final_results[key] = value - return final_results - else: - return { - "precision": results["overall_precision"], - "recall": results["overall_recall"], - "f1": results["overall_f1"], - "accuracy": results["overall_accuracy"], - } - # Initialize our Trainer # metric = load_metric("seqeval") From d847f7f014236ad50c4e62077c232af9dea8218e Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Wed, 26 Jan 2022 22:28:27 +0530 Subject: [PATCH 08/17] f formatting and re arch --- dlkp/datasets/pre_process.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dlkp/datasets/pre_process.py diff --git a/dlkp/datasets/pre_process.py b/dlkp/datasets/pre_process.py new file mode 100644 index 0000000..e69de29 From 5d08559abc58dd6a5b3e00c4efd2f55c26753372 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Sat, 5 Feb 2022 01:19:28 +0530 Subject: [PATCH 09/17] added AutoModel for CRf classification ca etc --- dlkp/kp_metrics/metrics.py | 13 ++++++++----- dlkp/models/ke/crf/crf_utils.py | 6 +++++- dlkp/models/ke/extraction_utils.py | 2 +- .../ke/transformer/token_classification_models.py | 3 --- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/dlkp/kp_metrics/metrics.py b/dlkp/kp_metrics/metrics.py index 73328d2..33cc2e0 100644 --- a/dlkp/kp_metrics/metrics.py +++ b/dlkp/kp_metrics/metrics.py @@ -1,20 +1,23 @@ from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score from seqeval.scheme import IOB2, IOB1 +import numpy as np -def compute_metrics(p): - predictions, labels = p +def compute_metrics( + predictions, labels, return_entity_level_metrics=True, ignore_value=-100 +): + # predictions, labels = p # print(predictions.shape, labels.shape) # if model_args.use_CRF is False: predictions = np.argmax(predictions, axis=2) # Remove ignored 
index (special tokens) true_predictions = [ - [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + [p for (p, l) in zip(prediction, label) if l != ignore_value] for prediction, label in zip(predictions, labels) ] true_labels = [ - [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + [l for (p, l) in zip(prediction, label) if l != ignore_value] for prediction, label in zip(predictions, labels) ] @@ -32,7 +35,7 @@ def compute_metrics(p): true_labels, true_predictions, mode="strict", scheme=IOB2 ) results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) - if data_args.return_entity_level_metrics: + if return_entity_level_metrics: # Unpack nested dictionaries final_results = {} # print("cal entity level mat") diff --git a/dlkp/models/ke/crf/crf_utils.py b/dlkp/models/ke/crf/crf_utils.py index 1d1c240..45e3d6b 100644 --- a/dlkp/models/ke/crf/crf_utils.py +++ b/dlkp/models/ke/crf/crf_utils.py @@ -1,15 +1,19 @@ """ Conditional random field utilis file """ -from typing import List, Tuple, Dict, Union +from typing import List, Tuple, Dict, Union, Optional import torch +import math +import logging # from allennlp.common.checks import ConfigurationError # import allennlp.nn.util as util VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score +logger = logging.get_logger(__name__) + def allowed_transitions( constraint_type: str, labels: Dict[int, str] diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py index 7f34719..7d03539 100644 --- a/dlkp/models/ke/extraction_utils.py +++ b/dlkp/models/ke/extraction_utils.py @@ -49,7 +49,7 @@ class ModelArguments: default=False, metadata={"help": "wether to use CRF on top of the classifier"}, ) - use_BiLSTM: bool = field( + use_BiLSTM: bool = field( # not necessary default=False, metadata={"help": "use BiLSTM in sequence classification"}, ) diff --git a/dlkp/models/ke/transformer/token_classification_models.py b/dlkp/models/ke/transformer/token_classification_models.py index d3b628c..d286ac1 100644 --- a/dlkp/models/ke/transformer/token_classification_models.py +++ b/dlkp/models/ke/transformer/token_classification_models.py @@ -89,6 +89,3 @@ def forward( # if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output - - -s From 4492c25f2bfb544ee583ff354fa87ca7436d4e7e Mon Sep 17 00:00:00 2001 From: Amardeep Date: Sat, 5 Feb 2022 01:19:56 +0530 Subject: [PATCH 10/17] added AutoModel for CRf classification --- dlkp/models/ke/transformer/crf_models.py | 71 ++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index da26337..0d8624a 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -13,6 +13,77 @@ from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel +class AutoCRFforTokenClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bert = AutoModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + # self.crf= nn.Linear(config.num_labels,1) + # self.crf= ConditionalRandomField(self.num_labels) + self.crf = ConditionalRandomField( + self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "0"} + ) + self.init_weights() + + def forward( + self, + 
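+        # NOTE: token_type_ids is not accepted in this signature, so segment
+        # embeddings are left at their defaults for encoders that use them.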
input_ids=None, + position_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.bert( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + loss = None + if labels is not None: + loss = -self.crf(logits, labels, attention_mask) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + # print(self.crf.transitions) + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def freeze_till_clf(self): + for param in self.bert.parameters(): + param.requires_grad = False + for param in self.dropout.parameters(): + param.requires_grad = False + for param in self.classifier.parameters(): + param.requires_grad = False + + def freeze_encoder_layer(self): + for param in self.bert.parameters(): + param.requires_grad = False + + class BERT_CRFforTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) From 5fbc15ad369ab8244c2bc9d4f516f4e3e04eb6d2 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Sat, 5 Feb 2022 15:39:49 +0530 Subject: [PATCH 11/17] add runner --- dlkp/models/ke/extraction_utils.py | 19 +++---- dlkp/models/ke/kpe.py | 87 ++++++++++-------------------- examples/ke/run_auto_token_ke.py | 38 +++++++++++++ 3 files changed, 77 insertions(+), 67 deletions(-) create mode 100644 examples/ke/run_auto_token_ke.py diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py index 7d03539..cf913be 100644 --- a/dlkp/models/ke/extraction_utils.py +++ b/dlkp/models/ke/extraction_utils.py @@ -11,16 +11,17 @@ class ModelArguments: Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. """ - model_family_name: str = field( - metadata={ - "help": "name of the family of model, bert, longformer, reformer etc." - } - ) model_name_or_path: str = field( metadata={ "help": "Path to pretrained model or model identifier from huggingface.co/models" } ) + model_family_name: str = field( + default="auto", + metadata={ + "help": "name of the family of model, bert, longformer, reformer etc." + }, + ) config_name: Optional[str] = field( default=None, metadata={ @@ -62,7 +63,7 @@ class DataTrainingArguments: """ task_name: Optional[str] = field( - default="simple", metadata={"help": "The name of the task simple, crf"} + default="token", metadata={"help": "The name of the task token, crf"} ) train_file: Optional[str] = field( @@ -98,7 +99,7 @@ class DataTrainingArguments: }, ) label_all_tokens: bool = field( - default=False, + default=True, metadata={ "help": "Whether to put the label for one word on all tokens of generated by that word or just on the " "one (in which case the other tokens will have a padding index)." 
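        # e.g. with label_all_tokens=True a word tokenized into ["key", "##phrase"]
        # gets a label on both pieces; with False the trailing piece is set to -100
        # and ignored by the loss.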
@@ -111,11 +112,11 @@ class DataTrainingArguments: }, ) dataset_name: Optional[str] = field( - default=None, + default="midas/inspec", metadata={"help": "The name of the dataset to use (via the datasets library)."}, ) dataset_config_name: Optional[str] = field( - default=None, + default="extraction", metadata={ "help": "The configuration name of the dataset to use (via the datasets library)." }, diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 721d284..643d3c2 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -45,16 +45,21 @@ ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformer.crf_models import BERT_CRFforTokenClassification +from transformer.crf_models import ( + BERT_CRFforTokenClassification, + AutoCRFforTokenClassification, +) from transformer.token_classification_models import LongformerForTokenClassification from crf.crf_trainer import CRF_Trainer from extraction_utils import ModelArguments, DataTrainingArguments +from kp_metrics.metrics import compute_metrics logger = logging.getLogger(__name__) CRF_MODEL_DICT = { "bert": BERT_CRFforTokenClassification, + "auto": AutoCRFforTokenClassification, # "longformer": Longformer_CRFforTokenClassification, } TOKEN_MODEL_DICT = { @@ -63,16 +68,16 @@ # "reformer": ReformerForTokenClassification, } -MODEL_DICT = {"crf": CRF_MODEL_DICT, "simple": TOKEN_MODEL_DICT} +MODEL_DICT = {"crf": CRF_MODEL_DICT, "token": TOKEN_MODEL_DICT} TRAINER_DICT = { "crf": CRF_Trainer, - "simple": Trainer, + "token": Trainer, } -def main_run_kpe(model_args, data_args, training_args): +def run_kpe(model_args, data_args, training_args): # See all possible arguments in src/transformers/training_args.py @@ -148,11 +153,13 @@ def main_run_kpe(model_args, data_args, training_args): else: column_names = datasets["validation"].column_names features = datasets["validation"].features - text_column_name = "text" if "text" in column_names else column_names[0] - label_column_name = "BIO_tags" if "BIO_tags" in column_names else column_names[1] + text_column_name = ( + "document" if "document" in column_names else column_names[1] + ) # either document or 2nd column as text i/p + label_column_name = ( + "doc_bio_tags" if "doc_bio_tags" in column_names else column_names[2] + ) # either doc_bio_tags column should be available or 3 rd columns will be considered as tag - # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the - # unique labels. 
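    # get_label_list() below makes a single pass over every tag sequence and
    # returns a sorted, de-duplicated label inventory, e.g. ["B", "I", "O"]
    # for plain BIO data.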
def get_label_list(labels): unique_labels = set() for label in labels: @@ -173,7 +180,7 @@ def get_label_list(labels): ) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) - print(label_to_id) + print("label to id", label_to_id) id2tag = {} for k in label_to_id.keys(): id2tag[label_to_id[k]] = k @@ -190,7 +197,7 @@ def get_label_list(labels): cache_dir=model_args.cache_dir, ) config.use_CRF = model_args.use_CRF ##CR replace from arguments - config.use_BiLSTM = False + config.use_BiLSTM = model_args.use_BiLSTM tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name @@ -210,7 +217,6 @@ def get_label_list(labels): print("model") # print(model) if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token config.pad_token_id = config.eos_token_id @@ -286,40 +292,6 @@ def tokenize_and_align_labels(examples): ) # Initialize our Trainer - # metric = load_metric("seqeval") - - # def compute_metrics(p): - # predictions, labels = p - # predictions = np.argmax(predictions, axis=2) - - # # Remove ignored index (special tokens) - # true_predictions = [ - # [label_list[p] for (p, l) in zip(prediction, label) if l != -100] - # for prediction, label in zip(predictions, labels) - # ] - # true_labels = [ - # [label_list[l] for (p, l) in zip(prediction, label) if l != -100] - # for prediction, label in zip(predictions, labels) - # ] - - # results = metric.compute(predictions=true_predictions, references=true_labels) - # if data_args.return_entity_level_metrics: - # # Unpack nested dictionaries - # final_results = {} - # for key, value in results.items(): - # if isinstance(value, dict): - # for n, v in value.items(): - # final_results[f"{key}_{n}"] = v - # else: - # final_results[key] = value - # return final_results - # else: - # return { - # "precision": results["overall_precision"], - # "recall": results["overall_recall"], - # "f1": results["overall_f1"], - # "accuracy": results["overall_accuracy"], - # } trainer = TRAINER_DICT[data_args.task_name]( model=model, @@ -359,19 +331,18 @@ def tokenize_and_align_labels(examples): # Evaluation results = {} - # if training_args.do_eval: - - # logger.info("*** Evaluate ***") - - # results = trainer.evaluate() - - # output_eval_file = os.path.join(training_args.output_dir, "eval_results_KPE.txt") - # if trainer.is_world_process_zero(): - # with open(output_eval_file, "w") as writer: - # logger.info("***** Eval results *****") - # for key, value in results.items(): - # logger.info(f" {key} = {value}") - # writer.write(f"{key} = {value}\n") + if training_args.do_eval: + logger.info("*** Evaluate ***") + results = trainer.evaluate() + output_eval_file = os.path.join( + training_args.output_dir, "eval_results_KPE.txt" + ) + if trainer.is_world_process_zero(): + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key, value in results.items(): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") # Predict if training_args.do_predict: diff --git a/examples/ke/run_auto_token_ke.py b/examples/ke/run_auto_token_ke.py new file mode 100644 index 0000000..f84ff7e --- /dev/null +++ b/examples/ke/run_auto_token_ke.py @@ -0,0 +1,38 @@ +from dlkp.models.ke.kpe import run_kpe, TrainingArguments +from dlkp.models.ke.extraction_utils import DataTrainingArguments, ModelArguments + +training_args = TrainingArguments( + output_dir="/media/nas_mount/Debanjan/amardeep/output/longformer_medium_kp20k_try", # todo + learning_rate=3e-5, + 
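+    # 3e-5 sits in the usual 1e-5 to 5e-5 fine-tuning range for transformer
+    # encoders; with evaluation_strategy="steps", the eval_steps/save_steps
+    # values below set the evaluation and checkpoint cadence.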
overwrite_output_dir=True, + num_train_epochs=4, + per_device_train_batch_size=2, + per_device_eval_batch_size=16, + gradient_accumulation_steps=4, + do_train=True, + do_eval=True, + evaluation_strategy="steps", + save_steps=1000, + eval_steps=1000, + # lr_scheduler_type= 'cosine', + warmup_steps=200, + logging_steps=100 + # weight_decay =0.001 +) +mdl_args = ModelArguments( + model_family_name="auto", + model_name_or_path="roberta-base", + use_CRF=False +) +data_args = DataTrainingArguments( + task_name="token", + # train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", + # validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", + dataset_name='midas/inspec', + dataset_config_name='extraction' + pad_to_max_length=True, + overwrite_cache=True, + label_all_tokens=True, + preprocessing_num_workers=8, + return_entity_level_metrics=True, +) From 9adf81e916b6a07fa04274a618af64c4f6f2e29a Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sat, 5 Feb 2022 17:16:54 +0530 Subject: [PATCH 12/17] bug fixing for KPE --- dlkp/kp_metrics/metrics.py | 27 +++++++++---------- dlkp/models/ke/crf/crf.py | 2 +- dlkp/models/ke/crf/crf_utils.py | 2 +- dlkp/models/ke/kpe.py | 20 +++++++------- dlkp/models/ke/transformer/crf_models.py | 2 +- fre_usec_cmd.txt | 2 ++ ...n_auto_token_ke.py => run_auto_token_ke.py | 25 ++++++++--------- 7 files changed, 42 insertions(+), 38 deletions(-) create mode 100644 fre_usec_cmd.txt rename examples/ke/run_auto_token_ke.py => run_auto_token_ke.py (63%) diff --git a/dlkp/kp_metrics/metrics.py b/dlkp/kp_metrics/metrics.py index 33cc2e0..5901ec4 100644 --- a/dlkp/kp_metrics/metrics.py +++ b/dlkp/kp_metrics/metrics.py @@ -3,37 +3,36 @@ import numpy as np -def compute_metrics( - predictions, labels, return_entity_level_metrics=True, ignore_value=-100 -): - # predictions, labels = p - # print(predictions.shape, labels.shape) +def compute_metrics(p): + return_entity_level_metrics = False + ignore_value = -100 + predictions, labels = p + label_to_id = {"B": 0, "I": 1, "O": 2} + id_to_label = ["B", "I", "O"] # if model_args.use_CRF is False: predictions = np.argmax(predictions, axis=2) + # print(predictions.shape, labels.shape) # Remove ignored index (special tokens) true_predictions = [ - [p for (p, l) in zip(prediction, label) if l != ignore_value] + [id_to_label[p] for (p, l) in zip(prediction, label) if l != ignore_value] for prediction, label in zip(predictions, labels) ] true_labels = [ - [l for (p, l) in zip(prediction, label) if l != ignore_value] + [id_to_label[l] for (p, l) in zip(prediction, label) if l != ignore_value] for prediction, label in zip(predictions, labels) ] # results = metric.compute(predictions=true_predictions, references=true_labels) results = {} # print("cal precisi") + # mode="strict" results["overall_precision"] = precision_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) - results["overall_recall"] = recall_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 + true_labels, true_predictions, scheme=IOB2 ) + results["overall_recall"] = recall_score(true_labels, true_predictions, scheme=IOB2) # print("cal f1") - results["overall_f1"] = f1_score( - true_labels, true_predictions, mode="strict", scheme=IOB2 - ) + results["overall_f1"] = f1_score(true_labels, true_predictions, scheme=IOB2) results["overall_accuracy"] = accuracy_score(true_labels, true_predictions) if return_entity_level_metrics: # Unpack nested dictionaries diff --git 
a/dlkp/models/ke/crf/crf.py b/dlkp/models/ke/crf/crf.py index 2ab2181..27786b5 100644 --- a/dlkp/models/ke/crf/crf.py +++ b/dlkp/models/ke/crf/crf.py @@ -1,6 +1,6 @@ # add models having crf classification layer with option of bilstm layers -from crf_utils import * +from .crf_utils import * from typing import List, Tuple, Dict, Union import torch diff --git a/dlkp/models/ke/crf/crf_utils.py b/dlkp/models/ke/crf/crf_utils.py index 45e3d6b..e9e3818 100644 --- a/dlkp/models/ke/crf/crf_utils.py +++ b/dlkp/models/ke/crf/crf_utils.py @@ -12,7 +12,7 @@ VITERBI_DECODING = Tuple[List[int], float] # a list of tags, and a viterbi score -logger = logging.get_logger(__name__) +# logger = logging.get_logger(__name__) def allowed_transitions( diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 643d3c2..a137b28 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -45,14 +45,17 @@ ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformer.crf_models import ( +from dlkp.models.ke.transformer.crf_models import ( BERT_CRFforTokenClassification, AutoCRFforTokenClassification, ) -from transformer.token_classification_models import LongformerForTokenClassification -from crf.crf_trainer import CRF_Trainer -from extraction_utils import ModelArguments, DataTrainingArguments -from kp_metrics.metrics import compute_metrics +from dlkp.models.ke.transformer.token_classification_models import ( + LongformerForTokenClassification, +) +from dlkp.models.ke.crf.crf_trainer import CRF_Trainer + +# from extraction_utils import ModelArguments, DataTrainingArguments +from dlkp.kp_metrics.metrics import compute_metrics logger = logging.getLogger(__name__) @@ -64,6 +67,7 @@ } TOKEN_MODEL_DICT = { "bert": BertForTokenClassification, + "auto": AutoModelForTokenClassification # "longformer": LongformerForTokenClassification, # "reformer": ReformerForTokenClassification, } @@ -250,10 +254,8 @@ def tokenize_and_align_labels(examples): # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: - # label_ids.append(-100) - label_ids.append( - 2 - ) # to avoid error change -100 to 'O' tag i.e. 2 class + label_ids.append(-100) + # label_ids.append(2) # to avoid error change -100 to 'O' tag i.e. 2 class # We set the label for the first token of each word. 
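            # Worked example for tokens [CLS, "deep", "learn", "##ing", SEP]:
            #   word_ids  -> [None, 0, 1, 1, None]
            #   label_ids -> [-100, id(tag_0), id(tag_1), id(tag_1) or -100, -100]
            # (specials get -100; trailing sub-tokens depend on label_all_tokens)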
elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index 0d8624a..db8c4d6 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -9,7 +9,7 @@ ) from transformers.modeling_outputs import TokenClassifierOutput import collections -from crf import ConditionalRandomField +from dlkp.models.ke.crf.crf import ConditionalRandomField from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel diff --git a/fre_usec_cmd.txt b/fre_usec_cmd.txt new file mode 100644 index 0000000..249a440 --- /dev/null +++ b/fre_usec_cmd.txt @@ -0,0 +1,2 @@ +source ../.dlkp_venv/bin/activate +conda deactivate \ No newline at end of file diff --git a/examples/ke/run_auto_token_ke.py b/run_auto_token_ke.py similarity index 63% rename from examples/ke/run_auto_token_ke.py rename to run_auto_token_ke.py index f84ff7e..cb06a20 100644 --- a/examples/ke/run_auto_token_ke.py +++ b/run_auto_token_ke.py @@ -1,38 +1,39 @@ +from statistics import mode from dlkp.models.ke.kpe import run_kpe, TrainingArguments from dlkp.models.ke.extraction_utils import DataTrainingArguments, ModelArguments training_args = TrainingArguments( - output_dir="/media/nas_mount/Debanjan/amardeep/output/longformer_medium_kp20k_try", # todo + output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_debug", # todo learning_rate=3e-5, overwrite_output_dir=True, - num_train_epochs=4, - per_device_train_batch_size=2, + num_train_epochs=5, + per_device_train_batch_size=4, per_device_eval_batch_size=16, - gradient_accumulation_steps=4, + # gradient_accumulation_steps=4, do_train=True, do_eval=True, evaluation_strategy="steps", save_steps=1000, - eval_steps=1000, + eval_steps=100, # lr_scheduler_type= 'cosine', - warmup_steps=200, + # warmup_steps=200, logging_steps=100 # weight_decay =0.001 ) mdl_args = ModelArguments( - model_family_name="auto", - model_name_or_path="roberta-base", - use_CRF=False + model_family_name="auto", model_name_or_path="roberta-base", use_CRF=False ) data_args = DataTrainingArguments( task_name="token", # train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", # validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", - dataset_name='midas/inspec', - dataset_config_name='extraction' + dataset_name="midas/inspec", + dataset_config_name="extraction", pad_to_max_length=True, overwrite_cache=True, label_all_tokens=True, preprocessing_num_workers=8, - return_entity_level_metrics=True, + # return_entity_level_metrics=True, ) + +run_kpe(mdl_args, data_args, training_args) From c022b7d70c1605559081cd6d94f04b24a19e546f Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sat, 5 Feb 2022 20:30:02 +0530 Subject: [PATCH 13/17] crf bug fixes --- dlkp/models/ke/kpe.py | 1 + dlkp/models/ke/transformer/crf_models.py | 4 ++-- fre_usec_cmd.txt | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index a137b28..06dbaeb 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -183,6 +183,7 @@ def get_label_list(labels): else datasets["validation"][label_column_name] ) label_to_id = {l: i for i, l in enumerate(label_list)} + label_to_id = {"B": 0, "I": 1, "O": 2} num_labels = len(label_list) print("label to id", label_to_id) id2tag = {} diff --git a/dlkp/models/ke/transformer/crf_models.py 
b/dlkp/models/ke/transformer/crf_models.py index db8c4d6..b0c4c68 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -17,7 +17,7 @@ class AutoCRFforTokenClassification(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.bert = AutoModel(config) + self.base_model = AutoModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) # self.crf= nn.Linear(config.num_labels,1) @@ -43,7 +43,7 @@ def forward( return_dict if return_dict is not None else self.config.use_return_dict ) - outputs = self.bert( + outputs = self.base_model( input_ids, position_ids=position_ids, attention_mask=attention_mask, diff --git a/fre_usec_cmd.txt b/fre_usec_cmd.txt index 249a440..21665ff 100644 --- a/fre_usec_cmd.txt +++ b/fre_usec_cmd.txt @@ -1,2 +1,3 @@ source ../.dlkp_venv/bin/activate -conda deactivate \ No newline at end of file +conda deactivate +CUDA_VISIBLE_DEVICES=1 python run_auto_token_ke.py \ No newline at end of file From d72e67fd679ce70bf7e65a5811a8567da4b29863 Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sun, 6 Feb 2022 23:17:45 +0530 Subject: [PATCH 14/17] fixed crf bugs --- dlkp/models/ke/crf/crf_trainer.py | 22 ++++++++------- dlkp/models/ke/transformer/crf_models.py | 35 ++++++++++++++---------- run_auto_token_ke.py | 10 +++---- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/dlkp/models/ke/crf/crf_trainer.py b/dlkp/models/ke/crf/crf_trainer.py index 5db3178..073f29e 100644 --- a/dlkp/models/ke/crf/crf_trainer.py +++ b/dlkp/models/ke/crf/crf_trainer.py @@ -208,24 +208,26 @@ def compute_loss(self, model, inputs, return_outputs=False): Subclass and override for custom behavior. """ - # if self.label_smoother is not None and "labels" in inputs: + # labels = inputs.pop("labels") + # assert "labels" in inputs + # if "labels" in inputs: # labels = inputs.pop("labels") # else: - labels = None + # labels = None # print(model) - # assert "labels" in inputs - # print(type(inputs),inputs) + # print(type(inputs), inputs.keys()) outputs = model(**inputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. if self.args.past_index >= 0: self._past = outputs[self.args.past_index] - if labels is not None: - loss = self.label_smoother(outputs, labels) - else: - # We don't use .loss here since the model may return tuples instead of ModelOutput. - # print(outputs) - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + # if labels is not None: + # # print("labels is not None") + # loss = self.label_smoother(outputs, labels) + # else: + # # We don't use .loss here since the model may return tuples instead of ModelOutput. 
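+        # For the CRF models, outputs["loss"] (or outputs[0]) is already the
+        # negative log-likelihood computed in forward() as
+        # loss = -self.crf(logits, labels, attention_mask), so no label
+        # smoothing is applied on top of it.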
+ # print(outputs.keys()) + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # print("loss is ", loss) return (loss, outputs) if return_outputs else loss diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index b0c4c68..5d2d4db 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -6,6 +6,8 @@ BertPreTrainedModel, LongformerModel, PreTrainedModel, + AutoModelForTokenClassification, + # PretrainedModel, ) from transformers.modeling_outputs import TokenClassifierOutput import collections @@ -13,17 +15,20 @@ from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel -class AutoCRFforTokenClassification(BertPreTrainedModel): +class AutoCRFforTokenClassification(AutoModelForTokenClassification): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.base_model = AutoModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) + # self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) # self.crf= nn.Linear(config.num_labels,1) # self.crf= ConditionalRandomField(self.num_labels) self.crf = ConditionalRandomField( - self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "0"} + self.num_labels, + label_encoding="BIO", + idx2tag={0: "B", 1: "I", 2: "0"}, + include_start_end_transitions=False, ) self.init_weights() @@ -89,7 +94,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) + # self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) # self.crf= nn.Linear(config.num_labels,1) # self.crf= ConditionalRandomField(self.num_labels) @@ -142,17 +147,17 @@ def forward( attentions=outputs.attentions, ) - def freeze_till_clf(self): - for param in self.bert.parameters(): - param.requires_grad = False - for param in self.dropout.parameters(): - param.requires_grad = False - for param in self.classifier.parameters(): - param.requires_grad = False - - def freeze_encoder_layer(self): - for param in self.bert.parameters(): - param.requires_grad = False + # def freeze_till_clf(self): + # for param in self.bert.parameters(): + # param.requires_grad = False + # for param in self.dropout.parameters(): + # param.requires_grad = False + # for param in self.classifier.parameters(): + # param.requires_grad = False + + # def freeze_encoder_layer(self): + # for param in self.bert.parameters(): + # param.requires_grad = False class Longformer_CRFforTokenClassification(LongformerPreTrainedModel): diff --git a/run_auto_token_ke.py b/run_auto_token_ke.py index cb06a20..7251c75 100644 --- a/run_auto_token_ke.py +++ b/run_auto_token_ke.py @@ -3,12 +3,12 @@ from dlkp.models.ke.extraction_utils import DataTrainingArguments, ModelArguments training_args = TrainingArguments( - output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_debug", # todo + output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_crf_debug", # todo learning_rate=3e-5, overwrite_output_dir=True, num_train_epochs=5, - per_device_train_batch_size=4, - per_device_eval_batch_size=16, + per_device_train_batch_size=2, + per_device_eval_batch_size=2, # gradient_accumulation_steps=4, do_train=True, do_eval=True, @@ -21,10 +21,10 @@ # weight_decay =0.001 ) mdl_args = ModelArguments( - 
model_family_name="auto", model_name_or_path="roberta-base", use_CRF=False + model_family_name="auto", model_name_or_path="roberta-base", use_CRF=True ) data_args = DataTrainingArguments( - task_name="token", + task_name="crf", # train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", # validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", dataset_name="midas/inspec", From f1b8dc933a0855a7bc11312d23d7ded12e2ea2b4 Mon Sep 17 00:00:00 2001 From: Amardeep Kumar Date: Sun, 6 Feb 2022 23:43:49 +0530 Subject: [PATCH 15/17] reformatting and refactoring --- dlkp/models/ke/extraction_utils.py | 1 + dlkp/models/ke/kpe.py | 9 ++++++--- dlkp/models/ke/transformer/crf_models.py | 2 -- run_auto_token_ke.py => run_auto_ke.py | 21 ++++++++++++--------- 4 files changed, 19 insertions(+), 14 deletions(-) rename run_auto_token_ke.py => run_auto_ke.py (70%) diff --git a/dlkp/models/ke/extraction_utils.py b/dlkp/models/ke/extraction_utils.py index cf913be..0901e0b 100644 --- a/dlkp/models/ke/extraction_utils.py +++ b/dlkp/models/ke/extraction_utils.py @@ -3,6 +3,7 @@ import sys from dataclasses import dataclass, field from typing import Optional +from transformers import TrainingArguments @dataclass diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index 06dbaeb..a2078a9 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -211,9 +211,12 @@ def get_label_list(labels): use_fast=True, add_prefix_space=True, ) - model = MODEL_DICT[data_args.task_name][ - model_args.model_family_name - ].from_pretrained( + model = ( + AutoCRFforTokenClassification + if model_args.use_CRF + else AutoModelForTokenClassification + ) + model = model.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index 5d2d4db..7cf7810 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -22,8 +22,6 @@ def __init__(self, config): self.base_model = AutoModel(config) # self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - # self.crf= nn.Linear(config.num_labels,1) - # self.crf= ConditionalRandomField(self.num_labels) self.crf = ConditionalRandomField( self.num_labels, label_encoding="BIO", diff --git a/run_auto_token_ke.py b/run_auto_ke.py similarity index 70% rename from run_auto_token_ke.py rename to run_auto_ke.py index 7251c75..84ce2b5 100644 --- a/run_auto_token_ke.py +++ b/run_auto_ke.py @@ -1,14 +1,18 @@ from statistics import mode -from dlkp.models.ke.kpe import run_kpe, TrainingArguments -from dlkp.models.ke.extraction_utils import DataTrainingArguments, ModelArguments +from dlkp.models.ke.kpe import run_kpe +from dlkp.models.ke.extraction_utils import ( + DataTrainingArguments, + ModelArguments, + TrainingArguments, +) training_args = TrainingArguments( output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_crf_debug", # todo learning_rate=3e-5, overwrite_output_dir=True, num_train_epochs=5, - per_device_train_batch_size=2, - per_device_eval_batch_size=2, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, # gradient_accumulation_steps=4, do_train=True, do_eval=True, @@ -20,11 +24,8 @@ logging_steps=100 # weight_decay =0.001 ) -mdl_args = ModelArguments( - model_family_name="auto", model_name_or_path="roberta-base", use_CRF=True -) +model_args = 
ModelArguments(model_name_or_path="roberta-base", use_CRF=True) data_args = DataTrainingArguments( - task_name="crf", # train_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/train.json", # validation_file="/media/nas_mount/Debanjan/amardeep/proc_data/kp20k/medium/conll/test.json", dataset_name="midas/inspec", @@ -36,4 +37,6 @@ # return_entity_level_metrics=True, ) -run_kpe(mdl_args, data_args, training_args) +run_kpe(model_args, data_args, training_args) + +# CUDA_VISIBLE_DEVICES=0 python run_auto_ke.py From e19ae34df8fdbed4f7b8c7ae1a0fd258c0ceabe9 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Thu, 10 Feb 2022 12:29:25 +0530 Subject: [PATCH 16/17] add relative path --- dlkp/models/ke/kpe.py | 8 ++++---- dlkp/models/ke/transformer/crf_models.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dlkp/models/ke/kpe.py b/dlkp/models/ke/kpe.py index a2078a9..996bfd3 100644 --- a/dlkp/models/ke/kpe.py +++ b/dlkp/models/ke/kpe.py @@ -45,17 +45,17 @@ ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from dlkp.models.ke.transformer.crf_models import ( +from .transformer.crf_models import ( BERT_CRFforTokenClassification, AutoCRFforTokenClassification, ) -from dlkp.models.ke.transformer.token_classification_models import ( +from .transformer.token_classification_models import ( LongformerForTokenClassification, ) -from dlkp.models.ke.crf.crf_trainer import CRF_Trainer +from .crf.crf_trainer import CRF_Trainer # from extraction_utils import ModelArguments, DataTrainingArguments -from dlkp.kp_metrics.metrics import compute_metrics +from ...kp_metrics.metrics import compute_metrics logger = logging.getLogger(__name__) diff --git a/dlkp/models/ke/transformer/crf_models.py b/dlkp/models/ke/transformer/crf_models.py index 7cf7810..75f2fff 100644 --- a/dlkp/models/ke/transformer/crf_models.py +++ b/dlkp/models/ke/transformer/crf_models.py @@ -11,8 +11,8 @@ ) from transformers.modeling_outputs import TokenClassifierOutput import collections -from dlkp.models.ke.crf.crf import ConditionalRandomField from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel +from ..crf.crf import ConditionalRandomField class AutoCRFforTokenClassification(AutoModelForTokenClassification): From 7eda3011be9accca0c61546a707d2d33bfef9729 Mon Sep 17 00:00:00 2001 From: Amardeep Date: Fri, 11 Feb 2022 19:44:00 +0530 Subject: [PATCH 17/17] add setup.py for package --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8441906..f9ab705 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,8 @@ description="A deep learning library for keyphrase extraction and generation", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", - author="Debanjan Mahata", - author_email="debanjanmahata85@gmail.com", + author="Amardeep Kumar || Debanjan Mahata", + author_email="Kumaramardipsingh@gmail.com || debanjanmahata85@gmail.com", url="https://github.com/midas-research/dlkp", packages=find_packages(exclude="tests"), # same as name license="Apache License Version 2.0",
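For reference, a minimal standalone sketch of the CRF layer these patches wire in. The constructor and forward signatures follow the crf.py code in the patches above; the tag mapping assumes the conventional BIO inventory {0: "B", 1: "I", 2: "O"}, and the tensors are illustrative random data:

import torch
from dlkp.models.ke.crf.crf import ConditionalRandomField

crf = ConditionalRandomField(
    num_tags=3,
    label_encoding="BIO",
    idx2tag={0: "B", 1: "I", 2: "O"},
    include_start_end_transitions=False,
)
logits = torch.randn(2, 6, 3)              # (batch, seq_len, num_tags) emission scores
tags = torch.randint(0, 3, (2, 6))         # gold BIO tag ids
mask = torch.ones(2, 6, dtype=torch.bool)  # True at valid (non-padding) positions
nll = -crf(logits, tags, mask)             # training loss, as in the model classes above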