diff --git a/ark_nlp/dataset/global_pointer_named_entity_recognition_dataset.py b/ark_nlp/dataset/global_pointer_named_entity_recognition_dataset.py
index f18d8e1..d713615 100644
--- a/ark_nlp/dataset/global_pointer_named_entity_recognition_dataset.py
+++ b/ark_nlp/dataset/global_pointer_named_entity_recognition_dataset.py
@@ -66,7 +66,7 @@ def _convert_to_transfomer_ids(self, bert_tokenizer):
                     continue
                 global_label[self.cat2id[info_['type']], start_idx+1, end_idx+1] = 1
 
-            global_label = torch.tensor(global_label).to_sparse()
+            global_label = global_label.to_sparse()
 
             features.append({
                 'input_ids': input_ids,
diff --git a/ark_nlp/model/ner/crf_bert/__init__.py b/ark_nlp/model/ner/crf_bert/__init__.py
index 35ef16c..62226e4 100644
--- a/ark_nlp/model/ner/crf_bert/__init__.py
+++ b/ark_nlp/model/ner/crf_bert/__init__.py
@@ -13,8 +13,8 @@
 from ark_nlp.factory.optimizer import get_default_crf_bert_optimizer as get_default_model_optimizer
 from ark_nlp.factory.optimizer import get_default_crf_bert_optimizer as get_default_crf_bert_optimizer
 
-from ark_nlp.factory.task import BIONERTask as Task
-from ark_nlp.factory.task import BIONERTask as CrfBertNERTask
+from ark_nlp.factory.task import CRFNERTask as Task
+from ark_nlp.factory.task import CRFNERTask as CrfBertNERTask
 
-from ark_nlp.factory.predictor import BIONERPredictor as Predictor
-from ark_nlp.factory.predictor import BIONERPredictor as CrfBertNERPredictor
\ No newline at end of file
+from ark_nlp.factory.predictor import CRFNERPredictor as Predictor
+from ark_nlp.factory.predictor import CRFNERPredictor as CrfBertNERPredictor
\ No newline at end of file
diff --git a/ark_nlp/processor/tokenizer/transfomer.py b/ark_nlp/processor/tokenizer/transfomer.py
index 75b3e8f..37aeabe 100644
--- a/ark_nlp/processor/tokenizer/transfomer.py
+++ b/ark_nlp/processor/tokenizer/transfomer.py
@@ -229,8 +229,13 @@ class TokenTokenizer(TransfomerTokenizer):
 
     def tokenize(self, text, **kwargs):
         tokens = []
-        text = ' '.join([token_ for token_ in text])
-        tokens = self.vocab.tokenize(text)
+        for token_ in text:
+            tokenized_token_ = self.vocab.tokenize(token_)
+            if tokenized_token_ == []:
+                tokens.extend([token_])
+            else:
+                tokens.extend(tokenized_token_)
+
         return tokens
 
     def sequence_to_ids(self, sequence, **kwargs):
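For reference, below is a minimal standalone sketch of the per-character tokenize fallback introduced in the TokenTokenizer hunk above. It assumes a Hugging Face BertTokenizer as the underlying vocab (ark_nlp's vocab wrapper may behave slightly differently); the function name char_level_tokenize and the bert-base-chinese checkpoint are illustrative choices, not part of the patch. The point of the change is that tokenizing character by character and keeping any character the vocab drops preserves the one-to-one alignment between input characters and output tokens that token-level NER labelling relies on, instead of letting vocab.tokenize silently discard such characters.

# Illustrative sketch only; the function name and checkpoint are assumptions,
# not ark_nlp code.
from transformers import BertTokenizer


def char_level_tokenize(text, vocab):
    """Tokenize character by character, keeping characters the vocab drops."""
    tokens = []
    for char in text:
        pieces = vocab.tokenize(char)
        if not pieces:
            # The vocab produced nothing for this character (e.g. it was
            # cleaned away as a control/format character): keep the raw
            # character so positions stay aligned with the input text.
            tokens.append(char)
        else:
            tokens.extend(pieces)
    return tokens


if __name__ == '__main__':
    vocab = BertTokenizer.from_pretrained('bert-base-chinese')
    # '\u200b' (zero-width space) is stripped by BertTokenizer, so the old
    # ' '.join(...) approach would drop it and shift every later character's
    # position; the fallback keeps the list length equal to len(text).
    print(char_level_tokenize('ark测试\u200b句', vocab))

Keeping the raw character here, rather than substituting '[UNK]' at tokenize time, means the token list length always equals the input length, which is exactly what the downstream label alignment in the token-level task needs.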