Final dh #5

Open · wants to merge 65 commits into master

Changes from 1 commit · 65 commits total
f610ed7
Create README.md
sangki930 May 25, 2021
97f4701
add feature
tofulim May 27, 2021
d2521e1
models commit
sangki930 May 28, 2021
b0e15a1
model recommit
sangki930 May 28, 2021
4913de4
[dh] new feature commit
tofulim May 28, 2021
1357646
[dh] k-fold commit
tofulim Jun 2, 2021
4ab9a96
[dh] make time2global feature
tofulim Jun 3, 2021
312c880
commit test
sangki930 Jun 3, 2021
1160816
[sangki930] branch commit
sangki930 Jun 3, 2021
af9892a
[dh] commit to merge
tofulim Jun 4, 2021
e527d8c
Update README.md
sangki930 Jun 4, 2021
23bf831
Merge pull request #2 from bcaitech1/new_branch_01
sangki930 Jun 4, 2021
5a4307a
pplz
tofulim Jun 4, 2021
4831d39
[dh] test
tofulim Jun 4, 2021
0d933bd
[dh] oh yeah
tofulim Jun 4, 2021
7a99180
add master
PrimeOfMine Jun 4, 2021
0f34d9b
dh explane
tofulim Jun 6, 2021
b110420
Merge branch 'comb_main' into sangki930
tofulim Jun 6, 2021
cf7a7e5
Merge pull request #3 from bcaitech1/sangki930
tofulim Jun 6, 2021
85449dd
test_sangki
sangki930 Jun 6, 2021
8cbe13d
sangki commit
sangki930 Jun 6, 2021
401ef43
cm
tofulim Jun 6, 2021
c55eb22
Merge branch 'comb_main' of https://github.com/bcaitech1/p4-dkt-olleh…
tofulim Jun 6, 2021
4f73e48
[dh] cont fix..
tofulim Jun 7, 2021
c813636
cm
tofulim Jun 7, 2021
02ab294
[dh] continuous fix
tofulim Jun 7, 2021
eff3dea
cm
tofulim Jun 7, 2021
088158b
[dh] submit fix, lstmattn fix
tofulim Jun 7, 2021
db91d72
[dh] change setting and split model.py to each architecture
tofulim Jun 9, 2021
03b01a9
[dh] change setting
tofulim Jun 10, 2021
b091795
[dh] make new branch to merge feat
tofulim Jun 10, 2021
324463c
cm
tofulim Jun 10, 2021
975b8f6
[dh] add presicion,recall,f1 metric
tofulim Jun 11, 2021
e68165a
[dh] cont/cate mid check
tofulim Jun 11, 2021
0f8e947
[dh] mid check
tofulim Jun 11, 2021
19428b2
[dh] push for compare
tofulim Jun 12, 2021
40fb49c
[dh] apply on model
tofulim Jun 12, 2021
047c851
fixed untracked files
tofulim Jun 13, 2021
aaab502
[dh] model final fix
tofulim Jun 13, 2021
105a1d3
[dh] final model fix
tofulim Jun 13, 2021
edee0f0
[dh] lgbm change
tofulim Jun 14, 2021
c8a7246
[dh] lgbm change
tofulim Jun 14, 2021
f3d3de8
[dh] cm
tofulim Jun 14, 2021
e701e79
[dh] cm
tofulim Jun 14, 2021
9a1fe4b
edit for k-fold
PrimeOfMine Jun 14, 2021
731b63e
add comments
PrimeOfMine Jun 14, 2021
8c0194c
debugging
PrimeOfMine Jun 14, 2021
c5f44df
[dh] fix & pull
tofulim Jun 14, 2021
0be294a
Merge branch 'final_dh' of https://github.com/bcaitech1/p4-dkt-ollehd…
tofulim Jun 14, 2021
f610a72
[dh] use test file
tofulim Jun 15, 2021
8f39e38
[dh] final push
tofulim Jun 15, 2021
0488c36
[dh] push
tofulim Jun 15, 2021
d6c3d6e
[dh] ffffinal commit
tofulim Jun 15, 2021
552e5ce
Update README.md
tofulim Jun 20, 2021
5eccb79
Update README.md
tofulim Jun 20, 2021
cba42b1
Update README.md
tofulim Jul 20, 2021
12a85d2
Update README.md
tofulim Jul 20, 2021
0a26104
Update README.md
tofulim Jul 24, 2021
0b1e351
Update README.md
tofulim Jul 24, 2021
6bd2e44
Update README.md
tofulim Jul 24, 2021
1512d6d
Update README.md
tofulim Jul 25, 2021
c98b8ae
Update README.md
tofulim Jul 25, 2021
ca4a390
Create README.md
tofulim Jul 25, 2021
e9c5699
Update README.md
tofulim Jul 26, 2021
7b94b5f
Update README.md
tofulim Jul 27, 2021
[dh] apply on model
tofulim committed Jun 12, 2021
commit 40fb49c0d711d3a1b21ed26fa1abad14a9553e95
12 changes: 6 additions & 6 deletions conf.yml
```diff
@@ -1,11 +1,11 @@
-model : lstm # {lstm, lstmattn, bert, lgbm, lstmroberta, lastquery, saint, lstmalbertattn}
+model : saint # {lstm, lstmattn, bert, lgbm, lstmroberta, lastquery, saint, lstmalbertattn}
 
 # (non-generalized): use only the base features
 #   - lstm, lstmattn, bert
 # (generalized): also use the added columns as categorical features
 
 wandb :
-  using: False
+  using: True
   project: DKT
 
 ## enter your own wandb ID
@@ -14,7 +14,7 @@ wandb :
   - baseline
 
 ##main params
-task_name: lstm_time_test_nokfold
+task_name: saint_test_kfold
 seed: 42
 device: cuda
 
@@ -30,10 +30,10 @@ max_seq_len: 128
 num_workers: 1
 
 ##K-fold params
-use_kfold : False # run k-fold using n folds
-use_stratify : False
+use_kfold : True # run k-fold using n folds
+use_stratify : True
 n_fold : 5
-split_by_user : False # split the k-fold dataset by user
+split_by_user : True # split the k-fold dataset by user
 
 ## model
 hidden_dim : 256
```
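The flipped flags turn on stratified k-fold with user-level splits. As a rough sketch of what `use_kfold`, `use_stratify`, and `split_by_user` imply in combination, the split below assigns each user to exactly one fold and stratifies folds on a binned per-user answer rate; the function name and the stratification target are assumptions for illustration, not the repo's actual code.

```python
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

def user_kfold(df: pd.DataFrame, n_fold: int = 5, use_stratify: bool = True, seed: int = 42):
    """Yield (train_users, valid_users) splits, one fold membership per user."""
    # per-user answer rate, used only as a stratification signal (assumed target)
    user_rate = df.groupby("userID")["answerCode"].mean()
    user_ids = user_rate.index.values
    if use_stratify:
        bins = pd.cut(user_rate.values, bins=10, labels=False)  # bucket users by difficulty
        splitter = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
        splits = splitter.split(user_ids, bins)
    else:
        splitter = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
        splits = splitter.split(user_ids)
    for train_idx, valid_idx in splits:
        yield user_ids[train_idx], user_ids[valid_idx]
```

Splitting by user rather than by row keeps all of a student's interactions in one fold, which avoids leaking a user's history between train and validation.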
11 changes: 7 additions & 4 deletions dkt/dataloader.py
```diff
@@ -75,7 +75,7 @@ def __preprocessing(self, df, is_train = True):
         if not os.path.exists(self.args.asset_dir):
             os.makedirs(self.args.asset_dir)
 
-        for col in cate_cols:
+        for col in cate_cols[1:]:
             le = LabelEncoder()
             if is_train:
                 #For UNKNOWN class
@@ -94,7 +94,6 @@ def __preprocessing(self, df, is_train = True):
 
         # store each cate feat's name / number of unique values in conf as a dict
         self.args.cate_feat_dict=dict(zip(cate_cols,[len(df[col].unique()) for col in cate_cols]))
-        print("user count after preprocessing",len(df['userID'].unique()))
         return df
 
     def __feature_engineering(self, df):
@@ -145,6 +144,10 @@ def load_data_from_file(self, file_name, is_train=True):
         df = self.__feature_engineering(df)
         df = self.__preprocessing(df, is_train)
 
+        df['userID']=df['userID'].astype(int)
+        df['KnowledgeTag']=df['KnowledgeTag'].astype(int)
+
+
         # columns: cate feats come first, then cont_feats; the first cate feat is userID, the first cont feat is answerCode
         columns=self.args['cate_feats']+self.args['cont_feats']
         print(columns)
@@ -160,8 +163,8 @@ def load_data_from_file(self, file_name, is_train=True):
             lambda r: tuple([r[i].values for i in ret])
         )
         print(group)
-        # print(f"users {len(group)} features {len(group[0])} problems solved {len(group[0][0])}")
-        # len(f'group.values->{len(group.values)}')
+        print(f"users {len(group)} features {len(group.iloc[0])} problems solved {len(group.iloc[0][0])}")
+        len(f'group.values->{len(group.values)}')
         print("user count after load data",len(df['userID'].unique()))
         return group.values
 
```

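The `cate_cols[1:]` change keeps userID as a raw identifier while the remaining categorical columns are label-encoded, and the `#For UNKNOWN class` fragment suggests a reserved fallback label for values unseen at training time. A minimal sketch of that pattern, with an assumed `encoders` dict and function name since the PR shows only fragments:

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_categoricals(df: pd.DataFrame, cate_cols: list, is_train: bool, encoders: dict) -> pd.DataFrame:
    """Label-encode every categorical column except the first one (userID)."""
    for col in cate_cols[1:]:                    # skip userID, which is kept as-is
        vals = df[col].astype(str)
        if is_train:
            le = LabelEncoder()
            le.fit(vals.tolist() + ["unknown"])  # reserve an UNKNOWN class
            encoders[col] = le
        else:
            le = encoders[col]
            # unseen test-time values fall back to the UNKNOWN class
            vals = vals.where(vals.isin(le.classes_), "unknown")
        df[col] = le.transform(vals)
    return df
```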
100 changes: 45 additions & 55 deletions dkt/models_architecture/lastquery_post.py
```diff
@@ -49,29 +49,29 @@ def __init__(self, args):
         self.hidden_dim = self.args.hidden_dim
 
         # Embedding
-        # interaction is currently made up of correct: correct(1, 2) + padding(0)
-        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
-        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
-        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
-        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)
-        self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim)
-
-        self.n_other_features = self.args.n_other_features
-        print(self.n_other_features)
-
-        # encoder combination projection
-        self.comb_proj = nn.Linear((self.hidden_dim//3)*(4+len(self.n_other_features)), self.hidden_dim)
-
-        # # other feature
-        self.f_cnt = len(self.n_other_features) # number of features
-        self.embedding_other_features = [nn.Embedding(self.n_other_features[i]+1, self.hidden_dim//3) for i in range(self.f_cnt)]
+        # minus one for userID
+        cate_len=len(args.cate_feats)-1
+        # minus one for answerCode
+        cont_len=len(args.cont_feats)-1
+
+        # cate Embedding
+        self.cate_embedding_list = nn.ModuleList([nn.Embedding(max_val+1, (self.hidden_dim//2)//cate_len) for max_val in list(args.cate_feat_dict.values())[1:]])
+        # interaction is currently made up of correct: correct(1, 2) + padding(0)
+        self.embedding_interaction = nn.Embedding(3, (self.hidden_dim//2)//cate_len)
+
+        # cont Embedding
+        self.cont_embedding = nn.Linear(1, (self.hidden_dim//2)//cont_len)
+
+        # keetar's original solution does not use a positional embedding,
+        # but feel free to decide whether to use one :)
+        self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim)
+
+        # comb linear
+        self.cate_comb_proj = nn.Linear(((self.hidden_dim//2)//cate_len)*(cate_len+1), self.hidden_dim//2) # +1 because interaction is concatenated as well
+        self.cont_comb_proj = nn.Linear(((self.hidden_dim//2)//cont_len)*cont_len, self.hidden_dim//2)
 
         # Encoder
         self.query = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim)
         self.key = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim)
@@ -140,54 +140,44 @@ def init_hidden(self, batch_size):
 
     def forward(self, input):
 
-        # test, question, tag, _, mask, interaction, index = input
-
-        # for i,e in enumerate(input):
-        #     print(f'{i}-th : {e[i].shape}')
-        test = input[0]
-        question = input[1]
-        tag = input[2]
-
-        mask = input[4]
-        interaction = input[5]
-
-        other_features = [input[i] for i in range(6,len(input)-1)]
-
-        index = input[len(input)-1]
+        # -1 because userID was dropped
+        cate_feats=input[:len(self.args.cate_feats)-1]
+        # print("number of cate_feats",len(cate_feats))
+
+        # -1 because there is no answerCode
+        cont_feats=input[len(self.args.cate_feats)-1:-4]
+        # print("number of cont_feats",len(cont_feats))
+        interaction=input[-4]
+        mask=input[-3]
+        gather_index=input[-2]
 
         batch_size = interaction.size(0)
         seq_len = interaction.size(1)
 
-        # fun embedding time
+        # cate Embedding
+        cate_feats_embed=[]
         embed_interaction = self.embedding_interaction(interaction)
+        cate_feats_embed.append(embed_interaction)
 
-        embed_test = self.embedding_test(test)
-        embed_question = self.embedding_question(question)
-        embed_tag = self.embedding_tag(tag)
-
-        # dev
-        embed_other_features =[]
-
-        for i,e in enumerate(self.embedding_other_features):
-            # print(f'{i}-th : {e}')
-            # print(f'max (before) : {torch.max(other_features[i])}')
-            # print(f'min (before) : {torch.min(other_features[i])}')
-            embed_other_features.append(e(other_features[i]))
-            # print(f'max (after) : {torch.max(other_features[i])}')
-            # print(f'min (after) : {torch.min(other_features[i])}')
-
-        cat_list = [embed_interaction,
-                    embed_test,
-                    embed_question,
-                    embed_tag,
-                    ]
-        cat_list.extend(embed_other_features)
+        for i, cate_feat in enumerate(cate_feats):
+            cate_feats_embed.append(self.cate_embedding_list[i](cate_feat))
+
+        # unsqueeze cont feats shape & embedding
+        cont_feats_embed=[]
+        for cont_feat in cont_feats:
+            cont_feat=cont_feat.unsqueeze(-1)
+            cont_feats_embed.append(self.cont_embedding(cont_feat))
+
+        # concat cate, cont feats
+        embed_cate = torch.cat(cate_feats_embed, 2)
+        embed_cate=self.cate_comb_proj(embed_cate)
+
+        embed_cont = torch.cat(cont_feats_embed, 2)
+        embed_cont=self.cont_comb_proj(embed_cont)
 
-        embed = torch.cat(cat_list, 2)
-
-        embed = self.comb_proj(embed)
+        embed = torch.cat([embed_cate,embed_cont], 2)
 
         # Positional Embedding
         # last query does not use positional embedding
@@ -199,15 +189,15 @@ def forward(self, input):
         q = self.query(embed)
 
         # this 3D gathering is headache-inducing; take a moment to clear your head and come back
-        q = torch.gather(q, 1, index.repeat(1, self.hidden_dim).unsqueeze(1))
+        q = torch.gather(q, 1, gather_index.repeat(1, self.hidden_dim).unsqueeze(1))
         q = q.permute(1, 0, 2)
 
         k = self.key(embed).permute(1, 0, 2)
         v = self.value(embed).permute(1, 0, 2)
 
         ## attention
         # last query only
-        self.mask = self.get_mask(seq_len, index, batch_size).to(self.device)
+        self.mask = self.get_mask(seq_len, gather_index, batch_size).to(self.device)
         out, _ = self.attn(q, k, v, attn_mask=self.mask)
 
         ## residual + layer norm
```
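The rename from `index` to `gather_index` touches the model's defining step: of the `(batch, seq_len, hidden)` queries, only each sequence's last valid position is kept as the attention query. A toy, self-contained check of that `torch.gather` call; the shapes mirror the hunk above, the index values are made up:

```python
import torch

B, S, H = 2, 5, 8                        # batch, seq_len, hidden_dim
q = torch.randn(B, S, H)                 # per-position queries
gather_index = torch.tensor([[4], [2]])  # last valid position per sequence, shape (B, 1)

# repeat the index across the hidden dim and add a length-1 sequence axis -> (B, 1, H)
idx = gather_index.repeat(1, H).unsqueeze(1)
last_q = torch.gather(q, 1, idx)         # (B, 1, H): one query per sequence

assert torch.equal(last_q[0, 0], q[0, 4])
assert torch.equal(last_q[1, 0], q[1, 2])
```

Attending with a single last-position query is what keeps the last-query architecture cheap: the attention cost drops from O(S²) to O(S) per sequence.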
93 changes: 48 additions & 45 deletions dkt/models_architecture/lastquery_pre.py
```diff
@@ -50,29 +50,29 @@ def __init__(self, args):
         self.device = args.device
 
         self.hidden_dim = self.args.hidden_dim
-        self.cont_cols=self.args.cont_cols
+        # minus one for userID
+        cate_len=len(args.cate_feats)-1
+        # minus one for answerCode
+        cont_len=len(args.cont_feats)-1
 
         # Embedding
-        # interaction is currently made up of correct: correct(1, 2) + padding(0)
-        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)
-        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
-        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
-        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)
+        # cate Embedding
+        self.cate_embedding_list = nn.ModuleList([nn.Embedding(max_val+1, (self.hidden_dim//2)//cate_len) for max_val in list(args.cate_feat_dict.values())[1:]])
+        # interaction is currently made up of correct: correct(1, 2) + padding(0)
+        self.embedding_interaction = nn.Embedding(3, (self.hidden_dim//2)//cate_len)
+
+        # cont Embedding
+        self.cont_embedding = nn.Linear(1, (self.hidden_dim//2)//cont_len)
 
         # keetar's original solution does not use a positional embedding,
         # but feel free to decide whether to use one :)
         # self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim)
-        # self.n_other_features = self.args.n_other_features
-        # print(self.n_other_features)
-        self.cont_proj=nn.Linear(self.cont_cols,self.hidden_dim//2)
-
-        # encoder combination projection
-        self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim//2)
-
-        # # other feature
-        # self.f_cnt = len(self.n_other_features) # number of features
-        # self.embedding_other_features = [nn.Embedding(self.n_other_features[i]+1, self.hidden_dim//3) for i in range(self.f_cnt)]
+        # comb linear
+        self.cate_comb_proj = nn.Linear(((self.hidden_dim//2)//cate_len)*(cate_len+1), self.hidden_dim//2) # +1 because interaction is concatenated as well
+        self.cont_comb_proj = nn.Linear(((self.hidden_dim//2)//cont_len)*cont_len, self.hidden_dim//2)
 
         # Encoder
         self.query = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim)
@@ -120,41 +120,44 @@ def init_hidden(self, batch_size):
 
 
     def forward(self, input):
-        test, question, tag, correct, mask, interaction, solve_time, gather_index=input
+        # -1 because userID was dropped
+        cate_feats=input[:len(self.args.cate_feats)-1]
+        # print("number of cate_feats",len(cate_feats))
+
+        # -1 because there is no answerCode
+        cont_feats=input[len(self.args.cate_feats)-1:-4]
+        # print("number of cont_feats",len(cont_feats))
+        interaction=input[-4]
+        mask=input[-3]
+        gather_index=input[-2]
 
         batch_size = interaction.size(0)
         seq_len = interaction.size(1)
 
-        solve_time=solve_time.unsqueeze(-1) # shape(B,MSL) -> shape(B, MSL, 1)
-        # fun embedding time
+        # cate Embedding
+        cate_feats_embed=[]
         embed_interaction = self.embedding_interaction(interaction)
-        embed_test = self.embedding_test(test)
-        embed_question = self.embedding_question(question)
-        embed_tag = self.embedding_tag(tag)
+        cate_feats_embed.append(embed_interaction)
 
-        embed_cont=self.cont_proj(solve_time)
-        # dev
-
-        # for i,e in enumerate(self.embedding_other_features):
-        #     # print(f'{i}-th : {e}')
-        #     # print(f'max (before) : {torch.max(other_features[i])}')
-        #     # print(f'min (before) : {torch.min(other_features[i])}')
-        #     embed_other_features.append(e(other_features[i]))
-        #     # print(f'max (after) : {torch.max(other_features[i])}')
-        #     # print(f'min (after) : {torch.min(other_features[i])}')
-
-        cat_list = [embed_interaction,
-                    embed_test,
-                    embed_question,
-                    embed_tag,
-                    ]
-        # cat_list.extend(embed_other_features)
-
-        embed = torch.cat(cat_list, 2)
-
-        embed = self.comb_proj(embed)
-        embed=torch.cat([embed, embed_cont], 2) # (batch, msl, 128)
+        for i, cate_feat in enumerate(cate_feats):
+            cate_feats_embed.append(self.cate_embedding_list[i](cate_feat))
+
+        # unsqueeze cont feats shape & embedding
+        cont_feats_embed=[]
+        for cont_feat in cont_feats:
+            cont_feat=cont_feat.unsqueeze(-1)
+            cont_feats_embed.append(self.cont_embedding(cont_feat))
+
+        # concat cate, cont feats
+        embed_cate = torch.cat(cate_feats_embed, 2)
+        embed_cate=self.cate_comb_proj(embed_cate)
+
+        embed_cont = torch.cat(cont_feats_embed, 2)
+        embed_cont=self.cont_comb_proj(embed_cont)
+
+        embed = torch.cat([embed_cate,embed_cont], 2)
 
         # Positional Embedding
         # last query does not use positional embedding
```
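Both last-query variants now share the same generalized embedding path: each categorical feature (plus interaction) gets an equal slice of `hidden_dim//2`, each continuous feature is lifted from a scalar by a shared `nn.Linear(1, ...)`, and the two projections meet at `hidden_dim`. A standalone sketch of that path, with invented feature counts and cardinalities:

```python
import torch
import torch.nn as nn

hidden_dim, cate_len, cont_len = 64, 3, 2
cardinalities = [10, 20, 30]                       # unique values per categorical feature (invented)
cate_dim = (hidden_dim // 2) // cate_len           # dims per categorical feature
cont_dim = (hidden_dim // 2) // cont_len           # dims per continuous feature

cate_embedding_list = nn.ModuleList([nn.Embedding(c + 1, cate_dim) for c in cardinalities])
embedding_interaction = nn.Embedding(3, cate_dim)  # correct(1, 2) + padding(0)
cont_embedding = nn.Linear(1, cont_dim)            # shared across continuous features
cate_comb_proj = nn.Linear(cate_dim * (cate_len + 1), hidden_dim // 2)  # +1 for interaction
cont_comb_proj = nn.Linear(cont_dim * cont_len, hidden_dim // 2)

B, S = 4, 7                                        # batch size, max sequence length
cate_feats = [torch.randint(0, c, (B, S)) for c in cardinalities]
cont_feats = [torch.rand(B, S) for _ in range(cont_len)]
interaction = torch.randint(0, 3, (B, S))

cate_parts = [embedding_interaction(interaction)] + [
    emb(feat) for emb, feat in zip(cate_embedding_list, cate_feats)
]
cont_parts = [cont_embedding(feat.unsqueeze(-1)) for feat in cont_feats]

embed_cate = cate_comb_proj(torch.cat(cate_parts, 2))  # (B, S, hidden_dim//2)
embed_cont = cont_comb_proj(torch.cat(cont_parts, 2))  # (B, S, hidden_dim//2)
embed = torch.cat([embed_cate, embed_cont], 2)         # (B, S, hidden_dim)
assert embed.shape == (B, S, hidden_dim)
```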
2 changes: 1 addition & 1 deletion dkt/models_architecture/lstm.py
```diff
@@ -22,7 +22,7 @@ def __init__(self, args):
 
         self.hidden_dim = self.args.hidden_dim
         self.n_layers = self.args.n_layers
-        self.cont_cols=1
+
         # minus one for userID
         cate_len=len(args.cate_feats)-1
         # minus one for answerCode
```
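The same `cate_len`/`cont_len` bookkeeping now appears in the LSTM model as well. For concreteness, the slice arithmetic with `hidden_dim: 256` from conf.yml and a hypothetical 4 categorical plus 3 continuous features:

```python
hidden_dim = 256                          # from conf.yml above
cate_len, cont_len = 4, 3                 # hypothetical counts after dropping userID / answerCode

cate_dim = (hidden_dim // 2) // cate_len  # 32 dims per categorical feature
cont_dim = (hidden_dim // 2) // cont_len  # 42 dims per continuous feature

cate_in = cate_dim * (cate_len + 1)       # 160 (+1 slice for the interaction embedding)
cont_in = cont_dim * cont_len             # 126
# cate_comb_proj: 160 -> 128, cont_comb_proj: 126 -> 128, concatenated -> 256
assert cate_in == 160 and cont_in == 126
```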