-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRankLips.hs
300 lines (252 loc) · 16.1 KB
/
RankLips.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE PartialTypeSignatures #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE TemplateHaskell #-}
module Main where
import Development.GitRev
import Control.Monad
import Data.Semigroup hiding (All, Any, option)
import Options.Applicative
import qualified Options.Applicative.Help.Pretty as Pretty
import System.FilePath
import Control.Concurrent (setNumCapabilities)
import qualified Data.Set as S
import qualified Data.Text as T
import Data.List
import Data.Maybe
import qualified Data.List.Split as Split
import System.Directory
import Data.Aeson as Aeson
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified SimplIR.Format.TrecRunFile as SimplirRun
import SimplIR.LearningToRank
import SimplIR.LearningToRankWrapper
import qualified SimplIR.FeatureSpace as F
import RankLipsTypes
import RankLipsCompat
import FeaturesAndSetup
import JsonRunQrels
import RankLipsFeatureUtils
-- import Debug.Trace as Debug
minibatchParser :: Parser MiniBatchParams
minibatchParser = MiniBatchParams
<$> option auto (long "mini-batch-steps" <> metavar "STEPS" <> value defSteps
<> help ("iterations per mini-batch, default: "<> show defSteps))
<*> option auto (long "mini-batch-size" <> metavar "SIZE" <> value defBatchSize
<> help ("number of mini-batch training queries, default: " <> show defBatchSize))
<*> option auto (long "mini-batch-eval" <> metavar "EVAL" <> value defEvalSteps
<> help ("number of mini-batches before next training evaluation, default: " <> show defEvalSteps))
where MiniBatchParams { miniBatchParamsBatchSteps = defSteps
, miniBatchParamsBatchSize = defBatchSize
, miniBatchParamsEvalSteps = defEvalSteps
} = defaultMiniBatchParams
defaultConvergenceParams :: ConvergenceDiagParams
defaultConvergenceParams = ConvergenceDiagParams 10e-2 1000 0 (EvalCutoffAt 100) 5 5
convergenceParamParser :: Parser ConvergenceDiagParams
convergenceParamParser =
ConvergenceDiagParams
<$> option auto (long "convergence-threshold" <> metavar "FACTOR" <> value defThresh
<> help ("being converged means that relative change in MAP between iterations is less than FACTOR, default: "<> show defThresh))
<*> option auto (long "convergence-max-iter" <> metavar "ITER" <> value defIter
<> help ("max number of iterations after which training is stopped (use to avoid loops), default: "<> show defIter))
<*> option auto (long "convergence-drop-initial-iterations" <> metavar "ITER" <> value defDrop
<> help ("number of initial iterations to disregard before convergence is monitored, default: "<> show defDrop))
<*> option (EvalCutoffAt <$> auto) (long "convergence-eval-cutoff" <> metavar "K" <> value (EvalCutoffAt defEvalCutoff)
<> help ("Training MAP will only be evaluated on top K (saves runtime), default: "<> show defEvalCutoff))
<*> option auto (short 'r' <> long "restarts" <> metavar "N" <> value defRestarts
<> help ("number of restarts per fold/model (model with best training performance will be chosen), default: "<> show defRestarts))
<*> option auto (long "folds" <> metavar "K" <> value defFolds
<> help ("number of folds (cross-validation only), default: "<> show defFolds))
where ConvergenceDiagParams { convergenceThreshold=defThresh
, convergenceMaxIter=defIter
, convergenceDropInitIter=defDrop
, convergenceEvalCutoff=(EvalCutoffAt defEvalCutoff)
, convergenceRestarts=defRestarts
, convergenceFolds=defFolds
} = defaultConvergenceParams
featureParamsParser :: Parser FeatureParams
featureParamsParser = FeatureParams
<$> option str (long "feature-runs-directory" <> short 'd' <> help "directory containing run files for features" <> metavar "DIR")
<*> many (option str (long "feature" <> short 'f' <> help "feature name, needs to match filename in feature-runs-directory" <> metavar "FEATURE") )
<*> (some (option (parseFeatureVariant <$> str) (long "feature-variant" <> metavar "FVAR"
<> help ("Enable feature variant (default all), choices: " ++(show [minBound @FeatureVariant .. maxBound]) )))
<|> pure [minBound @FeatureVariant .. maxBound]
)
<*> flag True False (long "jsonl-run" <> help "Load data from jsonl file instead of trec_eval run file")
defaultFeatureParamsParser :: Parser DefaultFeatureParams
defaultFeatureParamsParser =
( DefaultFeatureSingleValue
<$> option auto (long "default-any-feature-value" <> metavar "VALUE" <> value 0.0
<> help "When any feature is missing for a query/doc pair, this value will be used as feature value (default: 0.0)." )
) <|>
( DefaultFeatureVariantValue
<$> many ( option ( parseFeatureVariantPair =<< str ) (long "default-feature-variant-value" <> metavar "FVariant=VALUE"
<> help "default values for each feature variant in FVariant=VALUE format without spaces, example: --default-feature-variant-value FeatureScore=-9999.999"))
) <|>
( DefaultFeatureValue
<$> many ( option ( parseFeaturePair =<< str ) (long "default-feature-value" <> metavar "FNAME-FVariant=VALUE"
<> help "default values for each feature in FNAME-FVariant=VALUE format without spaces, example: --default-feature-value FeatureA-FeatureScore=-9999.999"))
)
where parseFeatureVariantPair :: String -> ReadM (FeatureVariant, Double)
parseFeatureVariantPair str =
case Split.splitOn "=" str of
[fv, val] -> return $ (parseFeatureVariant fv, read val)
_ -> fail $ "Ill-formed FVariant=VALUE format (expecting exactly one '='), got: "<> str
parseFeaturePair :: String -> ReadM (FeatName, Double)
parseFeaturePair str =
case Split.splitOn "=" str of
[fname, val] -> do
fname' <- parseFeatName (T.pack fname)
return (fname', read val)
_ -> fail $ "Ill-formed FNAME-FVariant=VALUE format (expecting exactly one '='), got: "<> str
gitMsg :: String
gitMsg = concat [ "[git ", $(gitBranch), "@", $(gitHash)
, " (", $(gitCommitDate), ")"
, "] " ]
getRankLipsVersion :: String
getRankLipsVersion = "Rank-lips version 1.2"
data ModelVersion = ModelVersionV10 | ModelVersionV11
deriving (Eq, Read, Show)
opts :: Parser (IO ())
opts = subparser
$ cmd "train" doTrain'
<> cmd "predict" doPredict'
<> cmd "convert-old-model" doConvertModel'
<> cmd "version" doPrintVersion
<> cmd "convert-features" doConvertFeatures'
where
cmd name action = command name (info (helper <*> action) fullDesc)
doPrintVersion =
f <$> optional (option str (short 'v'))
where
f :: Maybe String -> IO()
f _v = putStrLn $ unlines [ getRankLipsVersion
, "https://github.com/TREMA-UNH/rank-lips"
-- , gitMsg
]
doTrain' =
f <$> featureParamsParser
<*> option str (long "output-directory" <> short 'O' <> help "directory to write output to. (directories will be created)" <> metavar "OUTDIR")
<*> option str (long "output-prefix" <> short 'o' <> value "rank-lips" <> help "filename prefix for all written output; Default \"rank-lips\"" <> metavar "FILENAME")
<*> option str (long "qrels" <> short 'q' <> help "qrels file used for training" <> metavar "QRELS" )
<*> option str (long "experiment" <> short 'e' <> help "experiment name (will be archived in the model file)" <> metavar "FRIENDLY_NAME" )
<*> (minibatchParser <|> pure defaultMiniBatchParams)
<*> (flag False True ( long "train-cv" <> help "Also train with 5-fold cross validation"))
<*> (flag False True ( long "z-score" <> help "Z-score normalize features"))
<*> (flag False True ( long "save-heldout-queries-in-model" <> help "Save heldout query ids in model file (cross-validation only)"))
<*> convergenceParamParser
<*> defaultFeatureParamsParser
<*> option auto (short 'j' <> long "threads" <> help "enable multi-threading with J threads" <> metavar "J" <> value 1)
where
f :: FeatureParams -> FilePath -> FilePath -> FilePath -> String -> MiniBatchParams -> Bool -> Bool -> Bool -> ConvergenceDiagParams-> DefaultFeatureParams -> Int -> IO()
f fparams@FeatureParams{..} outputDir outputPrefix qrelFile experimentName miniBatchParams includeCv useZscore saveHeldoutQueriesInModel convergenceParams defaultFeatureParams numThreads = do
setNumCapabilities numThreads
dirFeatureFiles <- listDirectory featureRunsDirectory
createDirectoryIfMissing True outputDir
let features' = case features of
[] -> dirFeatureFiles
fs -> fs
outputFilePrefix = outputDir </> outputPrefix
doTrain id id (fparams{features=features'}) outputFilePrefix experimentName qrelFile miniBatchParams includeCv useZscore saveHeldoutQueriesInModel convergenceParams (Just defaultFeatureParams) getRankLipsVersion
doPredict' =
f <$> featureParamsParser
<*> option str (long "output-directory" <> short 'O' <> help "directory to write output to. (directories will be created)" <> metavar "OUTDIR")
<*> option str (long "output-prefix" <> short 'o' <> value "rank-lips" <> help "filename prefix for all written output; Default \"rank-lips\"" <> metavar "FILENAME")
<*> optional (option str (long "qrels" <> short 'q' <> help "qrels file, if provided, test MAP scores will be reported" <> metavar "QRELS" ))
<*> option str (long "model" <> short 'm' <> help "file where model parameters will be read from " <> metavar "FILE" )
<*> flag ModelVersionV11 ModelVersionV10 (long "is-v10-model" <> help "for loading V1.0 rank-lips models")
where
f :: FeatureParams -> FilePath -> FilePath -> Maybe FilePath -> FilePath -> ModelVersion -> IO()
f fparams@FeatureParams{..} outputDir outputPrefix qrelFileOpt modelFile modelVersion = do
let backwardsCompatibleModelLoader =
case modelVersion of
ModelVersionV11 -> loadRankLipsModel modelFile
ModelVersionV10 -> loadRankLipsV10Model modelFile
(SomeRankLipsModel (lipsModel :: RankLipsModel f ph)) <- deserializeRankLipsModel <$> backwardsCompatibleModelLoader
let model :: Model Feat ph
model = trainedModel lipsModel
fspace = modelFeatures model
modelFeatureFiles = F.featureNames fspace
dirFiles <- listDirectory featureRunsDirectory
let dirFeatureSet = convertFeatureNames [minBound @FeatureVariant .. maxBound @FeatureVariant] dirFiles
modelFeatureSet :: S.Set Feat
modelFeatureSet = (S.fromList modelFeatureFiles)
missingFeatures = modelFeatureSet `S.difference` dirFeatureSet
when (not $ S.null $ missingFeatures)
$ fail $ "Missing files for features (which are defined in model file): "
++ show missingFeatures
createDirectoryIfMissing True outputDir
let revertedModelFeatureFiles = nub $ mapMaybe extractFeatFiles modelFeatureFiles
outputFilePrefix = outputDir </> outputPrefix
doPredict id id (fparams{features = revertedModelFeatureFiles }) outputFilePrefix (defaultFeatureParams lipsModel) model qrelFileOpt
where
extractFeatFiles :: Feat -> Maybe FilePath
extractFeatFiles (Feat FeatNameInputRun{..}) = Just fRunFile
extractFeatFiles _ = Nothing
doConvertModel' =
convertOldModel
<$> argument str (metavar "FILE" <> help "old model file")
<*> option str (long "output" <> short 'o' <> metavar "OUT" <> help "file where new model will be written to")
doConvertFeatures' =
doConvertFeatures
<$> option str (short 'd' <> long "input-dir" <> metavar "IN-DIR" <> help "directory of old feature files")
<*> many (argument str ( metavar "FILE" <> help "filenames of feature files"))
<*> option str (long "output" <> short 'o' <> metavar "OUT-DIR" <> help "directory where new features will be written to")
<*> (flag' OldRunToJsonLMode (long "old-run-features-to-jsonl")
<|> flag' JsonLRunToTrecEval (long "jsonl-run-to-trec-eval")
)
data ConvertFeatureMode = OldRunToJsonLMode | JsonLRunToTrecEval
deriving (Eq, Show)
convertOldModel :: FilePath -> FilePath -> IO()
convertOldModel oldModelFile newModelFile = do
(SomeRankLipsModel (convertedLipsModel :: RankLipsModel f ph)) <- deserializeRankLipsModel <$> loadRankLipsV10Model oldModelFile
let serializedRankLipsModel = serializeRankLipsModel
$ convertedLipsModel {rankLipsVersion = Just $ getRankLipsVersion
<> ". (Converted from V1.0 model \'"
<> oldModelFile
<> "\')"
}
BSL.writeFile newModelFile $ Aeson.encode $ serializedRankLipsModel
doConvertFeatures :: FilePath -> [FilePath] -> FilePath -> ConvertFeatureMode-> IO()
doConvertFeatures oldDir filenames newDir OldRunToJsonLMode = do
runs <- loadRunFiles id id oldDir filenames
forM_ runs (\(fname, content) ->
writeJsonLRunFile (newDir</>fname) content
)
doConvertFeatures oldDir filenames newDir JsonLRunToTrecEval = do
runs <- loadJsonLRunFiles oldDir filenames
forM_ runs (\(fname, content) ->
SimplirRun.writeRunFile (newDir</>fname) content
)
main :: IO ()
main = join $ execParser $ info (helper <*> opts) (progDescDoc (Just desc) <> fullDesc)
where
desc = Pretty.vcat $ Pretty.punctuate Pretty.linebreak
[ para [ "Rank-lips is a high-performance multi-threaded list-wise learning-to-rank implementation that supports mini-batched learning." ]
, para [ "Rank-lips is designed to work with trec_eval file formats for defining runs (run format) and relevance data (qrel format)."
, "The features will be taken from the score and/or reciprocal rank of each input file. The filename of an input run"
, "(in the directory) will be used as a feature name. If you want to train a model and predict on a different test set"
, "make sure that the input runs for test features are using exactly the sane filename. We recommend to create"
, "different directories for training and test sets." ]
, para [ "For more information on invidiual commands call:" ]
, Pretty.indent 4 "rank-lips COMMAND -h"
, para [ " Also see website: http://www.cs.unh.edu/~dietz/rank-lips/" ]
]
para = Pretty.fillSep . map Pretty.text . foldMap words