From 7cc7d57d9e1d0c183c92d7cd7e42c950ed884da4 Mon Sep 17 00:00:00 2001
From: andrewsommerlot
Date: Fri, 16 Jun 2017 14:49:16 -0400
Subject: [PATCH] fix function name

---
 .Rproj.user/8F0F1B72/pcs/source-pane.pper     |  2 +-
 .../8F0F1B72/pcs/windowlayoutstate.pper       | 12 +++++------
 .Rproj.user/8F0F1B72/pcs/workbench-pane.pper  |  2 +-
 .Rproj.user/8F0F1B72/persistent-state         |  4 ++--
 .Rproj.user/8F0F1B72/sdb/prop/INDEX           |  3 +++
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/201230D9  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/2DF3C051  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/3118EB1D  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/31B8BC80  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/37AD3D49  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44C1FDFA  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44DC5155  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/48451B7A  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/5866E996  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/6A5373C7  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/80BD2784  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/843B53AD  | 21 -------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/8A952F6E  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/910ADB2B  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/98EBD2C8  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9C72EE29  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9DC15287  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A2728057  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A37A2442  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AA01AE68  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AF6B5159  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/B5A6170B  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/C2483659  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/D29C1B6D  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EF25A0ED  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EFB1B61A  | 20 ------------------
 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/lock_file |  0
 .gitignore                                    |  4 ++++
 DESCRIPTION                                   |  1 +
 NAMESPACE                                     |  2 +-
 R/test-metric.R                               |  4 ++--
 man/startml.Rd                                | 11 ----------
 man/{test_metric_h2o.Rd => test_metric.Rd}    |  8 +++----
 38 files changed, 25 insertions(+), 549 deletions(-)
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/201230D9
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/2DF3C051
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/3118EB1D
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/31B8BC80
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/37AD3D49
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44C1FDFA
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44DC5155
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/48451B7A
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/5866E996
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/6A5373C7
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/80BD2784
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/843B53AD
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/8A952F6E
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/910ADB2B
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/98EBD2C8
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9C72EE29
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9DC15287
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A2728057
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A37A2442
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AA01AE68
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AF6B5159
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/B5A6170B
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/C2483659
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/D29C1B6D
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EF25A0ED
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EFB1B61A
 delete mode 100644 .Rproj.user/8F0F1B72/sdb/s-2D52ACD6/lock_file
 create mode 100644 .gitignore
 rename man/{test_metric_h2o.Rd => test_metric.Rd} (85%)

diff --git a/.Rproj.user/8F0F1B72/pcs/source-pane.pper b/.Rproj.user/8F0F1B72/pcs/source-pane.pper
index 8eebf76..3249574 100644
--- a/.Rproj.user/8F0F1B72/pcs/source-pane.pper
+++ b/.Rproj.user/8F0F1B72/pcs/source-pane.pper
@@ -1,3 +1,3 @@
 {
-    "activeTab" : 24
+    "activeTab" : 2
 }
\ No newline at end of file
diff --git a/.Rproj.user/8F0F1B72/pcs/windowlayoutstate.pper b/.Rproj.user/8F0F1B72/pcs/windowlayoutstate.pper
index 9994e46..40e70f7 100644
--- a/.Rproj.user/8F0F1B72/pcs/windowlayoutstate.pper
+++ b/.Rproj.user/8F0F1B72/pcs/windowlayoutstate.pper
@@ -1,14 +1,14 @@
 {
     "left" : {
-        "panelheight" : 601,
-        "splitterpos" : 257,
+        "panelheight" : 795,
+        "splitterpos" : 339,
         "topwindowstate" : "NORMAL",
-        "windowheight" : 639
+        "windowheight" : 833
     },
     "right" : {
-        "panelheight" : 601,
-        "splitterpos" : 342,
+        "panelheight" : 795,
+        "splitterpos" : 452,
         "topwindowstate" : "NORMAL",
-        "windowheight" : 639
+        "windowheight" : 833
     }
 }
\ No newline at end of file
diff --git a/.Rproj.user/8F0F1B72/pcs/workbench-pane.pper b/.Rproj.user/8F0F1B72/pcs/workbench-pane.pper
index 7c24819..89485f6 100644
--- a/.Rproj.user/8F0F1B72/pcs/workbench-pane.pper
+++ b/.Rproj.user/8F0F1B72/pcs/workbench-pane.pper
@@ -1,5 +1,5 @@
 {
-    "TabSet1" : 0,
+    "TabSet1" : 2,
     "TabSet2" : 2,
     "TabZoom" : {
     }
diff --git a/.Rproj.user/8F0F1B72/persistent-state b/.Rproj.user/8F0F1B72/persistent-state
index ce8a37b..b730923 100644
--- a/.Rproj.user/8F0F1B72/persistent-state
+++ b/.Rproj.user/8F0F1B72/persistent-state
@@ -1,6 +1,6 @@
 build-last-errors="[]"
-build-last-errors-base-dir="C:/Users/Andy/Desktop/r-package/startml/"
-build-last-outputs="[{\"output\":\"==> devtools::document(roclets=c('rd', 'collate', 'namespace'))\\n\\n\",\"type\":0},{\"output\":\"Updating startml documentation\\r\\n\",\"type\":2},{\"output\":\"Loading startml\\r\\n\",\"type\":2},{\"output\":\"Writing startml.Rd\\r\\n\",\"type\":1},{\"output\":\"Documentation completed\\n\\n\",\"type\":1},{\"output\":\"==> Rcmd.exe INSTALL --no-multiarch --with-keep.source startml\\n\\n\",\"type\":0},{\"output\":\"* installing to library 'C:/Users/Andy/Documents/R/win-library/3.4'\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"* installing *source* package 'startml' ...\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** R\\r\\n\",\"type\":1},{\"output\":\"** preparing package for lazy loading\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** help\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"*** installing help indices\\r\\n\",\"type\":1},{\"output\":\"** building package indices\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** testing if installed package can be loaded\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"* 
DONE (startml)\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1}]"
+build-last-errors-base-dir="C:/Users/Andy/Desktop/auto/startml/"
+build-last-outputs="[{\"output\":\"==> devtools::document(roclets=c('rd', 'collate', 'namespace'))\\n\\n\",\"type\":0},{\"output\":\"Updating startml documentation\\r\\n\",\"type\":2},{\"output\":\"Loading startml\\r\\n\",\"type\":2},{\"output\":\"Documentation completed\\n\\n\",\"type\":1},{\"output\":\"==> Rcmd.exe INSTALL --no-multiarch --with-keep.source startml\\n\\n\",\"type\":0},{\"output\":\"* installing to library 'C:/Users/Andy/Documents/R/win-library/3.4'\\r\\n\",\"type\":1},{\"output\":\"* installing *source* package 'startml' ...\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** R\\r\\n\",\"type\":1},{\"output\":\"** preparing package for lazy loading\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** help\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"*** installing help indices\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** building package indices\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** testing if installed package can be loaded\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"* DONE (startml)\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1}]"
 compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}"
 console_procs="[]"
 files.monitored-path=""
diff --git a/.Rproj.user/8F0F1B72/sdb/prop/INDEX b/.Rproj.user/8F0F1B72/sdb/prop/INDEX
index 438130a..ce9505b 100644
--- a/.Rproj.user/8F0F1B72/sdb/prop/INDEX
+++ b/.Rproj.user/8F0F1B72/sdb/prop/INDEX
@@ -1,5 +1,8 @@
 C%3A%2FUsers%2FAndy%2FDesktop%2Fauto%2Fstart_setup.R="740FC4E5"
+C%3A%2FUsers%2FAndy%2FDesktop%2Fauto%2Fstartml%2FDESCRIPTION="AEB39D1D"
+C%3A%2FUsers%2FAndy%2FDesktop%2Fauto%2Fstartml%2FNAMESPACE="B9312ADA"
 C%3A%2FUsers%2FAndy%2FDesktop%2Fauto%2Fstartml%2FR%2Fstartml.R="EBF1AC1E"
+C%3A%2FUsers%2FAndy%2FDesktop%2Fauto%2Fstartml%2FR%2Ftest-metric.R="E7F383DA"
 C%3A%2FUsers%2FAndy%2FDesktop%2Fr-package%2Fstartml%2FDESCRIPTION="EB2EF4CE"
 C%3A%2FUsers%2FAndy%2FDesktop%2Fr-package%2Fstartml%2FNAMESPACE="81F7EBEA"
 C%3A%2FUsers%2FAndy%2FDesktop%2Fr-package%2Fstartml%2FR%2Fautotrain.R="BF938F44"
diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/201230D9 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/201230D9
deleted file mode 100644
index 6d070cd..0000000
--- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/201230D9
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-    "collab_server" : "",
-    "contents" : "#' autotrain\n#'\n#' Autotrain implements H2O grid search to automatically build machine learning\n#' models\n#'\n#' @param train H2O frame object containing labeled data for model training.\n#' No Default.\n#' @param valid H2O frame object containing labeled data for model validation.\n#' No Default.\n#' @param y Character object of length 1 identifying the column name of the target variable. No Default.\n#' @param x Character object of length 1 or more identifying the column name(s) of the input variables. No Default.\n#' @param algorithms Character object of length 3, 2, or 1, specifying which alrogrithms to automatically train. The autotrain function will run a separate grid search for each algorimth type. Choices are: \"deeplearning\", \"randomForest\", and \"gbm\" following the naming convention in H2O version 3. Defaults to c(\"deeplearning\", \"randomForest\", \"gbm\").\n#' @param eval_metric Character object defining evaluation metric for training. 
Defualt is \"AUTO\" and uses built-in H2O automatic choice for target data type.\n#' @param validation_type Defines validation type for training models. Defaults to \"shared_holdout\" indicating all model built with all algorithms share the same validation set. Currently, this is the only option in autotrain. Planned types include \"random_holdout\" where each model will get a unique randomized sample of labeled data for validation, and \"xval\" in which the cross validation functionality in H2O will be implemented in every model.\n#' @param runtime_secs Character Object which sets the length of time each grid search will run. Defaults to 20, thus the default runtime is 20 sec * (length of algorimths) = 1 minute.\n# @keywords gradient boosting, deep learning, random forest, gird serach optimization, automatic, training\n#' @param wd Character object defining file path where resulting modeling will be saved. Defualts to current working directory.\n#' @return List object containing H2O model objects\n#' @export\nautotrain <- function(train,\n valid,\n y,\n x,\n algorithms = c(\"deeplearning\", \"randomForest\", \"gbm\"),\n eval_metric = \"AUTO\",\n validation_type = \"SharedHoldout\", # add RandomHoldout and cv\n runtime_secs = 10,\n wd = getwd()) {\n\n model_paths <- NULL\n\n if(sum(as.numeric(algorithms %in% \"deeplearning\")) == 1) {\n dl_autogrid(train = train,\n valid = valid,\n y = y,\n x = x,\n eval_metric = eval_metric,\n deeplearning_runtime_secs = runtime_secs)\n model_paths <- c(model_paths, paste(wd, \"/dl_models\", sep = \"\"))\n }\n if(sum(as.numeric(algorithms %in% \"randomForest\")) == 1) {\n rf_autogrid(train = train,\n valid = valid,\n y = y,\n x = x,\n eval_metric = eval_metric,\n rf_runtime_secs = runtime_secs)\n model_paths <- c(model_paths, paste(wd, \"/rf_models\", sep = \"\"))\n }\n if(sum(as.numeric(algorithms %in% \"gbm\")) == 1) {\n gbm_autogrid(train = train,\n valid = valid,\n y = y,\n x = x,\n eval_metric = eval_metric,\n gbm_runtime_secs = runtime_secs)\n model_paths <- c(model_paths, paste(wd, \"/gbm_models\", sep = \"\"))\n }\n if(sum(as.numeric(algorithms %in% \"gbm\") + as.numeric(algorithms %in% \"randomForest\") +\n as.numeric(algorithms %in% \"deeplearning\") == 0)) {\n stop(\"Set algorithms to one or a combination of 'deeplearning', 'randomForest', 'gbm'\")\n }\n\n all_models <- load_models(model_paths)\n all_models\n}\n", - "created" : 1496000532147.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "3785166776", - "id" : "201230D9", - "lastKnownWriteTime" : 1495997698, - "last_content_update" : 1495997698, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/autotrain.R", - "project_path" : "R/autotrain.R", - "properties" : { - }, - "relative_order" : 9, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/2DF3C051 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/2DF3C051 deleted file mode 100644 index 904f83b..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/2DF3C051 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' predict_blob\n#'\n#' Uses all selected models stored in mlblob object to make predictions\n#'\n#' @param test H2O frame object used as new data input.\n#' @param selected_models List of H2O models to predict on new data.\n#' @return List of predictions same as length selected_models\n#' @export\npredict_blob <- function(test, selected_models) {\n #cat(\"Predicting on New Data With Selected 
Models\\n\")\n predictions <- lapply(selected_models, h2o.predict, newdata = test)\n predictions\n}\n", - "created" : 1496010280579.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "2058516111", - "id" : "2DF3C051", - "lastKnownWriteTime" : 1496013486, - "last_content_update" : 1496013486047, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/predict-blob.R", - "project_path" : "R/predict-blob.R", - "properties" : { - }, - "relative_order" : 17, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/3118EB1D b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/3118EB1D deleted file mode 100644 index 1412bec..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/3118EB1D +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' dl_autogrid\n#'\n#' dl_autogrid is a wrapper employing built-in settings to run grid search hyper parameter optimizations on deep learning (dl_ algorithms)\n#'\n#' @param train H2O frame object containing labeled data for model training.\n#' No Default.\n#' @param valid H2O frame object containing labeled data for model validation.\n#' No Default.\n#' @param y Character object of length 1 identifying the column name of the target variable. No Default.\n#' @param x Character object of length 1 or more identifying the column name(s) of the input variables. No Default.\n#' @param folds Character object defining number of folds for xval. Default is NULL and currently is not implemented.\n#' @param deeplearning_runtime_secs Numeric object defining total number of seconds the hyper parameter grid search will run.\n#' @param deeplearning_stopping_rounds Numeric object defining maximum number of training rounds an individual deep learning model not improving will continue to run. Default is 10.\n#' @param deeplearning_stopping_tolerance Numeric object which sets the mimmum loss funciton improvement for a training iteration to be considered an improvement. Defulat is 1E-5.\n#' @param deeplearning_adaptive_rate Boolean, if TRUE ADELTA is used to control learning rate if FALSE than normal rate controls can be used.\n#' @param grid_strategy Character object default and only current supported option is \"randomDiscrete\"\n#' @param eval_metric Character object defining evaluation metric for training. Defualt is \"AUTO\" and uses built-in H2O automatic choice for target data type.\n#' @param wd Character object defining file path where dl_models folder will be created and deep learning models saved. Defaults to current working directory.\n#' @return List object containing H2O model objects. 
Additionally saves h2o models as re-loadable text files in wd/dl_models folder.\n#' @export\ndl_autogrid <- function(train,\n valid,\n y,\n x,\n eval_metric = \"AUTO\",\n wd = getwd(),\n folds = NULL,\n deeplearning_runtime_secs = 10,\n deeplearning_stopping_rounds = 10,\n deeplearning_stopping_tolerance = 1e-5,\n deeplearning_adaptive_rate = TRUE,\n grid_strategy = \"RandomDiscrete\") {\n\n cat(\"Training Deep Learning Models\\n\")\n #==============================================\n dl_parameter_search <- list(rate= c(1e-9, 1e-8, 1e-7, 1e-6),\n rate_annealing= c(1e-12, 1e-9, 1e-6),\n momentum_start= c(0.8, 0.9),\n momentum_stable= c(0.95, 0.99),\n momentum_ramp= 1/seq(1e-12, 1e-9, 1e-6),\n score_duty_cycle= c(0.02, 0.05, 0.1),\n activation = c(\"RectifierWithDropout\",\n \"TanhWithDropout\",\n \"MaxoutWithDropout\"),\n hidden = list(c(200,200,200),\n c(512,512,512),\n c(32,32,32),\n c(64, 64, 64)),\n input_dropout_ratio = c(0, 0.05, 0.1),\n hidden_dropout_ratios = list(c(0, 0, 0),\n c(0.1, 0.1, 0.1),\n c(0.2, 0.2, 0.2),\n c(0.5, 0.5, 0.5)),\n l1= c(1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 0),\n l2= c(1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 0),\n max_w2= c(10, 20, 40),\n epsilon = c(1e-10, 1e-8, 1e-6, 1e-4),\n rho = c(0.97, 0.98, 0.98))\n\n #========================================================\n # set variable type for proper auto options\n if(deeplearning_adaptive_rate == TRUE) {\n hyper_params <- dl_parameter_search[seq(7,15)]\n }\n if(deeplearning_adaptive_rate == FALSE) {\n hyper_params <- dl_parameter_search[seq(1:13)]\n }\n\n dl_search_criteria = list(strategy = grid_strategy,\n max_runtime_secs = deeplearning_runtime_secs,\n stopping_rounds = deeplearning_stopping_rounds,\n stopping_tolerance = deeplearning_stopping_tolerance,\n seed = 1234) # needs to be changable\n\n # run the grid\n # needs be removed first for iterating within same session\n h2o.rm(\"dl\")\n dl_random_grid <- h2o.grid(algorithm=\"deeplearning\",\n grid_id = \"dl\", # makes repeat run impossible\n training_frame=train,\n validation_frame = valid,\n x = x,\n y = y,\n standardize = TRUE,\n epochs=1000, #needs to change\n overwrite_with_best_model = TRUE,\n adaptive_rate = deeplearning_adaptive_rate,\n hyper_params = dl_parameter_search,\n search_criteria = dl_search_criteria,\n stopping_metric = eval_metric,\n seed = 1234) # needs to be changable\n #====================================================\n #dl_grid <- h2o.getGrid(\"dl\")\n\n # write out the models to disk\n dl_path <- paste(wd, \"/dl_models\", sep = \"\")\n dl_model_files <- sapply(dl_random_grid@model_ids, function(m) h2o.saveModel(h2o.getModel(m), path = dl_path, force = TRUE))\n\n # print out alert\n cat(paste(\"Deep Learning Models Saved To:\\n\", dl_path, \"\\n\\n\"))\n dl_random_grid\n}\n", - "created" : 1495999358277.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "134855136", - "id" : "3118EB1D", - "lastKnownWriteTime" : 1496011093, - "last_content_update" : 1496011093978, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/dl-autogrid.R", - "project_path" : "R/dl-autogrid.R", - "properties" : { - }, - "relative_order" : 6, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/31B8BC80 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/31B8BC80 deleted file mode 100644 index b29fd40..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/31B8BC80 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' even_lengths\n#'\n#' Small, 
internal function needed for plotting. Not intended for direct use.\n#'\n#' @param train_rmse List object contiaing model performance numbers. No Default.\n#' @export\neven_lengths <- function(train_rmse) {\n max_length <- max(unlist(lapply(train_rmse, length)))\n train_hist <- lapply(train_rmse, paste_nas, longest = max_length)\n train_hist\n}\n", - "created" : 1496013374105.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "1139479894", - "id" : "31B8BC80", - "lastKnownWriteTime" : 1496000329, - "last_content_update" : 1496000329, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/even-lengths.R", - "project_path" : "R/even-lengths.R", - "properties" : { - }, - "relative_order" : 23, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/37AD3D49 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/37AD3D49 deleted file mode 100644 index 7ec172c..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/37AD3D49 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' plot\n#'\n#' Masked from graphics. Summary plot of mlblob object.\n#'\n#' @param mlout mlblob object from output of startml function.\n#' @return None. Plots graphic to device.\n#' @export\nplot <- function(mlout) { suppressWarnings(\n if(class(mlout)[1] == \"mlblob\") {\n if(class(mlout@models[[1]]) == \"H2OBinomialModel\") {\n stop(\"Does not yet support binomial model summary\")\n } else if(class(mlout@models[[1]]) == \"H2ORegressionModel\") {\n train_rmse <- lapply(mlout@models, get_hist)\n longest <- max(unlist(lapply(train_rmse, length)))\n train_hist <- even_lengths(train_rmse)\n hist_df <- as.data.frame(do.call('cbind', train_hist))\n ids <- sapply(mlout@models, get_ids)\n ids_split <- sapply(names(ids), strsplit, split = \"/\")\n ids_final <- sapply(ids_split, `[`, length(ids_split[[1]]))\n iter <- seq(0, longest - 1, by = 1)\n colnames(hist_df) <- ids_final\n hist_df$iteration <- iter\n hist_melted <- melt(hist_df, ncol(hist_df))\n hist_melted <- hist_melted[-which(is.na(hist_melted$value)),]\n ids_final <- as.list(ids_final)\n hist_lab <- lapply(ids_final, hist_text, hist_melted = hist_melted)\n hist_all <- do.call('rbind', hist_lab)\n p_history <- ggplot(hist_all) +\n geom_line(aes(x = iteration, y = value, color = variable),\n alpha = 0.5, size = 1.2) +\n geom_text(aes(label = variable, x = lab_x, colour = variable,\n y = lab_y, hjust = \"inward\"),\n alpha = 1, check_overlap = TRUE, size = 3) +\n guides(color = FALSE) +\n ggtitle(\"Training History of Models on Valid\") +\n ylab(\"RMSE\") +\n xlab(\"Iterations\")\n # make the histograms\n y = mlout@y\n all_target <- as.data.frame(mlout@labeled_data[[1]][,y])[,1]\n max_length <- length(all_target)\n train_target <- c(as.data.frame(mlout@train[[1]][,y])[,1],\n rep(NA, max_length - nrow(mlout@train[[1]][,y])))\n valid_target <- c(as.data.frame(mlout@valid[[1]][,y])[,1],\n rep(NA, max_length - nrow(mlout@valid[[1]][,y])))\n test_target <- c(as.data.frame(mlout@test[[1]][,y])[,1],\n rep(NA, max_length - nrow(mlout@test[[1]][,y])))\n target_df <- data.frame(all = all_target,\n train = train_target,\n valid = valid_target,\n test = test_target)\n\n # now melt\n target_melted <- melt(target_df)\n # now plot\n p_target <- ggplot(target_melted) +\n geom_histogram(aes(x = value, y = ..density..), bins = 20) +\n geom_vline(data = ddply(target_melted , \"variable\",\n summarize, wavg = mean(na.omit(value))),\n aes(xintercept=wavg, color = \"green\")) 
+\n geom_vline(data = ddply(target_melted , \"variable\", summarize,\n wavg = median(na.omit(value))),\n aes(xintercept=wavg, color = \"orange\")) +\n geom_density(aes(value, color = \"blue\"), alpha = 0.8) +\n facet_wrap(~variable) +\n scale_color_manual(name = '', values = c(\"green\" = \"green\",\n \"orange\" = \"orange\",\n \"blue\" = \"blue\"),\n labels = c(\"Kernel\", 'Mean','Median')) +\n xlab(y) +\n ylab(\"Density\") +\n ggtitle(paste(y, \"in Labeled Data Splits\")) +\n theme(axis.text.x=element_text(angle = -45, hjust = 0))\n # make the xy plot ======================\n for(i in 1:length(mlout@predict_test)) {\n names(mlout@predict_test[[i]]) <- ids_final[[i]]\n }\n xy_df <- do.call(h2o.cbind, mlout@predict_test)\n xy_df$labeled <- mlout@test[[1]][,y]\n xy_melted <- melt(as.data.frame(xy_df), ncol(xy_df))\n p_xy <- ggplot(xy_melted) +\n geom_point(aes(x = labeled, y = value, color = variable), alpha = 0.5) +\n geom_point(aes(x = labeled, y = labeled), color = \"black\", alpha = 0.5) +\n guides(color = FALSE) +\n xlab(paste(\"Labeled\", y)) +\n ylab(paste(\"Predicted\", y)) +\n ggtitle(\"Labels vs Predictions on Test\")\n # the order plot ======================\n pred_melted <- melt(as.data.frame(xy_df), ncol(xy_df))\n p_order <- ggplot(pred_melted[order(pred_melted$labeled),]) +\n geom_point(aes(x = seq(1, nrow(pred_melted)), y = value,\n color = variable), alpha = 0.6) +\n geom_point(aes(x = seq(1, nrow(pred_melted)), y = labeled),\n size = .8) +\n scale_color_discrete(guide=FALSE) +\n ylab(y) +\n xlab(paste(\"Index: Ordered By Asending\", y)) +\n ggtitle(\"Labels and Predictions on Test\")\n # Plot everything on the grid\n grid.arrange(p_history, p_order, p_target, p_xy, ncol = 2, nrow = 2)\n }\n } else {\n plot(mlout)\n }\n)}\n", - "created" : 1496010164534.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "1746454099", - "id" : "37AD3D49", - "lastKnownWriteTime" : 1496013471, - "last_content_update" : 1496013471572, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/plot.R", - "project_path" : "R/plot.R", - "properties" : { - }, - "relative_order" : 18, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44C1FDFA b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44C1FDFA deleted file mode 100644 index 7aba222..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44C1FDFA +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' select_models\n#'\n#' Caculate performance metrics from models on new data. Depends on Metrics package.\n#' @param prediction_list List object of H2O frames containing predictions.\n#' No Default.\n#' @param test H2O frame object containing labeled data for model evaluation.\n#' No Default.\n#' @param eval_metric Character object one of logloss, MSE, RMSE, MAE, AUC, or mean_per_class_error.\n#' @param y Character object of length 1 identifying the column name of the target variable. 
No Default.\n#' @return List object same length as prediction_list containing performance of each model on test input with selected metric.\n#' @export\n# get test holdout metrics from models # depends on Metrics package for now\ntest_metric_h2o <- function(prediction_list, test, eval_metric, y) {\n if(eval_metric == \"AUC\" | eval_metric == \"logloss\") {\n predictions <- lapply(prediction_list, function(x)x[,3])\n } else {\n predictions <- lapply(prediction_list, function(x)x[,1])\n }\n actual <- test[,y]\n if(eval_metric == \"logloss\") {\n metric <- lapply(predictions, FUN = logLoss, actual = actual)\n } else if(eval_metric == \"MSE\") {\n metric <- lapply(predictions, FUN = mse, actual = actual)\n } else if(eval_metric == \"RMSE\") {\n metric <- lapply(predictions, FUN = rmse, actual = actual)\n } else if(eval_metric == \"MAE\") {\n metric <- lapply(predictions, FUN = mae, actual = actual)\n } else if(eval_metric == \"AUC\") {\n metric <- lapply(predictions, FUN = auc, actual = actual)\n } else if(eval_metric == \"mean_per_class_error\") {\n metric <- lapply(predictions, FUN = ce, actual = actual)\n }else if(eval_metric == \"RMSLE\") {\n metric <- lapply(predictions, FUN = rmsle, actual = actual)\n } else {\n stop(\"Choose an eval metric: logloss, MSE, RMSE, MAE, AUC, mean_per_class_error\")\n }\n metric\n}\n", - "created" : 1496011524565.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "3153945181", - "id" : "44C1FDFA", - "lastKnownWriteTime" : 1496012510, - "last_content_update" : 1496012510995, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/test-metric.R", - "project_path" : "R/test-metric.R", - "properties" : { - }, - "relative_order" : 20, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44DC5155 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44DC5155 deleted file mode 100644 index fdc68b0..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/44DC5155 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#=========================================================\n# first try at full R package..\n#============================================================\n\nlibrary(devtools)\ninstall_github(\"andrewsommerlot/startml\")\n\nlibrary(startml)\n\nh2o.removeAll()\nh2o.shutdown(prompt = FALSE)\nh2o.init(nthreads=6, max_mem_size=\"12G\")\n\nwd <- \"C:/Users/Andy/Desktop/auto\"\nsetwd(wd)\n\ntrain_file <- \"train.csv\"\ntest_file <- \"test.csv\"\n\n\nload_data <- function(train_file) {\n df1 <- h2o.importFile(path = normalizePath(train_file))\n df1\n}\n\n# test\ndf1 <- load_data(train_file)\nnew_data <- load_data(test_file)\n\n# define the inputs\ny = \"SalePrice\"\nx <- setdiff(names(df1), y)[-1]\nid <- names(df1)[1]\n# test on regression data\n#library(methods)\n\n# run the ml file on binary classification.\nmlout <- startml(labeled_data = df1,\n newdata = new_data,\n x = x,\n label_id = id,\n y = \"SalePrice\",\n y_type = \"continuous\",\n eval_metric = \"RMSE\",\n eval_threshold = NULL,\n validation_type = \"shared_holdout\",\n algorithms = c(\"gbm\", \"deeplearning\", \"randomForest\"),\n percent_valid_holdout = 10,\n percent_test_holdout = 10,\n runtime_secs = 60,\n split_seed = 1234,\n number_top_models = NULL,\n correlation_threshold = NULL)\n\nplot(mlout)\n\n\n\n\n\n\n\n#============================================================\n## the prepare function\n# if shat breaks:\n#install.packages(\"h2o\", type=\"source\", 
repos=(c(\"http://h2o-release.s3.amazonaws.com/h2o/rel-tverberg/4/R\")))\n# load libraries\n#install.packages(\"h2o\")\nlibrary(h2o)\n#library(cvAUC)\ninstall.packages(c(\"Metrics\", \"ggplot2\", \"reshape2\"))\nlibrary(Metrics)\nlibrary(ggplot2)\nlibrary(reshape2)\nlibrary(plyr)\ninstall.packages(\"gridExtra\")\nlibrary(gridExtra)\nlibrary(plotly)\ninstall.packages(\"tsne\")\nlibrary(tsne)\ninstall.packages(\"plotly\")\nlibrary(plotly)\n\n\n#library(h2oEnsemble)\n\n\n## start decent size (more ram than needed for this)\nh2o.shutdown(prompt = FALSE)\nh2o.init(nthreads=6, max_mem_size=\"12G\")\n\nh2o.removeAll()\n\nload_files <- as.list(list.files(path = wd <- \"C:/Users/Andy/Desktop/auto/startml/R/\",\n pattern = \"*\", full.names = TRUE))\n#load_files <- load_files[-which(load_files %in% \"C:/Users/Andy/Desktop/auto/start.ml/start_setup.R\")]\n\ngetfiles <- lapply(load_files, source)\n\n# extra\nwd <- \"C:/Users/Andy/Desktop/auto\"\nsetwd(wd)\n\ntrain_file <- \"numerai_training_data.csv\"\ntest_file <- \"numerai_tournament_data.csv\"\ny_name <- \"target\"\ny_type <- \"discrete\" # or \"continous\"\n\n#============================================================\n# the load data function.\nstart.loaddata <- function(train_file) {\n df1 <- h2o.importFile(path = normalizePath(train_file))\n df1\n}\n\n# test need to rename all test things that are actually new data, new data\ndf1 <- start.loaddata(train_file)\nnewdata <- start.loaddata(test_file)\n\n#h2o.init(strict_version_check = FALSE)\n# run the ml file on binary classification.\nmlout <- startml(labeled_data = df1, newdata = newdata,\n y = \"target\", y_type = \"discrete\",\n algorithms = c(\"deeplearning\", \"randomForest\", \"gbm\"),\n eval_metric = \"AUC\",\n validation_type = \"shared_holdout\", # add RandomHoldout and cv\n percent_valid_holdout = 10,\n percent_test_holdout = 10,\n runtime_secs = 5,\n split_seed = 1234,\n number_top_models = NULL,\n eval_threshold = NULL,\n correlation_threshold = NULL,\n return_dataframe = FALSE)\n\nstart.plot(mlout)\n\n\n\ntest <- start.gbmgrid(train = df1,\n y_name = \"target\",\n y_type = \"discrete\",\n eval_metric = \"AUTO\",\n validation_type = \"SharedHoldout\",\n wd = getwd(),\n percent_valid_holdout = 10,\n percent_test_holdout = 10,\n folds = NULL,\n gbm_min_depth = 1,\n gbm_max_depth = 7,\n gbm_runtime_secs = 10,\n gbm_stopping_rounds = 10,\n gbm_stopping_tolerance = 1e-5,\n grid_strategy = \"RandomDiscrete\",\n split_seed = NULL)\n\n\n\n\n\n# for testing\n# run the ml file on binary classification.\nmodels <- start.autotrain(train = df1,\n y_name = \"target\",\n y_type = \"discrete\",\n eval_metric = \"logloss\",\n validation_type = \"SharedHoldout\",\n split_seed = 1234\n)\n\n\n#============================================================\n# the load data function.\n# now try a regression ......\n# test on kaggle housing prices ...\nh2o.removeAll()\n\nload_files <- as.list(list.files(path = \"C:/Users/Andy/Desktop/auto/startml/R\",\n pattern = \"*\", full.names = TRUE))\n#load_files <- load_files[-which(load_files %in% \"C:/Users/Andy/Desktop/auto/start.ml/R/start_setup.R\")]\n\ngetfiles <- lapply(load_files, source)\n\n# extra\nwd <- \"C:/Users/Andy/Desktop/auto\"\nsetwd(wd)\n\ntrain_file <- \"train.csv\"\ntest_file <- \"test.csv\"\n\n\nstart.loaddata <- function(train_file) {\n df1 <- h2o.importFile(path = normalizePath(train_file))\n df1\n}\n\n# test\ndf1 <- start.loaddata(train_file)\nnew_data <- start.loaddata(test_file)\ny = \"SalePrice\"\nx <- setdiff(names(df1), y)[-1]\nid <- 
names(df1)[1]\n# test on regression data\n# run the ml file on binary classification.\nmlout <- startml(labeled_data = df1,\n newdata = new_data,\n x = x,\n label_id = id,\n y = \"SalePrice\",\n y_type = \"continuous\",\n eval_metric = \"RMSE\",\n eval_threshold = NULL,\n validation_type = \"shared_holdout\",\n algorithms = c(\"deeplearning\"),\n percent_valid_holdout = 10,\n percent_test_holdout = 10,\n runtime_secs = 60*5,\n split_seed = 1234,\n number_top_models = NULL,\n correlation_threshold = 1\n\n)\n\nplot(mlout)\n\n\ngetfiles <- lapply(load_files, source)\n\nin_model <- mlout@models$`C:/Users/Andy/Desktop/auto/dl_models/dl_model_0`\n\n#model, vis_data, layer, label, dimentions = 2, max_points = 1000)\nplot_dlayer(model = in_model,\n vis_data = df1,\n layer = 1,\n label = \"LotArea\",\n dimentions = 3,\n max_points = 1400,\n tsne_iter = 200)\n\n\ntdat <- as.data.frame(df1)\n\nggplot(tdat) +\n geom_point(aes(x = Neighborhood, y = YearRemodAdd)) +\n coord_flip()\n\n\ntest <- data.frame(one = c(1,2,3), two = c(4,5,6), three = c(7,8,9), lab = as.factor(c(\"this\", \"that\", \"this\")),\n colors = c('#BF382A', '#0C4B8E'))\n p <- plot_ly(test,\n x = ~one,\n y = ~two,\n z = ~three,\n color = ~lab) %>%\n add_markers() %>%\n layout(scene = list(xaxis = list(title = \"tSNE Dim 1\"),\n yaxis = list(title = \"tSNE Dim 2\"),\n zaxis = list(title = \"tSNE Dim 3\")))\n\nhtmlwidgets::saveWidget(as_widget(p), \"dl-hidden-layer-plot.html\")\n\n\n\n# for now to get different metrics working\nmodel_list <- start.autotrain(train = df1,\n #test = test,\n y_name = \"SalePrice\",\n y_type = \"continuous\",\n eval_metric = \"MSE\",\n validation_type = \"SharedHoldout\",\n runtime_secs = 7200,\n split_seed = 1234\n)\n\n\nmetric <- start.validmetric(model_list, eval_metric = \"RMSLE\")\n\nsorted_models <- start.sortmodels(model_list, eval_metric = eval_metric)\n\nselected_models <- start.selectmodels(sorted_models, model_list, number_top_models = 5)\n\n\npredictions <- start.predict(test = test, selected_models)\n\n# test\n# make a data frame\nr_data <- lapply(predictions, as.data.frame)\npred_df <- do.call('cbind', r_data)\nbasic_bag <- rowMeans(pred_df)\nr_test <- as.data.frame(test)\nid <- r_test$Id\noutput <- data.frame(Id = id, SalePrice = basic_bag)\n\nwrite.csv(output, \"test_sub_2.csv\", row.names = FALSE, quote = FALSE)\n\n# view all models\n\nvalidations <- start.predict(train, selected_models)\nr_val <- lapply(validations, as.data.frame)\nvalid_df <- do.call('cbind', r_val)\n\n#!! 
some correlation check\ncorr <- cor(valid_df)\n# uncorrelateds are 140 and 300\n\ncor(valid_df[,1], valid_df[,80])\n\nuncor <- c(1, 80)\n\nval_bag <- rowMeans(valid_df)\n\nperformance <- data.frame(valid_df, val_mean = val_bag, as.data.frame(train$SalePrice))\nperformance <- data.frame(valid_df[, uncor], val_mean = val_bag, as.data.frame(valid$SalePrice))\n\n\nlibrary(ggplot2)\nlibrary(reshape2)\nlibrary(magrittr)\n\nm_per <- melt(performance, c(ncol(performance) - 1, ncol(performance)))\n\n# probably deleate\n#m_per$variable <- as.numeric(m_per$variable)\n\nm_per[order(m_per$SalePrice),] %>%\n ggplot() +\n geom_point(aes(x = seq(1, nrow(m_per)), y = value, color = variable), alpha = 0.3) +\n geom_point(aes(x = seq(1, nrow(m_per)), y = SalePrice), col = \"blue\") +\n geom_point(aes(x = seq(1, nrow(m_per)), y = val_mean),color = \"black\", alpha = 0.8, size = .5) +\n scale_color_discrete(guide=FALSE) +\n #theme_grey() +\n xlim(c(6000,6750)) +\n ylim(c(200000, 800000))\n\n\n\n\n\n\n\n\n\n\n\n\n\n", - "created" : 1495998147992.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "513372949", - "id" : "44DC5155", - "lastKnownWriteTime" : 1496021566, - "last_content_update" : 1496021566476, - "path" : "C:/Users/Andy/Desktop/auto/start_setup.R", - "project_path" : null, - "properties" : { - }, - "relative_order" : 4, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/48451B7A b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/48451B7A deleted file mode 100644 index c4ead5c..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/48451B7A +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' hist_text\n#'\n#' Internal function used in plotting mlblob objects. Gets model id from h2o model object\n#' @param id_final Character Model Id.\n#' @param hist_melted data.frame object contianing long form training history of H2O model.\n#' @return Character object Model id.\n#' @export\nhist_text <- function(id_final, hist_melted) {\n sub <- hist_melted[hist_melted$variable %in% id_final,]\n lab_x <- max(sub$iteration)\n lab_y <- sub$value[which(sub$iteration == lab_x)]\n col_x <- rep(lab_x, nrow(sub))\n col_y <- rep(lab_y, nrow(sub))\n sub$lab_x <- col_x\n sub$lab_y <- col_y\n sub$lab_x[which(sub$iteration != lab_x)] <- NA\n sub$lab_y[which(sub$iteration != lab_x)] <- NA\n sub\n}\n", - "created" : 1496009323907.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "2222755855", - "id" : "48451B7A", - "lastKnownWriteTime" : 1496013409, - "last_content_update" : 1496013409793, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/hist-text.R", - "project_path" : "R/hist-text.R", - "properties" : { - }, - "relative_order" : 14, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/5866E996 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/5866E996 deleted file mode 100644 index c6f89e5..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/5866E996 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' startml\n#'\n#' startml is designed to run automatic hyperparameter searches for deep leaning\n#' gradient boosted machine, and random forest models. It selects best models, and combines\n#' or ensembles them in hopes making good predictions from an ensemble or highly skilled single\n#' model using just one function call. 
Machine learning algorithms are provided by h2o and\n#' run on the h2o JVM platform outside of the R workspace. Thus, much of the functionalies in startml are\n#' scalable. Currently, startml only supports regression and binary classification.\n#'\n#' @param labeled_data H2O frame object containing labeled data for model training.\n#' No Default.\n#' @param newdata H2O frame object containing unlabeled data for model predictions.\n#' No Default.\n#' @param y Character object of length 1 identifying the column name of the target variable. No Default.\n#' @param y_type Character object of length 1 identifying the type of data the target variable is. Can be \"continuous\" or \"discrete.\" Coninuous automatically creates regression models, and discrete automatically creates binomial models. Currently, startml only supports regression and binary classification.\n#' @param x Character object of length 1 or more identifying the column name(s) of the input variables. Default NULL, uses all remaining variables in labeled_data as inputs. Newdata must contian all of these input column names.\n#' @param label_id Character object of length 1 identifying the name of the column of observation IDs in labeled_data. If used, must match column of same name in newdata. startml will ignore this column as an input, but include it as an ID column in prediction outputs.\n#' @param algorithms Character object of length 3, 2, or 1, specifying which alrogrithms to automatically train. The autotrain function will run a separate grid search for each algorimth type. Choices are: \"deeplearning\", \"randomForest\", and \"gbm\" following the naming convention in H2O version 3. Defaults to c(\"deeplearning\", \"randomForest\", \"gbm\").\n#' @param eval_metric Character object defining evaluation metric for training. Defualt is \"AUTO\" and uses built-in H2O automatic choice for target data type.\n#' @param validation_type Defines validation type for training models. Defaults to \"shared_holdout\" indicating all model built with all algorithms share the same validation set. Currently, this is the only option in autotrain. Planned types include \"random_holdout\" where each model will get a unique randomized sample of labeled data for validation, and \"xval\" in which the cross validation functionality in H2O will be implemented in every model.\n#' @param percent_valid_holdout Numeric object of value 0 to 100. Sets the percent of the labeled data that will be used for holdout validation. Default is 10. Is ignored if validation_type = \"xval.\" Currently startml only supports \"shared_holdout\" validation.\n#' @param percent_test_holdout Numeric object of value 0 to 100. Sets the percent of the labeled data that will be used for test holdout for model selection. Default is 10.\n#' @param runtime_secs Character Object which sets the length of time each grid search will run. Defaults to 20, thus the default runtime is 20 sec * (length of algorimths) = 1 minute.\n#' @param split_seed Random seed for splitting labeled data into train, validation, and test components. Currently, startml only supports random sampling splits, this argument sets the random seed for these splits, making the data set separation process reproducible. Since this is a \"naive\" random split, labeled data should be shuffled before hand.\n#' @param wd Character object defining file path where resulting modeling will be saved. Defualts to current working directory.\n#' @param number_top_models Numeric object indicating number of top models to return. 
Defualt is 10. If number entered is greater than number of model, whole model list is returned.\n#' @param eval_threshold Numeric object defining the performance threshold models must meet to be used in prediction. Is minimum for maximization loss function (i.e., AUC) and maximum for minimization loss functions (logloss, MSE, etc). Default is NULL, returns models without performance consideration.\n#' @param correlation_threshold Numeric object defining the maximum person correlation allowed in the group of resulting models. If two models show high correlation, the one with surperior performance will be kept and the other dropped. Value ranges from -1 to 1, default is NULL, returning models without correlation considered.\n#' @param return_dataframe Boolean, if TRUE startml will attempt to return a data.frame of the resulting predictions for each new data row. This will only work if the resulting predictions from new data are small enough to be stored in the R workspace. Though, when working with smaller datasets, such as some competitions, this can be very convient. The same object is stored in the H2O space and can be accessed with the name set as the ouput of startml and manipulated with functions from the h2o R package. Default is FALSE.\n#' @return Object of class mlblob using S4 type. mlblob objects contain all selected models, their predictions on train, validation, test, and new data, and can be plotted using plot() showing a summary of the model group.\n#' @export\nstartml <- function(labeled_data,\n newdata,\n y,\n x = NULL,\n label_id = NULL,\n y_type,\n algorithms = c(\"deeplearning\", \"randomForest\", \"gbm\"),\n eval_metric = \"AUTO\",\n validation_type = \"shared_holdout\", # add RandomHoldout and cv\n percent_valid_holdout = 10,\n percent_test_holdout = 10,\n runtime_secs = 10,\n split_seed = NULL,\n number_top_models = NULL,\n eval_threshold = NULL,\n correlation_threshold = 0,\n return_dataframe = FALSE,\n wd = getwd()) {\n\n if(validation_type == \"shared_holdout\" && is.null(split_seed)) {\n stop(\"Set 'split_seed' to any real number for common random sampling when validation = SharedHoldout\")\n }\n\n # This needs to be replaced ==========================================\n # only works with shared holdout.\n # need condition for other holdout.\n if(validation_type == \"shared_holdout\" | validation_type == \"random_holdout\") {\n splits <- h2o.splitFrame(labeled_data,\n c((1 - ((percent_valid_holdout/100) + (percent_test_holdout/100))),\n (percent_test_holdout/100)),\n seed = split_seed)\n train <- h2o.assign(splits[[1]], \"train.hex\")\n valid <- h2o.assign(splits[[2]], \"valid.hex\")\n test <- h2o.assign(splits[[3]], \"test.hex\")\n } else if(validation_type == \"xval\") {\n splits <- h2o.splitFrame(train, 1 - (percent_test_holdout/100), seed = split_seed)\n train <- h2o.assign(splits[[1]], \"train.hex\")\n test <- h2o.assign(splits[[2]], \"test.hex\")\n } else {\n stop(\"Choose 'shared_holdout', 'random_holdout', or 'xval' for validation_type\")\n }\n\n # define x as all others if not specified\n if(is.null(x)) {\n x <- setdiff(names(labeled_data), y)\n }\n\n if(!is.null(label_id)) {\n if(sum(x %in% label_id) > 0) {\n x <- x[-which(x == label_id)]\n } else {\n x <- x\n }\n }\n\n #===============================================\n # commented out for now, works when startml\n # is stand alone functin, does not as\n # part of package. 
Potential solves: something\n # to do with methods package or versions of\n # dependencies\n # set variable type for proper auto options\n #if(y_type == \"discrete\") {\n # train[,y] <- as.factor(train[,y])\n # valid[,y] <- as.factor(valid[,y])\n # test[,y] <- as.factor(test[,y])\n #} else {\n # train[,y] <- as.numeric(train[,y])\n # valid[,y] <- as.numeric(valid[,y])\n # test[,y] <- as.numeric(test[,y])\n #}\n #===============================================\n\n #==============================================\n # other fix is considering removing y_type argument\n # in favor of doing this during data prep outside\n # startml.\n #==============================================\n\n all_models <- autotrain(train = train,\n valid = valid,\n y = y,\n x = x,\n algorithms = algorithms,\n eval_metric = eval_metric,\n runtime_secs = runtime_secs,\n wd = wd)\n # ===================================================================\n\n if(!is.null(number_top_models)) {\n cat(\"\\nChoosing Top Performing Models on Validation\")\n sorted_models <- sort_models(all_models,\n eval_metric = eval_metric)\n selected_models <- top_models(sorted_models,\n all_models,\n number_top_models = number_top_models)\n } else {\n cat(\"\\nChoosing Models on Test based on Performance and Correlation Thresholds\\n\")\n selected_models <- select_models(model_list = all_models,\n test = test,\n eval_metric = eval_metric,\n eval_threshold = eval_threshold,\n y = y,\n correlation_threshold = correlation_threshold)\n }\n cat(\"\\nSaving Train Predictions with Selected Models\\n\")\n train_predictions <- predict_blob(test = train, selected_models)\n cat(\"\\nSaving Valid Predictions with Selected Models\\n\")\n valid_predictions <- predict_blob(test = valid, selected_models)\n cat(\"\\nSaving Test Predictions with Selected Models\\n\")\n test_predictions <- predict_blob(test = test, selected_models)\n cat(\"\\nPredicting on New Data with Selected Models\\n\")\n newdata_predictions <- predict_blob(test = new_data, selected_models)\n\n # needs work.\n # make the index dataframe, trivially all 1s for shared holout\n index = data.frame(model_num = seq(1, length(selected_models)),\n train_id = rep(1, length(selected_models)),\n valid_id = rep(1, length(selected_models)),\n test_id = rep(1, length(selected_models)))\n\n # =================================================\n if(return_dataframe == FALSE) {\n # build the output object of new class mlblob\n mlout <- new(\"mlblob\",\n models = selected_models,\n labeled_data = list(labeled_data),\n train = list(train),\n valid = list(valid),\n test = list(test),\n new_data = list(new_data),\n predict_train = train_predictions,\n predict_valid = valid_predictions,\n predict_test = test_predictions,\n predict_newdata = newdata_predictions,\n index = index,\n y = y,\n x = x,\n output = data.frame(mlblob.output = \"No R object Returned, set return_dataframe to TRUE\"))\n } else {\n warning(\"Returning R object in currently in the works\")\n }\n mlout\n}\n", - "created" : 1496019390344.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "1093495522", - "id" : "5866E996", - "lastKnownWriteTime" : 1496021425, - "last_content_update" : 1496021425314, - "path" : "C:/Users/Andy/Desktop/auto/startml/R/startml.R", - "project_path" : null, - "properties" : { - }, - "relative_order" : 26, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/6A5373C7 
b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/6A5373C7 deleted file mode 100644 index 1f74adf..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/6A5373C7 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' plot_dlayer\n\n#' Visualize hidden layer activations in a deep learning model with dimentionality reduction and variable labels\n#' @param model H2O model object containing labeled data for model training.\n#' No Default.\n#' @param vis_data H2O frame object containing data to caculate layer activations.\n#' No Default.\n#' @param layer Numeric object of length 1 identifying the which hidden layer to visualize.\n#' No Defalut.\n#' @param label Character object of length 1 identifying the column name of the variable in vis_data to label visulaizaion points with. No Default.\n#' @param dimentions Numeric object set to 2 or 3, 2 returns ggplot figure, 3 returns plotly html page\n#' @param max_points Numeric object setting maximum number of observations in visualization. A number of rows equal to max_points from vis_data are sampled with out replacement for the visulaization.\n#' @param tsne_iter Numberic object sets the number of iterations in t-tailed stochastic nearest neighbors dimentionality reduction operation. Defaults to 1000.\n#' @param wd Character object defining file path where html interactive graphic will be saved if dimentions = 3. Defaults to current working directory.\n#' @return None\n#' @export\nplot_dlayer <- function(model,\n vis_data,\n layer,\n label,\n dimentions = 2,\n max_points = 1000,\n tsne_iter = 1000,\n wd = getwd()) {\n samp <- sample(x = 1:nrow(vis_data), size = max_points)\n samp <- samp[order(samp)]\n view_label <- vis_data[samp, label]\n names(view_label) <- label\n dat <- vis_data[samp,]\n cat(\"Getting hidden layer values from DL model\\n\")\n dlayer <- as.data.frame(h2o.deepfeatures(model, dat, layer = layer))\n if(dimentions == 2) {\n cat(\"Starting tSNE dimentionality reduction\")\n dat_tsne <- tsne(X = dlayer,\n k = dimentions,\n initial_dims = dim(dlayer)[2],\n max_iter = tsne_iter)\n vis_label <- as.data.frame(view_label)[,1]\n dat_plot <- data.frame(dat_tsne, vis_label)\n names(dat_plot) <- c(\"dl.hl.1\", \"dl.hl.2\", \"vis_label\")\n p <- ggplot(dat_plot) +\n geom_point(aes(x = dl.hl.1, y = dl.hl.2, color = vis_label),\n alpha = 0.7) +\n xlab(\"tSNE Dim 1\") +\n ylab(\"tSNE Dim 2\") +\n ggtitle(\"tSNE Dimentions of DL model Hidden Layer\") +\n guides(color = guide_legend(title = label)) +\n theme_classic(base_size = 12)\n p\n } else if(dimentions == 3) {\n cat(\"Starting tSNE dimentionality reduction\")\n dat_tsne <- tsne(X = dlayer,\n k = dimentions,\n initial_dims = dim(dlayer)[2],\n max_iter = tsne_iter)\n vis_label <- as.data.frame(view_label)[,1]\n\n dat_plot <- data.frame(dat_tsne, vis_label)\n names(dat_plot) <- c(\"dl.hl.1\", \"dl.hl.2\", \"dl.hl.3\", \"vis_label\")\n p <- plot_ly(dat_plot, x = ~dl.hl.1, y = ~dl.hl.2, z = ~dl.hl.3, color = ~vis_label) %>%\n add_markers() %>%\n layout(scene = list(xaxis = list(title = \"tSNE Dim 1\"),\n yaxis = list(title = \"tSNE Dim 2\"),\n zaxis = list(title = \"tSNE Dim 3\")))\n cat(paste(\"3D plot is saved as html page:\\n\", wd, \"/\", \"dl-hidden-layer-plot.html\\n\",\n \"open it there with your browser\", sep = \"\"))\n saveWidget(as_widget(p), \"dl-hidden-layer-plot.html\")\n } else {\n stop(\"Dimentions must be set to 2 or 3\")\n }\n}\n", - "created" : 1495998469065.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "565504198", - "id" : "6A5373C7", - 
"lastKnownWriteTime" : 1496000621, - "last_content_update" : 1496000621930, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/plot-dlayer.R", - "project_path" : "R/plot-dlayer.R", - "properties" : { - }, - "relative_order" : 6, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/80BD2784 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/80BD2784 deleted file mode 100644 index 476b4f6..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/80BD2784 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' select_models\n#'\n# Gets validation metrics from a list of h2o models.\n#'\n#' @param model_list List object of H2O frames containing h2o model objects. No Default.\n#' @param eval_metric Character object one of logloss, MSE, RMSE, MAE, AUC, or mean_per_class_error.\n#' @return List object same length as model_list containing performance of each model on validation data with selected metric.\n#' @export\nvalid_metric <- function(model_list, eval_metric) {\n if(eval_metric == \"logloss\") {\n metric <- lapply(model_list, h2o.logloss, valid = TRUE)\n } else if(eval_metric == \"MSE\") {\n metric <- lapply(model_list, h2o.mse, valid = TRUE)\n } else if(eval_metric == \"RMSE\") {\n metric <- lapply(model_list, h2o.rmse, valid = TRUE)\n } else if(eval_metric == \"MAE\") {\n metric <- lapply(model_list, h2o.mae, valid = TRUE)\n } else if(eval_metric == \"AUC\") {\n metric <- lapply(model_list, h2o.auc, valid = TRUE)\n } else if(eval_metric == \"mean_per_class_error\") {\n metric <- lapply(model_list, h2o.mean_per_class_error, valid = TRUE)\n }else if(eval_metric == \"RMSLE\") {\n metric <- lapply(model_list, h2o.rmsle, valid = TRUE)\n } else {\n stop(\"Choose an eval metric: logloss, MSE, RMSE, MAE, AUC, mean_per_class_error\")\n }\n metric\n}\n", - "created" : 1496013007664.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "271883752", - "id" : "80BD2784", - "lastKnownWriteTime" : 1496013350, - "last_content_update" : 1496013350737, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/valid-metric.R", - "project_path" : "R/valid-metric.R", - "properties" : { - }, - "relative_order" : 22, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/843B53AD b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/843B53AD deleted file mode 100644 index b647c49..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/843B53AD +++ /dev/null @@ -1,21 +0,0 @@ -{ - "collab_server" : "", - "contents" : "", - "created" : 1496019580679.000, - "dirty" : false, - "encoding" : "", - "folds" : "", - "hash" : "0", - "id" : "843B53AD", - "lastKnownWriteTime" : 0, - "last_content_update" : 1496019580679, - "path" : null, - "project_path" : null, - "properties" : { - "context" : ".GlobalEnv:::startml" - }, - "relative_order" : 27, - "source_on_save" : false, - "source_window" : "", - "type" : "r_code_browser" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/8A952F6E b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/8A952F6E deleted file mode 100644 index 67b6a45..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/8A952F6E +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' load_models\n#'\n#' Loads saved text file h2o models into R workspace and h2o instance\n#' @param path Character object of file path to folder containing one or more saved h2o model files.\n#' @return 
List object of h2o model objects\n#' @export\nload_models <- function(path) {\n all_model_files <- list.files(path, full.names = TRUE)\n all_models <- sapply(all_model_files, function(m) h2o.loadModel(m))\n all_models\n}\n", - "created" : 1496009461684.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "2267106341", - "id" : "8A952F6E", - "lastKnownWriteTime" : 1496009669, - "last_content_update" : 1496009669319, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/load-models.R", - "project_path" : "R/load-models.R", - "properties" : { - }, - "relative_order" : 15, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/910ADB2B b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/910ADB2B deleted file mode 100644 index adafe14..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/910ADB2B +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' get_model_id\n#'\n#' Internal function used in plotting mlblob objects. Gets model id from h2o model object\n#' @param x h2o model. No Default.\n#' @return Character object Model id.\n#' @export\nget_model_id <- function(x) {\n model_id <- x@model_id\n model_id\n}\n", - "created" : 1496009253857.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "3506055092", - "id" : "910ADB2B", - "lastKnownWriteTime" : 1496009310, - "last_content_update" : 1496009310775, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/get-model-id.R", - "project_path" : "R/get-model-id.R", - "properties" : { - }, - "relative_order" : 13, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/98EBD2C8 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/98EBD2C8 deleted file mode 100644 index 07ec900..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/98EBD2C8 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' mlblob\n#' S4 object designed to keep track of all models and data\n# needs work\n#' @export\nsetClass(\"mlblob\", slots = c(models = \"list\",\n labeled_data = \"list\",\n train = \"list\",\n valid = \"list\",\n test = \"list\",\n new_data = \"list\",\n predict_train = \"list\",\n predict_valid = \"list\",\n predict_test = \"list\",\n predict_newdata = \"list\",\n index = \"data.frame\",\n y = \"character\",\n x = \"character\",\n output = \"data.frame\"))\n", - "created" : 1496009713858.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "2159080436", - "id" : "98EBD2C8", - "lastKnownWriteTime" : 1496010008, - "last_content_update" : 1496010008695, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/mlclass.R", - "project_path" : "R/mlclass.R", - "properties" : { - }, - "relative_order" : 16, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9C72EE29 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9C72EE29 deleted file mode 100644 index 05f4a71..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9C72EE29 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' get_hist\n#'\n#' Internal function used in plotting mlblob objects. Retries scoring history from validation given h2o model\n#'\n#' @param x model from mlblob. 
No Default.\n#' @return Validation history of the model defined by input\n#' @export\nget_hist <- function(x) {\n x@model$scoring_history$validation_rmse\n}\n", - "created" : 1496009123640.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "914124838", - "id" : "9C72EE29", - "lastKnownWriteTime" : 1496009129, - "last_content_update" : 1496009129477, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/get-hist.R", - "project_path" : "R/get-hist.R", - "properties" : { - }, - "relative_order" : 12, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9DC15287 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9DC15287 deleted file mode 100644 index 29b7116..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/9DC15287 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "# Generated by roxygen2: do not edit by hand\n\nexport(autotrain)\nexport(dl_autogrid)\nexport(even_lengths)\nexport(gbm_autogrid)\nexport(get_frame)\nexport(get_hist)\nexport(get_ids)\nexport(get_model_id)\nexport(hist_text)\nexport(load_models)\nexport(paste_nas)\nexport(plot)\nexport(plot_dlayer)\nexport(predict_blob)\nexport(rf_autogrid)\nexport(select_models)\nexport(sort_models)\nexport(startml)\nexport(test_metric_h2o)\nexport(top_models)\nexport(valid_metric)\nexportClasses(mlblob)\n", - "created" : 1495997856957.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "2529648773", - "id" : "9DC15287", - "lastKnownWriteTime" : 1496015915, - "last_content_update" : 1496015915, - "path" : "C:/Users/Andy/Desktop/r-package/startml/NAMESPACE", - "project_path" : "NAMESPACE", - "properties" : { - }, - "relative_order" : 3, - "source_on_save" : false, - "source_window" : "", - "type" : "r_namespace" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A2728057 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A2728057 deleted file mode 100644 index 167264f..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A2728057 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "Package: startml\nType: Package\nTitle: Get started on training optimization and ensembles with popular machine learning algorithms powered by h2o\nVersion: 0.1.0\nAuthor: Andrew R Sommerlot \nMaintainer: Andrew R Sommerlot \nDescription: Get started on training optimization and ensembles with popular machine learning algorithms, powered by h2o. \n Why startml? Anyone who really wants to build machine learning models and apply them in a valuable way to real-word problems \n can with the help of open source technology. What startml does is speed up the learning-through-doing \n process for new comers. 
How this happens is by providing tools for automatically building many models, \n helping you put some miles on your CPU doing machine learning, and hopefully, giving you a little \n boost to some early success, inspiring more effort towards building a valuable knowledge and skill set.\nDepends: R (>= 3.1.0)\nImports: \n h2o,\n ggplot2,\n reshape2,\n plyr,\n gridExtra,\n plotly,\n tsne, \n methods, \n htmlwidgets\nLicense: GPL 3\nEncoding: UTF-8\nLazyData: true\nRoxygenNote: 6.0.1\n", - "created" : 1495995433294.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "1001683836", - "id" : "A2728057", - "lastKnownWriteTime" : 1495998432, - "last_content_update" : 1495998432690, - "path" : "C:/Users/Andy/Desktop/r-package/startml/DESCRIPTION", - "project_path" : "DESCRIPTION", - "properties" : { - }, - "relative_order" : 2, - "source_on_save" : false, - "source_window" : "", - "type" : "dcf" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A37A2442 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A37A2442 deleted file mode 100644 index 4384f3f..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/A37A2442 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' rf_autogrid\n#'\n#' rf_autogrid is a wrapper employing built-in settings to run grid search hyper parameter optimizations on the random forest algorithm.\n#'\n#' @param train H2O frame object containing labeled data for model training.\n#' No Default.\n#' @param valid H2O frame object containing labeled data for model validation.\n#' No Default.\n#' @param y Character object of length 1 identifying the column name of the target variable. No Default.\n#' @param x Character object of length 1 or more identifying the column name(s) of the input variables. No Default.\n#' @param folds Character object defining number of folds for xval. Default is NULL and currently is not implemented.\n#' @param rf_runtime_secs Numeric object defining total number of seconds the hyper parameter grid search will run.\n#' @param rf_stopping_rounds Numeric object defining maximum number of training rounds an individual deep learning model not improving will continue to run. Default is 10.\n#' @param rf_stopping_tolerance Numeric object which sets the mimmum loss funciton improvement for a training iteration to be considered an improvement. Defulat is 1E-5.\n#' @param rf_min_depth Numeric object which sets the minimum tree depth for all random forest models. Defaut is 1.\n#' @param rf_max_depth Numeric object which sets the maximum tree depth for all random forest models. Defaut is 7.\n#' @param grid_strategy Character object default and only current supported option is \"randomDiscrete\"\n#' @param eval_metric Character object defining evaluation metric for training. Defualt is \"AUTO\" and uses built-in H2O automatic choice for target data type.\n#' @param wd Character object defining file path where dl_models folder will be created and deep learning models saved. Defaults to current working directory.\n#' @return List object containing H2O model objects. 
Additionally saves h2o models as re-loadable text files in wd/rf_models folder.\n#' @export\nrf_autogrid <- function(train,\n valid,\n y,\n x,\n eval_metric = \"AUTO\",\n wd = getwd(),\n folds = NULL,\n rf_min_depth = 1,\n rf_max_depth = 7,\n rf_runtime_secs = 20,\n rf_stopping_rounds = 10,\n rf_stopping_tolerance = 1e-5,\n grid_strategy = \"RandomDiscrete\") {\n\n cat(\"Training Random Forest Models\\n\")\n### here for grid\n rf_parameter_search <- list(max_depth = seq(rf_min_depth, rf_max_depth, 1),\n sample_rate = c(0.2, 0.4, 0.5, 0.7, 0.9),\n col_sample_rate_per_tree = c(0.2, 0.4, 0.5, 0.9, 1),\n col_sample_rate_change_per_level = c(0.2, 0.4, 0.5, 0.9, 1),\n min_rows = 2^seq(0,log2(nrow(train))-1,1),\n nbins = 2^seq(4,10,1),\n nbins_cats = 2^seq(4,12,1),\n min_split_improvement = c(0,1e-8,1e-6,1e-4),\n histogram_type = c(\"UniformAdaptive\",\"QuantilesGlobal\",\"RoundRobin\"))\n\n rf_search_criteria <- list(strategy = grid_strategy,\n max_runtime_secs = rf_runtime_secs,\n stopping_rounds = rf_stopping_rounds,\n stopping_tolerance = rf_stopping_tolerance,\n stopping_metric = eval_metric,\n seed = 1234)\n\n # needs be removed first for iterating within same session\n h2o.rm(\"rf\")\n rf_random_grid <- h2o.grid(hyper_params = rf_parameter_search,\n search_criteria = rf_search_criteria,\n algorithm = \"randomForest\",\n grid_id = \"rf\",\n x = x,\n y = y,\n training_frame = train,\n validation_frame = valid,\n ntrees = 4000, # must be changable\n seed = 1234) # must change\n\n #================================================\n #rf_grid <- h2o.getGrid(\"rf\")\n\n # write out the models to disk\n rf_path <- paste(wd, \"/rf_models\", sep = \"\")\n rf_model_files <- sapply(rf_random_grid@model_ids, function(m) h2o.saveModel(h2o.getModel(m), path = rf_path, force = TRUE))\n\n # print out, needs work\n cat(paste(\"Random Forest Models Saved To:\\n\", rf_path, \"\\n\\n\"))\n rf_random_grid\n}\n", - "created" : 1496010462465.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "1117883088", - "id" : "A37A2442", - "lastKnownWriteTime" : 1496011085, - "last_content_update" : 1496011085538, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/rf-autogrid.R", - "project_path" : "R/rf-autogrid.R", - "properties" : { - }, - "relative_order" : 18, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AA01AE68 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AA01AE68 deleted file mode 100644 index fd38802..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AA01AE68 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' top_models\n#'\n#' Selects best n models by performance threshold\n#'\n#' @param sorted_models Data frame object indexing all models sorted by decreasing performance. No Default.\n#' @param model_list List object of H2O frames containing h2o model objects.\n#' No Default.\n#' @param number_top_models Numeric object indicating number of top models to return. Defualt is 10. 
If number entered is greater than number of model, whole model list is returned.\n#' @return List object same length as number_top_models containing top performing h2o models from model_list.\n#' @export\ntop_models <- function(sorted_models, model_list, number_top_models = 10) {\n all_top_models <- model_list[sorted_models$model_list_num[1:number_top_models]]\n all_top_models\n}\n", - "created" : 1496012698199.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "2314605201", - "id" : "AA01AE68", - "lastKnownWriteTime" : 1496013170, - "last_content_update" : 1496013170509, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/top-models.R", - "project_path" : "R/top-models.R", - "properties" : { - }, - "relative_order" : 21, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AF6B5159 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AF6B5159 deleted file mode 100644 index df5304f..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/AF6B5159 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' get_frame\n#'\n#' get_frame Returns an r data frame having an id and prediction column from a model\n#'\n#' @param mlout startml mlblob object from startml function. No Default.\n#' @param new_data_id Character object of length 1 identifying the name of id varialbe. No default.\n#' @param output_models Character object of varying length identifying which model names to get prections from. No default.\n#' @param csv Boolean, whether or not a csv file named \"new_data_predictions_\n#' @param wd Character object defining file path where dl_models folder will be created and deep learning models saved. Defaults to current working directory.\n#' @return R data.frame object with an id column and a single column for each model's predcitions.\n#' @export\nget_frame <- function(mlout, new_data_id = id, output_models, csv = TRUE, wd) {\n h2o_list <- mlout@predict_newdata\n r_list <- lapply(h2o_list, FUN = as.data.frame)\n preds_all_models <- do.call(cbind, r_list)\n name_list <- unlist(lapply(mlout@models, FUN = get_model_id))\n names(preds_all_models) <- name_list\n new_data_id <- as.data.frame(test$Id)[,1]\n new_data_predictions <- data.frame(id = new_data_id,\n preds_all_models)\n id_column_name <- \"Id\" # change this to id argument\n prediction_column_name <- y\n print_out <- new_data_predictions[,c(1,which(names(new_data_predictions) %in% output_models))]\n names(print_out) <- c(id_column_name, prediction_column_name)\n if(csv == TRUE) {\n write.csv(print_out, paste(wd, \"/new_data_predictions_\", output_model, \".csv\", sep = \"\"))\n }\n print_out\n}\n", - "created" : 1496000810517.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "2800689058", - "id" : "AF6B5159", - "lastKnownWriteTime" : 1496003458, - "last_content_update" : 1496003458651, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/get-frame.R", - "project_path" : "R/get-frame.R", - "properties" : { - }, - "relative_order" : 10, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/B5A6170B b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/B5A6170B deleted file mode 100644 index e27bc0a..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/B5A6170B +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' get_ids\n#'\n#' Internal function used in plotting mlblob objects. 
Gets list of model ids from mlblob object.\n#'\n#' @param x model from mlblob. No Default.\n#' @return Validation history of the model defined by input\n#' @export\nget_ids <- function(x) {\n x@model$id\n}\n", - "created" : 1496009119392.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "3403187478", - "id" : "B5A6170B", - "lastKnownWriteTime" : 1496009192, - "last_content_update" : 1496009192050, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/get-ids.R", - "project_path" : "R/get-ids.R", - "properties" : { - }, - "relative_order" : 11, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/C2483659 b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/C2483659 deleted file mode 100644 index 9599c70..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/C2483659 +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' paste-nas\n#'\n#' Internal function used in plotting mlblob objects. Handles nas.\n#'\n#' @param x Character or Numeric object No Defalut.\n#' @param longest Length of character or numeric object to extend x to with nas. No Default.\n#' @return Validation history of the model defined by input\n#' @export\npaste_nas <- function(x, longest) {\n x_na <- c(x, rep(NA, longest - length(x)))\n x_na\n}\n", - "created" : 1496010040579.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "3928911382", - "id" : "C2483659", - "lastKnownWriteTime" : 1496010153, - "last_content_update" : 1496010153921, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/paste-nas.R", - "project_path" : "R/paste-nas.R", - "properties" : { - }, - "relative_order" : 17, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/D29C1B6D b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/D29C1B6D deleted file mode 100644 index 7393b86..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/D29C1B6D +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' gbm_autogrid\n#'\n#' gbm_autogrid is a wrapper employing built-in settings to run grid search hyper parameter optimizations on gradient boosted machine algorithm.\n#'\n#' @param train H2O frame object containing labeled data for model training.\n#' No Default.\n#' @param valid H2O frame object containing labeled data for model validation.\n#' No Default.\n#' @param y Character object of length 1 identifying the column name of the target variable. No Default.\n#' @param x Character object of length 1 or more identifying the column name(s) of the input variables. No Default.\n#' @param folds Character object defining number of folds for xval. Default is NULL and currently is not implemented.\n#' @param gbm_runtime_secs Numeric object defining total number of seconds the hyper parameter grid search will run.\n#' @param gbm_stopping_rounds Numeric object defining maximum number of training rounds an individual deep learning model not improving will continue to run. Default is 10.\n#' @param gbm_stopping_tolerance Numeric object which sets the mimmum loss funciton improvement for a training iteration to be considered an improvement. Defulat is 1E-5.\n#' @param gbm_min_depth Numeric object which sets the mimmum loss funciton improvement for a training iteration to be considered an improvement. Defulat is 1E-5.\n#' @param gbm_max_depth Numeric object which sets the maximum tree depth for all gbm models. 
Defulat is 7.\n#' @param grid_strategy Character object default and only current supported option is \"randomDiscrete\"\n#' @param eval_metric Character object defining evaluation metric for training. Defualt is \"AUTO\" and uses built-in H2O automatic choice for target data type.\n#' @param wd Character object defining file path where dl_models folder will be created and deep learning models saved. Defaults to current working directory.\n#' @return List object containing H2O model objects. Additionally saves h2o models as re-loadable text files in wd/gbm_models folder.\n#' @export\ngbm_autogrid <- function(train,\n valid,\n y,\n x,\n eval_metric = \"AUTO\",\n wd = getwd(),\n folds = NULL,\n gbm_min_depth = 1,\n gbm_max_depth = 7,\n gbm_runtime_secs = 10,\n gbm_stopping_rounds = 10,\n gbm_stopping_tolerance = 1e-5,\n grid_strategy = \"RandomDiscrete\") {\n\n cat(\"Training Gradient Boosting Models\\n\")\n #============================================================\n # needs to be reviewed for smart values and changable...\n # score_tree_interval = c(2, 5, 10),\n gbm_parameter_search = list(\n max_depth = seq(gbm_min_depth, gbm_max_depth, 1),\n sample_rate = seq(0.2, 1, 0.01),\n col_sample_rate = seq(0.2,0.9,1),\n col_sample_rate_per_tree = seq(0.2, 0.9, 1),\n col_sample_rate_change_per_level = seq(0.9,1.1,0.01),\n min_rows = 2^seq(0,log2(nrow(train))-1,1),\n nbins = 2^seq(4,10,1),\n nbins_cats = 2^seq(4,12,1),\n min_split_improvement = c(0,1e-8,1e-6,1e-4),\n learn_rate = c(0.1, 0.01, 0.001),\n learn_rate_annealing = c(0.1, 0.5, .99), ## check this for reasonableness\n histogram_type = c(\"UniformAdaptive\",\"QuantilesGlobal\",\"RoundRobin\")\n )\n\n gbm_search_criteria = list(\n strategy = grid_strategy,\n max_runtime_secs = gbm_runtime_secs,\n stopping_rounds = gbm_stopping_rounds,\n stopping_tolerance = gbm_stopping_tolerance,\n stopping_metric = eval_metric,\n seed = 1234 # needs to be changable\n )\n\n # needs be removed first for iterating within same session\n h2o.rm(\"gbm\")\n gbm_random_grid <- h2o.grid(algorithm = \"gbm\",\n grid_id = \"gbm\", # this causes failure on repreat runs, but automatic names give huge model ids\n x = x,\n y = y,\n training_frame = train,\n validation_frame = valid,\n ntrees = 4000, # has to be adjustable\n hyper_params = gbm_parameter_search,\n search_criteria = gbm_search_criteria,\n seed = 1234)\n #====================================\n #gbm_grid <- h2o.getGrid(\"gbm\") # already returns grid\n\n # write out the models to disk\n gbm_path <- paste(wd, \"/gbm_models\", sep = \"\")\n gbm_model_files <- sapply(gbm_random_grid@model_ids, function(m) h2o.saveModel(h2o.getModel(m), path = gbm_path, force = TRUE))\n\n # print out\n cat(paste(\"gbm Models Saved To:\\n\", gbm_path, \"\\n\\n\"))\n gbm_random_grid\n}\n", - "created" : 1496000715907.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "2954302158", - "id" : "D29C1B6D", - "lastKnownWriteTime" : 1496011120, - "last_content_update" : 1496011120828, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/gbm-autogrid.R", - "project_path" : "R/gbm-autogrid.R", - "properties" : { - }, - "relative_order" : 9, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EF25A0ED b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EF25A0ED deleted file mode 100644 index f76c3c7..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EF25A0ED +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : 
"#' sort_models\n#'\n#' Returns a index data frame giving ranked placement best models in model_list. Internal function used for model selection.\n#'\n#' @param model_list List object of H2O frames containing h2o model objects.\n#' @param eval_metric Character object one of logloss, MSE, RMSE, MAE, AUC, or mean_per_class_error.\n#' No Default.\n#' @return Data frame indexing validation model performance of models in model_list.\n#' @export\nsort_models <- function(model_list, eval_metric) {\n metrics <- valid_metric(model_list, eval_metric = eval_metric)\n ranking <- data.frame(mod = seq(1, length(model_list), 1), metric = unlist(metrics))\n colnames(ranking) <- c(\"model_list_num\", eval_metric)\n row.names(ranking) <- NULL\n if(eval_metric == \"AUC\") {\n sorted <- ranking[order(ranking[,2], decreasing = TRUE),]\n } else {\n sorted <- ranking[order(ranking[,2], decreasing = FALSE),]\n }\n sorted\n}\n", - "created" : 1496013499728.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "1463382917", - "id" : "EF25A0ED", - "lastKnownWriteTime" : 1496013716, - "last_content_update" : 1496013716443, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/sort-models.R", - "project_path" : "R/sort-models.R", - "properties" : { - }, - "relative_order" : 24, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EFB1B61A b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EFB1B61A deleted file mode 100644 index 4854b13..0000000 --- a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/EFB1B61A +++ /dev/null @@ -1,20 +0,0 @@ -{ - "collab_server" : "", - "contents" : "#' select_models\n#'\n#' select_models identifies best models in mlblob object by performance and pearson correlation thresholds.\n#' @param model_list List object of H2O model objects to be subbsetted by performance and correlation thresholds\n#' No Default.\n#' @param test H2O frame object containing labeled data for model evaluation.\n#' No Default.\n#' @param y Character object of length 1 identifying the column name of the target variable. No Default.\n#' @param eval_metric Character object one of logloss, MSE, RMSE, MAE, AUC, or mean_per_class_error.\n#' @param eval_threshold Numeric object defining the performance threshold models must meet to be used in prediction. Is minimum for maximization loss function (i.e., AUC) and maximum for minimization loss functions (logloss, MSE, etc). Default is NULL, returns models without performance consideration.\n#' @param correlation_threshold Numeric object defining the maximum person correlation allowed in the group of resulting models. If two models show high correlation, the one with surperior performance will be kept and the other dropped. 
Value ranges from -1 to 1, default is NULL, returning models without correlation considered.\n#' @return List object containing H2O model objects adhearing to threshold standards set in input arguments.\n#' @export\nselect_models <- function(model_list,\n test,\n eval_metric,\n y,\n eval_threshold = NULL,\n correlation_threshold = NULL) {\n if(eval_metric == \"AUC\") {\n eval_fun <- function(a, b) {\n a >= b\n }\n } else {\n eval_fun <- function(a, b) {\n a <= b\n }\n }\n if(is.null(correlation_threshold)) {\n low_cor_models <- model_list\n } else {\n prediction_list <- predict_blob(test, model_list)\n if(eval_metric == \"AUC\" | eval_metric == \"logloss\") {\n predictions_subset <- lapply(prediction_list, function(x)x[,3])\n predictions <- h2o.cbind(predictions_subset)\n } else {\n predictions <- h2o.cbind(prediction_list)\n }\n correlations <- h2o.cor(predictions)\n names(correlations) <- seq(1:length(model_list))\n correlations[!lower.tri(correlations)] <- 0\n low_cor_models <- model_list[as.numeric(colnames(correlations[,!apply(correlations,2,\n function(x) any(x > correlation_threshold))]))]\n }\n if(length(low_cor_models) == 0){\n min_message <- min(correlations[correlations != 0])\n warning(paste(\"No models selected, minimum correlation available is\",\n min_message, \"\\nReturning models unconstrained by correlation\\n\"))\n low_cor_models <- model_list\n } else {\n if(is.null(eval_threshold)) {\n low_cor_models\n } else {\n if(!exists(\"prediction_list\")) {\n prediction_list <- predict_blob(test, model_list)\n }\n metrics <- unlist(test_metric(prediction_list, test = test, y = y, eval_metric = eval_metric))\n keep_models <- low_cor_models[eval_fun(metrics, eval_threshold)]\n if(length(keep_models) == 0){\n warning(\"eval_threshold too optimistic, returning models unconstrained by performance\")\n low_cor_models\n } else {\n keep_models\n }\n }\n }\n}\n", - "created" : 1496011037194.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "1182953486", - "id" : "EFB1B61A", - "lastKnownWriteTime" : 1496012626, - "last_content_update" : 1496012626885, - "path" : "C:/Users/Andy/Desktop/r-package/startml/R/select-models.R", - "project_path" : "R/select-models.R", - "properties" : { - }, - "relative_order" : 19, - "source_on_save" : false, - "source_window" : "", - "type" : "r_source" -} \ No newline at end of file diff --git a/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/lock_file b/.Rproj.user/8F0F1B72/sdb/s-2D52ACD6/lock_file deleted file mode 100644 index e69de29..0000000 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/DESCRIPTION b/DESCRIPTION index 18219f9..4857a30 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,6 +11,7 @@ Description: Get started on training optimization and ensembles with popular mac helping you put some miles on your CPU doing machine learning, and hopefully, giving you a little boost to some early success, inspiring more effort towards building a valuable knowledge and skill set. 
Depends: R (>= 3.1.0), + Metrics, ggplot2, reshape2, plyr, diff --git a/NAMESPACE b/NAMESPACE index e69ef9b..f4a59a3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,7 +18,7 @@ export(rf_autogrid) export(select_models) export(sort_models) export(startml) -export(test_metric_h2o) +export(test_metric) export(top_models) export(valid_metric) exportClasses(mlblob) diff --git a/R/test-metric.R b/R/test-metric.R index 3be571b..c42b562 100644 --- a/R/test-metric.R +++ b/R/test-metric.R @@ -1,4 +1,4 @@ -#' select_models +#' test_metric #' #' Caculate performance metrics from models on new data. Depends on Metrics package. #' @param prediction_list List object of H2O frames containing predictions. @@ -10,7 +10,7 @@ #' @return List object same length as prediction_list containing performance of each model on test input with selected metric. #' @export # get test holdout metrics from models # depends on Metrics package for now -test_metric_h2o <- function(prediction_list, test, eval_metric, y) { +test_metric <- function(prediction_list, test, eval_metric, y) { if(eval_metric == "AUC" | eval_metric == "logloss") { predictions <- lapply(prediction_list, function(x)x[,3]) } else { diff --git a/man/startml.Rd b/man/startml.Rd index f92c35b..2d6d7bb 100644 --- a/man/startml.Rd +++ b/man/startml.Rd @@ -56,17 +56,6 @@ Object of class mlblob using S4 type. mlblob objects contain all selected models Slots are: models, a list of h2o model objects labeled_data an h2o frame object equivalent to the input label_data input object. -train = A list of h2o frame objects contianing the train component for each model from the labeled data split. -valid = A list of h2o frame objects contianing the validation component for each model from the labeled data split if cross validation is not used. -test = A list of h2o frame objects contianing the test component for each model from the labeled data split. -new_data = an h2o frame object equivalent to the input new_data input object. -predict_train = A list of h2o frame objects contianing all model predictions on the trainng data. -predict_valid = A list of h2o frame objects contianing all model predictions on the validation data. -predict_test = A list of h2o frame objects contianing all model predictions on the test data. -predict_newdata = A list of h2o frame objects contianing all model predictions on the new, unlabeled data. -index = A data.frame object containing summary information of the mlblob object. -y = A character object containing the name of the target variable column in labeled data. -x = A character object containing the names of all input varialbes to be used in model building. Both labeled data and new data must each contain all of the column names specificed in x. Default is NULL and uses all variables except y and any column name specified in label_id. Do not train models with an ID column as an input. Either remove it, or specifiy it in label_id. 
} \description{ startml is designed to run automatic hyperparameter searches for deep leaning diff --git a/man/test_metric_h2o.Rd b/man/test_metric.Rd similarity index 85% rename from man/test_metric_h2o.Rd rename to man/test_metric.Rd index 1efe23d..164537b 100644 --- a/man/test_metric_h2o.Rd +++ b/man/test_metric.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/test-metric.R -\name{test_metric_h2o} -\alias{test_metric_h2o} -\title{select_models} +\name{test_metric} +\alias{test_metric} +\title{test_metric} \usage{ -test_metric_h2o(prediction_list, test, eval_metric, y) +test_metric(prediction_list, test, eval_metric, y) } \arguments{ \item{prediction_list}{List object of H2O frames containing predictions.
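
The substantive change in this patch is the rename of the exported function test_metric_h2o() to test_metric() (NAMESPACE, R/test-metric.R, and the man/ page above), together with adding Metrics to Depends, which test_metric() relies on. Below is a minimal usage sketch of the renamed API. The signature test_metric(prediction_list, test, eval_metric, y), the predict_blob() helper, and the list of eval_metric choices are taken from the diff itself; everything else (h2o.init(), my_test_df, h2o_models, the "target" column name) is an illustrative assumption, not part of this patch.

# Sketch: scoring a list of trained h2o models on a labeled test frame with the renamed function.
library(h2o)       # modeling backend used throughout startml
library(startml)   # exports predict_blob() and (after this patch) test_metric()

h2o.init()                                      # assumed: a local h2o cluster is available
test_frame <- as.h2o(my_test_df)                # assumed: labeled test data containing a "target" column
preds <- predict_blob(test_frame, h2o_models)   # h2o_models: assumed list of trained h2o model objects
rmse_by_model <- test_metric(prediction_list = preds,
                             test        = test_frame,
                             eval_metric = "RMSE",    # one of logloss, MSE, RMSE, MAE, AUC, mean_per_class_error
                             y           = "target")  # name of the label column in test_frame

# Callers of the old name only need the rename:
#   test_metric_h2o(preds, test_frame, "RMSE", "target")  # before this patch
#   test_metric(preds, test_frame, "RMSE", "target")      # after this patch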