Update gapfill algorithm

- Code was slightly modified to make sure that the initial gap-fill pFBA solution is actually a feasible solution. - In case infeasible solutions are returned, the LP is solved again with lowered bound violation tolerance (refers to #232). Some further changes: - During gapfilling, the reported numbers of added reactions now does not count added exchange reactions - more conistent checking of a returned FBA solution is actually feasible
jotech · Aug 30, 2024 · 568dfa8 · 568dfa8
1 parent 354989c
commit 568dfa8
Show file tree

Hide file tree

Showing 9 changed files with 95 additions and 65 deletions.
diff --git a/gapseq_env.yml b/gapseq_env.yml
@@ -27,7 +27,7 @@ dependencies:
   - r-stringi
   - r-getopt
   - r-r.utils
-  - r-cobrar
+  - r-cobrar >=0.1.1
   - r-biocmanager
   - bioconductor-biostrings
   - r-jsonlite

diff --git a/src/adapt.R b/src/adapt.R
@@ -60,10 +60,9 @@ suppressMessages(library(tools))
 
 # select solver
 if( "cobrarCPLEX" %in% rownames(installed.packages()) ){
-  COBRAR_SETTINGS("SOLVER","cplex"); ok <- 1
+  COBRAR_SETTINGS("SOLVER","cplex"); stat <- c(1,2)
 }else{
-  warning("glpk is used as LP solver but cplex is recommended because it is much faster. If you have IBM's ILOG cplex installed, you can add cplex-support to gapseq by installing the R-package 'cobarCPLEX' (https://github.com/Waschina/cobrarCPLEX).")
-  COBRAR_SETTINGS("SOLVER","glpk"); ok <- 0
+  COBRAR_SETTINGS("SOLVER","glpk"); stat <- c(2,5)
 }
 
 # Setting defaults if required

diff --git a/src/gapfill4.R b/src/gapfill4.R
@@ -1,6 +1,8 @@
 gapfill4 <- function(mod.orig, mod.full, rxn.weights, min.gr = 0.01, bcore = 50,
                      dummy.weight = 100, script.dir, core.only = FALSE, verbose=verbose, gs.origin = NA, rXg.tab, env = "") {
 
+  presRxnWeight <- 0.00001 # Reaction weight for pFBA for reactions already present in model or exchange/demand reactions
+
   # backup model
   mod.orig.bak <- mod.orig
 
@@ -63,14 +65,14 @@ gapfill4 <- function(mod.orig, mod.full, rxn.weights, min.gr = 0.01, bcore = 50,
                 rxn.weights = rxn.weights,
                 growth.rate = 0))
   }
-  if(sol@ok != ok){
-    warning("FBA on original model did not end successfully! Zero solution is feasibile! But gapfilling will start anyway.")
+  if(sol@stat %notin% stat){
+    warning("FBA on original model did not end successfully – solution is infeasibile. Gapfilling will start anyway.")
   }
 
   sol <- fba(mod)
   gr.dummy <- sol@obj
-  if(sol@ok!=ok){
-    warning(paste0("Full model is already not able to yield a feasible solution There's no way to successful gap-filling."))
+  if(sol@stat %notin% stat){
+    warning(paste0("Full model is already not able to yield a feasible solution. There's no way to successful gap-filling."))
     return(list(model = mod.orig.bak,
                 rxns.added = c(),
                 rxn.weights = rxn.weights,
@@ -79,33 +81,65 @@ gapfill4 <- function(mod.orig, mod.full, rxn.weights, min.gr = 0.01, bcore = 50,
 
   c.coef.dt <- data.table(id = gsub("_.0$","",mod@react_id))
   c.coef.dt <- merge(c.coef.dt, mseed[,.(id, weight, core.rxn)], by.x="id", by.y="id", all.x = T, sort = F)
-  c.coef.dt[is.na(weight), weight := 0.00001]
-  c.coef.dt[id %in% pres.rxns, weight := 0.00001]
+  c.coef.dt[is.na(weight), weight := presRxnWeight]
+  c.coef.dt[id %in% pres.rxns, weight := presRxnWeight]
+  c.coef.dt[, candRxn := id %notin% pres.rxns & !grepl("^EX|^DM|^bio1$",id)] # potential new data base reactions for the model
 
   if(gs.origin==1) {
-    sol.tmp   <- 0
-    max.iter  <- 100
+    max.iter  <- 15
     n.iter    <- 1
     pFBAcoeff <- 1e-3
+    tol_bnd   <- COBRAR_SETTINGS("TOLERANCE"); min_tol_bnd <- 1e-9
+    gf.success <- FALSE
     mod@lowbnd[which(mod@obj_coef == 1)] <- min.gr # enforce minimum required growth rate
-    while(sol.tmp <= 0 & n.iter <= max.iter) {
+    while(!gf.success) {
+      # max iterations reached
+      if(n.iter > max.iter) {
+        stop("Initial gap-filling using all reactions was not successful although all available reactions were considered. Please check your gap-filling medium or consider reducing the minimum required growth rate.")
+      }
+
+      # otherwise, let's try
+      COBRAR_SETTINGS("TOLERANCE",tol_bnd)
       sol.fba <- pfbaHeuristic(mod, costcoeffw = c.coef.dt$weight,
                                pFBAcoeff = pFBAcoeff)
-
+      
       n.iter  <- n.iter + 1
-      if(sol.fba@ok==ok)
-        sol.tmp <- sol.fba@obj
-      if(sol.fba@obj <= 0.001)
-        sol.tmp <- 0
-      if(sol.tmp <= 0)
+      flxbm <- sol.fba@fluxes[which(mod@obj_coef == 1)]
+
+      if(sol.fba@stat %in% stat && flxbm > tol_bnd) {
+        # pFBA says its feasible, but let's check if solution is actually valid
+        mod.reduced <- rmReact(mod, react = mod@react_id[which(sol.fba@fluxes == 0)])
+        sol.reduced <- fba(mod.reduced)
+        flxbm.reduced <- sol.reduced@fluxes[which(mod.reduced@obj_coef == 1)]
+        if(sol.reduced@stat %notin% stat && tol_bnd > min_tol_bnd) {
+          tol_bnd <- max(tol_bnd / 2, min_tol_bnd)
+          cat("No feasible gap-fill solution found due to numeric issues. Adjusting bound tolerance to",tol_bnd,"\n")
+          next
+        } else if(sol.reduced@stat %notin% stat && tol_bnd <= min_tol_bnd) {
+          pFBAcoeff <- pFBAcoeff / 2
+          cat("No feasible gap-fill solution found due to numeric issues. Relaxing pFBA coefficient 'c' to",pFBAcoeff,"\n")
+          next
+        } else {
+          gf.success <- TRUE
+          next
+        }
+      } else if (sol.fba@stat %in% stat && flxbm <= tol_bnd) {
+        # pFBA says its feasible, but growth rate is zero. Let's try with relaxed pFBA
         pFBAcoeff <- pFBAcoeff / 2
+        cat("pFBA feasible but with growth rate of 0. Relaxing pFBA coefficient 'c' to",pFBAcoeff,"\n")
+        next
+      } 
+
+      # pFBA says its infeasible. Let's try with relaxed pFBA.
+      pFBAcoeff <- pFBAcoeff / 2
+      cat("No feasible gap-fill solution found. Relaxing pFBA coefficient 'c' to",pFBAcoeff,"\n")
     }
   } else {
     sol.fba <- pfbaHeuristic(mod, costcoeffw = c.coef.dt$weight,
                              pFBAcoeff = 1e-3)
   }
 
-  if(sol.fba@ok!=ok){
+  if(sol.fba@stat %notin% stat){
     warning("pFBA did not end successfully.")
     return(list(model = mod.orig.bak,
                 rxns.added = c(),
@@ -114,18 +148,17 @@ gapfill4 <- function(mod.orig, mod.full, rxn.weights, min.gr = 0.01, bcore = 50,
   }
 
   # Retrieve list of utilized dummy reactions and add them to the original/draft model
-  # TODO: Make this differently: How to find out which rxns are candidate reactions
-  ko.dt <- data.table(dummy.rxn = c.coef.dt[weight>0.00001,id],
-                      d.rxn.ind = which(c.coef.dt$weight>0.00001),
-                      dummy.weight = c.coef.dt[weight>0.00001,weight],
-                      flux = sol.fba@fluxes[which(c.coef.dt$weight>0.00001)])
+  ko.dt <- data.table(dummy.rxn = c.coef.dt[candRxn == TRUE,id],
+                      d.rxn.ind = which(c.coef.dt$candRxn == TRUE),
+                      dummy.weight = c.coef.dt[candRxn == TRUE,weight],
+                      flux = sol.fba@fluxes[which(c.coef.dt$candRxn == TRUE)])
 
 
   ko.dt <- ko.dt[abs(flux) > 0]
   ko.dt[, core := gsub("_.0","",dummy.rxn) %in% mseed[core.rxn==T,id]]
   cat("Utilized candidate reactions: ",nrow(ko.dt))
 
-  if( nrow(ko.dt) == 0){ # no dummy reactions is needed
+  if(nrow(ko.dt) == 0){ # no dummy reactions are needed
     warning("No dummy reactions utilized in full model. Nothing to add.")
     return(list(model = mod.orig.bak,
                 rxns.added = c(),
@@ -195,8 +228,8 @@ gapfill4 <- function(mod.orig, mod.full, rxn.weights, min.gr = 0.01, bcore = 50,
   mod.orig <- add_missing_exchanges(mod.orig)
 
   sol <- fba(mod.orig)
-  if(sol@ok!=ok | sol@obj < 1e-7){
-    warning(paste0("Final model ", mod.orig@mod_name, " cannot produce enough target even when all candidate reactions are added! obj=", sol@obj, " ok=",sol@ok))
+  if(sol@stat %notin% stat | sol@obj < 1e-7){
+    warning(paste0("Final model ", mod.orig@mod_name, " cannot produce enough target even when all candidate reactions are added! obj=", sol@obj, " stat=",sol@stat))
     return(list(model = mod.orig.bak,
                 rxns.added = c(),
                 rxn.weights = rxn.weights,
@@ -246,7 +279,7 @@ gapfill4 <- function(mod.orig, mod.full, rxn.weights, min.gr = 0.01, bcore = 50,
 
   rxns.added <- gsub("_.0","",ko.dt[keep==T | core==T,dummy.rxn])
   sol <- fba(mod.orig)
-  if(sol@ok!=ok){
+  if(sol@stat %notin% stat){
     stop("Final model cannot grow. Something is terribly wrong!")
     return(list(model = mod.orig.bak,
                 rxns.added = c(),

diff --git a/src/generate_GSdraft.R b/src/generate_GSdraft.R
@@ -232,9 +232,10 @@ build_draft_model_from_blast_results <- function(blast.res, transporter.res, bio
     }
   }
   if(input_mode == "nucl") {
+    ncont <- length(unique(dt_genes[rm == F, stitle]))
     cat(length(unique(dt_genes[rm == F & seed %in% mseed$id, paste(gene, sep="$")])),
-        "unique genes on",length(unique(dt_genes[rm == F, stitle])),
-        "genetic element(s)\n")
+        "unique genes on",ncont,
+        ifelse(ncont==1,"contig","contigs"),"\n")
   }
   if(input_mode == "prot") {
     cat(length(unique(dt_genes[rm == F & seed %in% mseed$id, paste(gene, sep="$")])),

diff --git a/src/gf.adapt.R b/src/gf.adapt.R
@@ -17,10 +17,9 @@ if (!is.na(Sys.getenv("RSTUDIO", unset = NA))) {
 
 # select solver
 if( "cobrarCPLEX" %in% rownames(installed.packages()) ){
-  COBRAR_SETTINGS("SOLVER","cplex"); ok <- 1
+  COBRAR_SETTINGS("SOLVER","cplex"); stat <- c(1,2)
 }else{
-  warning("glpk is used as LP solver but cplex is recommended because it is much faster. If you have IBM's ILOG cplex installed, you can add cplex-support to gapseq by installing the R-package 'cobarCPLEX' (https://github.com/Waschina/cobrarCPLEX).")
-  COBRAR_SETTINGS("SOLVER","glpk"); ok <- 0
+  COBRAR_SETTINGS("SOLVER","glpk"); stat <- c(2,5)
 }
 
 #setwd(script.dir)
@@ -86,7 +85,7 @@ check_lethal <- function(model, rxn_list, med.file=NA, med=NA){
         mod.tmp <- rmReact(mod.tmp, react=model@react_id[na.omit(rxn.idx)])
 
         sol <- fba(mod.tmp)
-        lethal <- !(sol@stat == ok & sol@obj >= 1e-7)  
+        lethal <- !(sol@stat %in% stat & sol@obj >= 1e-7)  
         lethal.dt <- rbind(lethal.dt, data.table(nr=i, rxn=paste0(model@react_id[na.omit(rxn.idx)],collapse=","), lethal=lethal))
       } else lethal.dt <- rbind(lethal.dt, data.table(nr=i, rxn="", lethal=NA))
     }else lethal.dt <- rbind(lethal.dt, data.table(nr=i, rxn="", lethal=NA))
@@ -154,9 +153,9 @@ add_growth <- function(model.orig, add.met.id=NA, weights=NA, genes=NA, verbose=
   mod.adapt    <- esp.mode(mod.adapt, media)
 
   sol <- fba(mod.adapt)
-  #cat("Check growth: ", sol@stat==ok, "\t", sol@obj, "\n")
+  #cat("Check growth: ", sol@stat %in% stat, "\t", sol@obj, "\n")
 
-  if(sol@stat == ok & sol@obj >= 1e-7){
+  if(sol@stat %in% stat & sol@obj >= 1e-7){
     warning(paste("Model is already growing with", add.met.id, add.met.name))
     if( verbose ){
       cat("solution", sol$obj, "\n")
@@ -190,7 +189,7 @@ add_growth <- function(model.orig, add.met.id=NA, weights=NA, genes=NA, verbose=
   #check_growth(mod.adapt.lst$model, medium = c(media$compounds), printExchanges = T, useNames = T, mtf = T)
 
   sol <- fba(mod.adapt)
-  #cat(sol@stat==ok, "\t", sol@obj, "\n")
+  #cat(sol@stat %in% stat, "\t", sol@obj, "\n")
   cat("\n")
 
   check <- check_lethal(mod.adapt, rxn_list = setdiff(mod.adapt@react_id, model.orig@react_id), med = media)
@@ -241,8 +240,8 @@ rm_growth <- function(model.orig, del.met.id, use.media=NA, rxn.blast.file, verb
 
   model <- esp.mode(model.orig, media)
   sol <- fba(model)
-  #cat(sol@stat==ok, "\t", sol@obj, "\n")
-  if( !(sol@stat==ok & sol@obj >= 1e-7 ) ){
+  #cat(sol@stat %in% stat, "\t", sol@obj, "\n")
+  if( !(sol@stat %in% stat & sol@obj >= 1e-7 ) ){
     warning(paste("Model already cannot grow with:", del.met.id, del.met.name))
     return(invisible(model.orig))
   }else{
@@ -325,9 +324,9 @@ check_theo_growth <- function(check.met.id, fullmod){
   mod.adapt    <- constrain.model(mod.adapt, media=media)
 
   sol <- pfbaHeuristic(mod.adapt)
-  cat("Check growth: ", sol@stat==ok, "\t", sol@obj, "\n")
+  cat("Check growth: ", sol@stat %in% stat, "\t", sol@obj, "\n")
 
-  if(sol@stat == ok & sol@obj >= 1e-7){
+  if(sol@stat %in% stat & sol@obj >= 1e-7){
     print("Growth possible")
   }else{
     print("No growth possible")
@@ -363,7 +362,7 @@ increase_growth <- function(model.orig, min.obj.val, weights=NA, genes=NA, verbo
 
   sol <- fba(model.orig)
   cat("Old growth rate:", round(sol@obj,6), "\n")
-  if(sol@stat == ok & sol@obj >= min.obj.val){
+  if(sol@stat %in% stat & sol@obj >= min.obj.val){
     warning(paste("Model already has growth rate >=",min.obj.val))
     return(invisible(model.orig))
   }else{

diff --git a/src/gf.suite.R b/src/gf.suite.R
@@ -50,10 +50,11 @@ suppressMessages(library(tools))
 
 # select solver
 if( "cobrarCPLEX" %in% rownames(installed.packages()) ){
-  COBRAR_SETTINGS("SOLVER","cplex"); ok <- 1
+  COBRAR_SETTINGS("SOLVER","cplex"); stat <- c(1,2)
 }else{
-  COBRAR_SETTINGS("SOLVER","glpk"); ok <- 0
+  COBRAR_SETTINGS("SOLVER","glpk"); stat <- c(2,5)
 }
+# COBRAR_SETTINGS("SOLVER","glpk"); stat <- c(2,5)
 cat("LP solver:",COBRAR_SETTINGS("SOLVER"),"\n")
 
 # Setting defaults if required
@@ -99,6 +100,12 @@ min.obj.val         <- opt$min.obj.val
 dummy.weight <- 100
 sbml.export  <- FALSE 
 
+# check if minimum required growth rate is valid
+if(min.obj.val < 0.001) {
+  warning("Minimum required growth rate ('-k') should not be smaller than 0.001. Resetting value to 0.001.")
+  min.obj.val <- 0.001
+}
+
 # create output directory if not already there
 dir.create(output.dir, recursive = TRUE, showWarnings = FALSE)
 if (!dir.exists(output.dir) || file.access(output.dir, mode = 2) == -1)
@@ -331,7 +338,7 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
 
     sol <- fba(mod.fill2)
 
-    if(sol@ok == ok & sol@obj >= 1e-6){
+    if(sol@stat %in% stat & sol@obj >= 1e-6){
       mod.fill2@obj_coef <- rep(0,react_num(mod.fill2))
     }else{
       if( verbose ) cat("\nTry to gapfill", bm.met.name[i],"\n")
@@ -368,11 +375,10 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
 
   cat("\rGapfill summary:\n")
   cat("Filled components:    ",mod.fill2.counter, "(",paste(mod.fill2.names, collapse = ","),")\n")
-  cat("Added reactions:      ",length(mod.fill2@react_id)-length(mod.fill1@react_id),"\n")
+  cat("Added reactions:      ",length(mod.fill2@react_id[!grepl("^EX_|^DM",mod.fill2@react_id) & mod.fill2@react_id %notin% mod.fill1@react_id]),"\n")
   cat("Final growth rate:    ",fba(mod.fill2)@obj,"\n")
 
 
-
   media2 <- fread(paste0(script.dir,"/../dat/media/MM_glu.csv")) # load minimal medium and add available carbon sources
   if( nrow(fread(media.file, header=F)[V1=="cpd00007" & V3!=0]) ){
     cat("\n\n2b. Anaerobic biomass gapfilling using core reactions only\n")
@@ -424,7 +430,7 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
 
     sol <- fba(mod.fill2)
 
-    if(sol@ok == ok & sol@obj >= 1e-6){
+    if(sol@stat %in% stat & sol@obj >= 1e-6){
       mod.fill2@obj_coef <- rep(0,react_num(mod.fill2))
     }else{
       if( verbose ) cat("\nTry to gapfill", bm.met.name[i],"\n")
@@ -461,7 +467,7 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
 
   cat("\rGapfill summary:\n")
   cat("Filled components:    ",mod.fill2.counter, "(",paste(mod.fill2.names, collapse = ","),")\n")
-  cat("Added reactions:      ",length(mod.fill2@react_id)-length(mod.orig2@react_id),"\n")
+  cat("Added reactions:      ",length(mod.fill2@react_id[!grepl("^EX_|^DM",mod.fill2@react_id) & mod.fill2@react_id %notin% mod.fill1@react_id]),"\n")
   cat("Final growth rate:    ",fba(mod.fill2)@obj,"\n")
 
 
@@ -474,7 +480,7 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
   exchanges.new.met  <- str_replace(str_remove(carbon.source$seed[idx], "EX_"),"_e0","\\[e0\\]")
   exchanges.new.name <- mod@met_name[match(exchanges.new.met,mod@met_id)]
   exchanges.new.ids  <- carbon.source$seed[idx]
-  exchanges.new.used  <- rep(FALSE, length(exchanges.new.ids))  # delete unused addionally added exchange reactions later
+  exchanges.new.used  <- rep(FALSE, length(exchanges.new.ids))  # delete unused additionally added exchange reactions later
   mod.out       <- add_exchanges(mod.out, exchanges.new.met, metname=exchanges.new.name)
 
   if ( !quick.gf ){
@@ -539,7 +545,7 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
 
       sol <- fba(mod.fill3)
 
-      if(sol@ok == ok & sol@obj >= 1e-7){
+      if(sol@stat %in% stat & sol@obj >= 1e-7){
         #mod.fill3@obj_coef <- rep(0,react_nummod.fill3))
         src.status <- TRUE
       }else{
@@ -577,7 +583,7 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
     mod.out <- mod.fill3
     cat("\rGapfill summary:\n")
     cat("Filled components:    ",mod.fill3.counter, "(",paste(mod.fill3.names, collapse = ","),")\n")
-    cat("Added reactions:      ",length(mod.fill3@react_id)-length(mod.fill2@react_id),"\n")
+    cat("Added reactions:      ",length(mod.fill3@react_id[!grepl("^EX_|^DM",mod.fill3@react_id) & mod.fill3@react_id %notin% mod.fill2@react_id]),"\n")
     cat("Final growth rate:    ",fba(mod.fill3)@obj,"\n")
   }
 
@@ -626,7 +632,7 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
 
       sol <- fba(mod.fill4)
 
-      if(sol@ok == ok & sol@obj >= 1e-7){
+      if(sol@stat %in% stat & sol@obj >= 1e-7){
         #mod.fill4@obj_coef <- rep(0,react_num(mod.fill4))
         src.status <- TRUE
       }else{
@@ -673,7 +679,7 @@ if(nrow(mseed.t)>0) { # Skip steps 2,2b,3, and 4 if core-reaction list does not
 
     cat("\rGapfill summary:\n")
     cat("Filled components:    ",mod.fill4.counter, "(",paste(mod.fill4.names, collapse = ","),")\n")
-    cat("Added reactions:      ",length(mod.fill4@react_id)-length(mod.fill3@react_id),"\n")
+    cat("Added reactions:      ",length(mod.fill4@react_id[!grepl("^EX_|^DM",mod.fill4@react_id) & mod.fill4@react_id %notin% mod.fill3@react_id]),"\n")
     cat("Final growth rate:    ",mod.fill4.sol@fluxes[which(mod.fill4@obj_coef==1)],"\n\n")
 
     cat("Uptake at limit:\n")
@@ -690,7 +696,7 @@ mod.out <- add_missing_exchanges(mod.out)
 # delete unused additionally added exchange reactions later
 exchanges.rm <- exchanges.new.ids[!exchanges.new.used]
 if( length(exchanges.rm) > 0 )
-mod.out <- rmReact(mod.out, react=exchanges.rm)
+  mod.out <- rmReact(mod.out, react=exchanges.rm)
 mod.out <- rm_unused_exchanges(mod.out)
 
 

diff --git a/src/pan-draft.R b/src/pan-draft.R
@@ -63,14 +63,6 @@ suppressMessages(library(methods))
 # Little helpers
 source(paste0(script.dir,"/pan-draft_functions.R"))
 
-# select solver
-if( "cobrarCPLEX" %in% rownames(installed.packages()) ){
-  COBRAR_SETTINGS("SOLVER","cplex"); ok <- 1
-}else{
-  warning("glpk is used as LP solver but cplex is recommended because it is much faster. If you have IBM's ILOG cplex installed, you can add cplex-support to gapseq by installing the R-package 'cobarCPLEX' (https://github.com/Waschina/cobrarCPLEX).")
-  COBRAR_SETTINGS("SOLVER","glpk"); ok <- 0
-}
-
 # Setting defaults if required
 if ( is.null(opt$output.dir) ) { opt$output.dir = "." }
 if ( is.null(opt$min.rxn.freq.in.mods) ) { opt$min.rxn.freq.in.mods = 0.06 }