diff --git a/config/config_default.ini b/config/config_default.ini
index da8d0e1..9002d56 100644
--- a/config/config_default.ini
+++ b/config/config_default.ini
@@ -63,12 +63,12 @@ DEPHASING_T[1] =
 DEPHASING_T[2] =
 ; Apply gradient in T/m for each axis. Gradients are active for one DWELL_TIME
 GRADIENT_XYZ[0] = 0.0 0.0 0.0
-GRADIENT_XYZ[1] = 0.0 0.0 0.0
-GRADIENT_XYZ[2] = 0.0 0.0 0.0
+GRADIENT_XYZ[1] = 
+GRADIENT_XYZ[2] = 
 ; Time (sec) to apply gradient.
 GRADIENT_T[0] = 0.0
-GRADIENT_T[1] = 0.0
-GRADIENT_T[2] = 0.0
+GRADIENT_T[1] =
+GRADIENT_T[2] = 
 
 
 DWELL_TIME  = 50e-6
diff --git a/src/file_utils.h b/src/file_utils.h
index e80ba33..26305cd 100644
--- a/src/file_utils.h
+++ b/src/file_utils.h
@@ -110,6 +110,9 @@ bool read_config(std::string config_filename, simulation_parameters& param, std:
             for (uint16_t i = 0; i<param.n_fieldmaps && ini.get("files").has(it->first + "[" + std::to_string(i) + "]"); i++)
             {
                 it->second[i] = ini.get("files").get(it->first + "[" + std::to_string(i) + "]");
+                if (std::all_of(it->second[i].begin(), it->second[i].end(), isspace))
+                    break; // if it is empty, break the loop
+                    
                 std::filesystem::path f(it->second[i]);   
                 if (f.is_relative())
                 {   // if it is relative, make it absolute
@@ -131,7 +134,6 @@ bool read_config(std::string config_filename, simulation_parameters& param, std:
             }    
     }
 
-
     // ============== reading section SCAN_PARAMETERS ==============
     if(ini.has("SCAN_PARAMETERS"))
     {
@@ -192,16 +194,22 @@ bool read_config(std::string config_filename, simulation_parameters& param, std:
             std::cout << ERR_MSG << "RF Times must be in ascending order, starts with 0 and must not have duplicates values" << std::endl;
             return false;
         }
-
+        
         // ---------------- dephasing (start times, Flip angles ) ----------------
         // Dephase start times
-        for(i=0; i<MAX_RF && ini.get("SCAN_PARAMETERS").has("DEPHASING_T[" + std::to_string(i) + "]"); i++)       
-            param.dephasing_T[i] = std::stof(ini.get("SCAN_PARAMETERS").get("DEPHASING_T[" + std::to_string(i) + "]")) / param.dt;    
+        for(i=0; i<MAX_RF && ini.get("SCAN_PARAMETERS").has("DEPHASING_T[" + std::to_string(i) + "]"); i++)  
+        {   // here we use try-catch since this field can be with empty value in the config file
+            try{param.dephasing_T[i] = std::stof(ini.get("SCAN_PARAMETERS").get("DEPHASING_T[" + std::to_string(i) + "]")) / param.dt;} 
+            catch(const std::exception& e){break;}
+        }
  
         param.n_dephasing = i;   
         // Dephase flip angles
         for(i=0; i<param.n_dephasing && ini.get("SCAN_PARAMETERS").has("DEPHASING[" + std::to_string(i) + "]"); i++)
-            param.dephasing[i] = std::stof(ini.get("SCAN_PARAMETERS").get("DEPHASING[" + std::to_string(i) + "]")) ;
+        {   
+            try{param.dephasing[i] = std::stof(ini.get("SCAN_PARAMETERS").get("DEPHASING[" + std::to_string(i) + "]")) ;}
+            catch(const std::exception& e){break;}
+        }
         
         if(i != param.n_dephasing)
         {
@@ -217,16 +225,26 @@ bool read_config(std::string config_filename, simulation_parameters& param, std:
             std::cout << ERR_MSG << "dephasing Times must be in ascending order and must not have duplicates values" << std::endl;
             return false;
         } 
-            
+  
         // ---------------- Gradients (start times, strength (T/m) ) ----------------
         // Gradient start times
-        for(i=0; i<MAX_GRADIENT && ini.get("SCAN_PARAMETERS").has("GRADIENT_T[" + std::to_string(i) + "]"); i++)        
-            param.gradient_T[i] = std::stof(ini.get("SCAN_PARAMETERS").get("GRADIENT_T[" + std::to_string(i) + "]")) / param.dt;    
+        for(i=0; i<MAX_GRADIENT && ini.get("SCAN_PARAMETERS").has("GRADIENT_T[" + std::to_string(i) + "]"); i++)  
+        {   // here we use try-catch since this field can be with empty value in the config file   
+            try{param.gradient_T[i] = std::stof(ini.get("SCAN_PARAMETERS").get("GRADIENT_T[" + std::to_string(i) + "]")) / param.dt; }
+            catch(const std::exception& e){break;}  
+        }
 
         param.n_gradient = i;
         // Gradient strength
         for(i=0; i<param.n_gradient && ini.get("SCAN_PARAMETERS").has("GRADIENT_XYZ[" + std::to_string(i) + "]"); i++)
-            param.gradient_xyz[i] = std::stof(ini.get("SCAN_PARAMETERS").get("GRADIENT_XYZ[" + std::to_string(i) + "]")) ;
+        {
+            try{
+                std::istringstream iss(ini.get("SCAN_PARAMETERS").get("GRADIENT_XYZ[" + std::to_string(i) + "]"));
+                int ind = i*3;
+                while (iss >> *(param.gradient_xyz+ind++) && ind < (i+1)*3);
+            }
+            catch(const std::exception& e){break;}
+        }
                
         if(i != param.n_gradient)
         {
@@ -234,7 +252,7 @@ bool read_config(std::string config_filename, simulation_parameters& param, std:
             return false;
         } 
 
-        // check Dephase start time conditions
+        // check Gradient start time conditions
         if (std::is_sorted(param.gradient_T, param.gradient_T + i) == false || 
             std::adjacent_find(param.gradient_T, param.gradient_T + i) != param.gradient_T + i )
         {
diff --git a/src/kernels.cu b/src/kernels.cu
index a77acbf..2008a04 100644
--- a/src/kernels.cu
+++ b/src/kernels.cu
@@ -14,16 +14,14 @@
 #include "rotation.cuh"
 #include "helper_cuda.h"
 
-#define GAMMA  267515315. // rad/s.T
-
 
 //---------------------------------------------------------------------------------------------
 //  
 //---------------------------------------------------------------------------------------------
 
-__global__ void cu_sim(const simulation_parameters *param, const float *pFieldMap, const bool *pMask, const float *M0, const float *XYZ0, float *M1, float *XYZ1, uint32_t spin_no)
+__global__ void cu_sim(const simulation_parameters *param, const float *pFieldMap, const bool *pMask, const float *M0, const float *XYZ0, float *M1, float *XYZ1)
 {
-    spin_no = blockIdx.x * blockDim.x + threadIdx.x ;
+    uint32_t spin_no = blockIdx.x * blockDim.x + threadIdx.x ;
     if (spin_no >= param->n_spins)
         return;
     
@@ -69,8 +67,8 @@ __global__ void cu_sim(const simulation_parameters *param, const float *pFieldMa
         // ------ loop over timepoints ------
         uint64_t ind=0, ind_old=param->matrix_length+1;
         uint32_t current_timepoint = 0, old_timepoint = 0;
-        uint16_t current_rf = 1, current_te = 0, counter_dephasing = 0;
-        float accumulated_phase = 0.f, dephase_deg = 0.f;        
+        uint16_t current_rf = 1, current_te = 0, counter_dephasing = 0, counter_gradient = 0;
+        float accumulated_phase = 0.f;        
         while (current_timepoint < param->n_timepoints) // param->n_timepoints is the total number of timepoints (= TR/dwelltime)
         {
             // ------ generate random walks and wrap around the boundries ------
@@ -100,18 +98,24 @@ __global__ void cu_sim(const simulation_parameters *param, const float *pFieldMa
             // ------ apply dephasing if there is any ------
             if(counter_dephasing < param->n_dephasing && param->dephasing_T[counter_dephasing] == current_timepoint)
             {
-                float dephase_deg  =  (float)spin_no * param->dephasing[counter_dephasing] / (float)param->n_spins; // assign dephasing linearly to spins 
-                accumulated_phase +=  dephase_deg / (param->B0 * GAMMA * param->dt * RAD2DEG); // scale dephasing to Tesla per dt
+                accumulated_phase += (float)spin_no * param->dephasing[counter_dephasing] / (float)param->n_spins; // assign dephasing linearly to spins 
                 counter_dephasing++;
             }
+
+            // ------ apply gradient if there is any ------
+            if(counter_gradient < param->n_gradient && param->gradient_T[counter_gradient] == current_timepoint)
+            {
+                const float *Gxyz = param->gradient_xyz + 3*counter_gradient;
+                accumulated_phase +=  (Gxyz[0]*xyz_new[0] + Gxyz[1]*xyz_new[1] + Gxyz[2]*xyz_new[2]) * param->dt*GAMMA*RAD2DEG; //  Gx * x + Gy * y + Gz * z
+                counter_gradient++;
+            }
                  
 
             // ------ apply other RF pulse if there is any ------
             if(current_rf < param->n_RF && param->RF_ST[current_rf] == current_timepoint)
             {
                 // dephase                
-                dephase_deg =  accumulated_phase * param->B0 * GAMMA * param->dt * RAD2DEG; // convert accumulated phase to degree
-                zrot(dephase_deg, m0, m1); 
+                zrot(accumulated_phase, m0, m1); 
                 // relax
                 time_elapsed = (current_timepoint - old_timepoint) * param->dt;
                 relax(exp(-time_elapsed/param->T1), exp(-time_elapsed/param->T2), m1);
@@ -126,8 +130,7 @@ __global__ void cu_sim(const simulation_parameters *param, const float *pFieldMa
             if (is_lastscan && current_te < param->n_TE && param->TE[current_te] == current_timepoint)
             {
                 // dephase                  
-                dephase_deg =  accumulated_phase * param->B0 * GAMMA * param->dt * RAD2DEG; // convert accumulated phase to degree  
-                zrot(dephase_deg, m0, m1);
+                zrot(accumulated_phase, m0, m1);
                 // relax                    
                 time_elapsed = (current_timepoint - old_timepoint) * param->dt;
                 relax(exp(-time_elapsed/param->T1), exp(-time_elapsed/param->T2), m1);
@@ -146,8 +149,7 @@ __global__ void cu_sim(const simulation_parameters *param, const float *pFieldMa
             current_timepoint++;            
         }
         // dephase
-        dephase_deg =  accumulated_phase * param->B0 * GAMMA * param->dt * RAD2DEG; // convert accumulated phase to degree     
-        zrot(dephase_deg, m0, m1);
+        zrot(accumulated_phase, m0, m1);
         // relax
         time_elapsed = (current_timepoint - old_timepoint) * param->dt;
         relax(exp(-time_elapsed/param->T1), exp(-time_elapsed/param->T2), m1);
@@ -166,18 +168,27 @@ __global__ void cu_sim(const simulation_parameters *param, const float *pFieldMa
 //  
 //---------------------------------------------------------------------------------------------
 
-__global__ void cu_scalePos(float *scaled_xyz, float *initial_xyz, float scale, uint32_t size)
+__global__ void cu_scalePos(float *scaled_xyz, float *initial_xyz, float scale, uint64_t size)
 {
-    int n = blockIdx.x * blockDim.x + threadIdx.x ;
+    uint64_t n = blockIdx.x * blockDim.x + threadIdx.x ;
     if(n < size)
     {
-        uint32_t ind = 3*n;
+        uint64_t ind = 3*n;
         scaled_xyz[ind+0] = initial_xyz[ind+0] * scale;
         scaled_xyz[ind+1] = initial_xyz[ind+1] * scale;
         scaled_xyz[ind+2] = initial_xyz[ind+2] * scale;
     }
 }
 
+//---------------------------------------------------------------------------------------------
+// CUDA kernel to perform array multiplication with a constant
+//---------------------------------------------------------------------------------------------
+__global__ void cu_scaleArray(float *array, float scale, uint64_t size)
+{
+    uint64_t n = blockIdx.x * blockDim.x + threadIdx.x ;
+    if(n < size)
+        array[n] *= scale;
+}
 
 //---------------------------------------------------------------------------------------------
 // generate random initial position
diff --git a/src/kernels.cuh b/src/kernels.cuh
index 044242e..9b96d4d 100644
--- a/src/kernels.cuh
+++ b/src/kernels.cuh
@@ -16,10 +16,13 @@
 
 #define GAMMA  267515315. // rad/s.T
 
-__global__ void cu_sim(const simulation_parameters *param, const float *pFieldMap, const bool *pMask, const float *M0, const float *XYZ0, float *M1, float *XYZ1, uint32_t spin_no = 0);
+__global__ void cu_sim(const simulation_parameters *param, const float *pFieldMap, const bool *pMask, const float *M0, const float *XYZ0, float *M1, float *XYZ1);
 
 // scale position to mimic the different volume size
-__global__ void cu_scalePos(float *scaled_xyz, float *initial_xyz, float scale, uint32_t size);
+__global__ void cu_scalePos(float *scaled_xyz, float *initial_xyz, float scale, uint64_t size);
+
+// CUDA kernel to perform array multiplication with a constant
+__global__ void cu_scaleArray(float *array, float scale, uint64_t size);
 
 // generate random initial position
 __global__ void cu_randPosGen(float *spin_position_xyz, simulation_parameters *param, bool *pMask, uint32_t spin_no = 0);
@@ -46,7 +49,7 @@ uint32_t is_masked(std::vector<T> &XYZ0, std::vector<char> &mask, simulation_par
     {
         uint64_t index = sub2ind(ROUND(XYZ0[3*i] * scale2grid[0]), ROUND(XYZ0[3*i+1] * scale2grid[1]), ROUND(XYZ0[3*i+2] * scale2grid[2]), param->fieldmap_size[0], param->fieldmap_size[1]);
         if (index >= mask.size())
-            std::cout << ERR_MSG << "Index out of range: " << index << " >= " << mask.size() << std::endl;
+            std::cout << ERR_MSG << "For "<<i<< "th element, index is out of range: " << index << " >= " << mask.size() << std::endl;
         mask_counter[i] = mask[index];
     }
     return std::accumulate(mask_counter.begin(), mask_counter.end(), 0);
diff --git a/src/miscellaneous.h b/src/miscellaneous.h
index c824a36..de2765b 100644
--- a/src/miscellaneous.h
+++ b/src/miscellaneous.h
@@ -77,8 +77,8 @@ typedef struct simulation_parameters
         std::cout<<"RF time = "; for(int i=0; i<n_RF; i++) std::cout<<RF_ST[i]*dt<<' '; std::cout<<'\n';
         std::cout<<"dephasing degree = "; for(int i=0; i<n_dephasing; i++) std::cout<<dephasing[i]<<' '; std::cout<<'\n';
         std::cout<<"dephasing time = "; for(int i=0; i<n_dephasing; i++) std::cout<<dephasing_T[i]*dt<<' '; std::cout<<'\n';
-        std::cout<<"gradient T/m = "; for(int i=0; i<3*n_gradient; i++) std::cout<<gradient_xyz[i]<<' '; std::cout<<'\n';
-        std::cout<<"gradient time = "; for(int i=0; i<n_gradient; i++) std::cout<<gradient_T[i]<<' '; std::cout<<'\n';
+        std::cout<<"gradient (x,y,z) T/m =\n"; for(int i=0; i<n_gradient; i++) std::cout<<gradient_xyz[3*i+0]<<' '<<gradient_xyz[3*i+1]<<' '<<gradient_xyz[3*i+2]<<'\n';
+        std::cout<<"gradient time = "; for(int i=0; i<n_gradient; i++) std::cout<<gradient_T[i]*dt<<' '; std::cout<<'\n';
         std::cout<<"sample length = "<< sample_length[0] << " x " << sample_length[1] << " x " << sample_length[2] << " m" << '\n';
         std::cout<<"scale2grid = "<< scale2grid[0] << " x " << scale2grid[1] << " x " << scale2grid[2] << '\n';
         std::cout<<"fieldmap size = "<< fieldmap_size[0] << " x " << fieldmap_size[1] << " x " << fieldmap_size[2] << '\n';
diff --git a/src/spinwalk.cu b/src/spinwalk.cu
index 2a3439e..254685d 100644
--- a/src/spinwalk.cu
+++ b/src/spinwalk.cu
@@ -21,8 +21,8 @@
 #define THREADS_PER_BLOCK  64
 
 #define SPINWALK_VERSION_MAJOR 1
-#define SPINWALK_VERSION_MINOR 2
-#define SPINWALK_VERSION_PATCH 5
+#define SPINWALK_VERSION_MINOR 3
+#define SPINWALK_VERSION_PATCH 0
 
 using namespace std;
 
@@ -109,7 +109,7 @@ bool simulate(simulation_parameters param, std::map<std::string, std::vector<std
 
             checkCudaErrors(cudaSetDevice(d));            
             checkCudaErrors(cudaStreamCreate(&streams[d]));
-
+            // allocate memory on GPU
             checkCudaErrors(cudaMalloc((void**)&d_param[d],         sizeof(simulation_parameters)));
             checkCudaErrors(cudaMalloc((void**)&d_pFieldMap[d],     sizeof(fieldmap[0]) * fieldmap.size()));   
             checkCudaErrors(cudaMalloc((void**)&d_pMask[d],         sizeof(mask[0]) * mask.size())); 
@@ -118,12 +118,14 @@ bool simulate(simulation_parameters param, std::map<std::string, std::vector<std
             checkCudaErrors(cudaMalloc((void**)&d_XYZ1[d],          sizeof(float) * 3 * param.n_spins));
             checkCudaErrors(cudaMalloc((void**)&d_M0[d],            sizeof(float) * 3 * param.n_spins));
             checkCudaErrors(cudaMalloc((void**)&d_M1[d],            sizeof(float) * 3 * param.n_TE * param.n_spins));
-            
+            // copy data to GPU
             checkCudaErrors(cudaMemcpyAsync(d_pFieldMap[d], fieldmap.data(),        fieldmap.size()*sizeof(fieldmap[0]), cudaMemcpyHostToDevice, streams[d]));
             checkCudaErrors(cudaMemcpyAsync(d_pMask[d],     mask.data(),            mask.size() * sizeof(mask[0]),       cudaMemcpyHostToDevice, streams[d]));
             checkCudaErrors(cudaMemcpyAsync(d_param[d],     &param_local,           sizeof(simulation_parameters),       cudaMemcpyHostToDevice, streams[d]));
             checkCudaErrors(cudaMemcpyAsync(d_M0[d],        &M0[3*param.n_spins*d], 3*param.n_spins*sizeof(M0[0]),       cudaMemcpyHostToDevice, streams[d]));
-            
+
+            // convert fieldmap from Tesla to degree per dwell time
+            cu_scaleArray<<<uint64_t(fieldmap.size()/THREADS_PER_BLOCK)+1, THREADS_PER_BLOCK, 0, streams[d]>>>(d_pFieldMap[d], param.B0*GAMMA*param.dt*RAD2DEG, fieldmap.size());
             if(hasXYZ0 == false)
             {   // generate initial spatial position for spins, based on sample_length_ref
                 printf("GPU %d) Generating random initial position for spins... ", d);
@@ -132,7 +134,8 @@ bool simulate(simulation_parameters param, std::map<std::string, std::vector<std
                 printf("Done!\n");
             }
             else // copy initial spatial position and magnetization for spins
-                checkCudaErrors(cudaMemcpyAsync(d_XYZ0[d], &XYZ0[3*param.n_spins*d], 3*param.n_spins*sizeof(XYZ0[0]), cudaMemcpyHostToDevice, streams[d]));      
+                checkCudaErrors(cudaMemcpyAsync(d_XYZ0[d], &XYZ0[3*param.n_spins*d], 3*param.n_spins*sizeof(XYZ0[0]), cudaMemcpyHostToDevice, streams[d]));  
+            checkCudaErrors(cudaStreamSynchronize(streams[d]));    
         }
 
         // ========== run ==========        
@@ -208,7 +211,7 @@ bool simulate(simulation_parameters param, std::map<std::string, std::vector<std
         std::cout << std::string(50, '=') << std::endl;
     }
     return true;
-}
+} 
 
 
 int main(int argc, char * argv[])
@@ -216,13 +219,15 @@ int main(int argc, char * argv[])
     std::cout << "SpinWalk ver. " << SPINWALK_VERSION_MAJOR << "." << SPINWALK_VERSION_MINOR << "." << SPINWALK_VERSION_PATCH << std::endl;
     // ========== parse command line arguments ==========
     std::vector<std::string> config_files;    
-    bool bVerbose = false, bHelp = false;
+    bool bVerbose = false, bHelp = false, bNoSim = false;
     for(uint8_t i=1; i<argc; i++)
     {
         if (strcmp(argv[i], "-v") == 0)
             bVerbose = true;
         else if (strcmp(argv[i], "-h") == 0)
             bHelp = true;
+        else if (strcmp(argv[i], "-n") == 0)
+            bNoSim = true;
         else
             config_files.push_back(argv[i]);
     }
@@ -233,9 +238,10 @@ int main(int argc, char * argv[])
         std::cout << "Usage: " << argv[0] << " -options <config_file1> <config_file2> ... <config_filen>" << std::endl;
         std::cout << "Options:" << std::endl;
         std::cout << "  -v: verbose" << std::endl;  
+        std::cout << "  -n: only read config" << std::endl;  
         std::cout << "  -h: help (this menu)" << std::endl;      
         print_device_info();
-        return 1;
+        return 1;  
     }
 
     std::cout << "Running simulation for " << config_files.size() << " config(s)..." << std::endl;
@@ -286,6 +292,8 @@ int main(int argc, char * argv[])
             std::cout<< std::string(50, '=')  << std::endl;
         }
 
+        if (bNoSim)
+            continue;
         if(simulate(param, filenames, sample_length_scales) == false)
             return 1;
     }