From 85b6866856c72b7cb46d560611c2db2f086d1013 Mon Sep 17 00:00:00 2001
From: Andrew Alferman <alfermaa@oregonstate.edu>
Date: Wed, 2 May 2018 11:28:33 -0700
Subject: [PATCH 1/2] Added the Oregonator test problem.

---
 examples/oregonator/SConscript            |  23 ++++
 examples/oregonator/dydt.c                |  43 +++++++
 examples/oregonator/dydt.cu               |  47 ++++++++
 examples/oregonator/dydt.cuh              |  34 ++++++
 examples/oregonator/dydt.h                |  30 +++++
 examples/oregonator/gpu_macros.cuh        |  41 +++++++
 examples/oregonator/gpu_memory.cu         |  68 +++++++++++
 examples/oregonator/gpu_memory.cuh        |  66 +++++++++++
 examples/oregonator/header.cuh            |  56 +++++++++
 examples/oregonator/header.h              |  62 ++++++++++
 examples/oregonator/ics.c                 |  58 ++++++++++
 examples/oregonator/ics.cu                |  58 ++++++++++
 examples/oregonator/jacob.c               |  82 +++++++++++++
 examples/oregonator/jacob.cu              |  47 ++++++++
 examples/oregonator/jacob.cuh             |  28 +++++
 examples/oregonator/jacob.h               |  32 ++++++
 examples/oregonator/launch_bounds.cuh     |  16 +++
 examples/oregonator/oregonator.dox        | 133 ++++++++++++++++++++++
 examples/oregonator/plotter.py            |  39 +++++++
 examples/oregonator/sparse_multiplier.c   |  31 +++++
 examples/oregonator/sparse_multiplier.cu  |  31 +++++
 examples/oregonator/sparse_multiplier.cuh |  32 ++++++
 examples/oregonator/sparse_multiplier.h   |  25 ++++
 23 files changed, 1082 insertions(+)
 create mode 100644 examples/oregonator/SConscript
 create mode 100644 examples/oregonator/dydt.c
 create mode 100644 examples/oregonator/dydt.cu
 create mode 100644 examples/oregonator/dydt.cuh
 create mode 100644 examples/oregonator/dydt.h
 create mode 100644 examples/oregonator/gpu_macros.cuh
 create mode 100644 examples/oregonator/gpu_memory.cu
 create mode 100644 examples/oregonator/gpu_memory.cuh
 create mode 100644 examples/oregonator/header.cuh
 create mode 100644 examples/oregonator/header.h
 create mode 100644 examples/oregonator/ics.c
 create mode 100644 examples/oregonator/ics.cu
 create mode 100644 examples/oregonator/jacob.c
 create mode 100644 examples/oregonator/jacob.cu
 create mode 100644 examples/oregonator/jacob.cuh
 create mode 100644 examples/oregonator/jacob.h
 create mode 100644 examples/oregonator/launch_bounds.cuh
 create mode 100644 examples/oregonator/oregonator.dox
 create mode 100644 examples/oregonator/plotter.py
 create mode 100644 examples/oregonator/sparse_multiplier.c
 create mode 100644 examples/oregonator/sparse_multiplier.cu
 create mode 100644 examples/oregonator/sparse_multiplier.cuh
 create mode 100644 examples/oregonator/sparse_multiplier.h

diff --git a/examples/oregonator/SConscript b/examples/oregonator/SConscript
new file mode 100644
index 0000000..f977215
--- /dev/null
+++ b/examples/oregonator/SConscript
@@ -0,0 +1,23 @@
+Import('env')
+cObj = []
+c_src = Glob('*.c')
+import os
+
+FD = env['FINITE_DIFFERENCE']
+
+for src in c_src:
+    if FD and 'jacob' in str(src):
+        continue
+    cObj.append(env.Object(src, variant_dir=env['variant']))
+
+
+cudaObj = []
+if env['build_cuda']:
+    cuda_src = Glob('*.cu')
+
+    for src in cuda_src:
+        if FD and 'jacob' in str(src):
+            continue
+        cudaObj.append(env.CUDAObject(src, variant_dir=env['variant']))
+
+Return ('cObj', 'cudaObj')
\ No newline at end of file
diff --git a/examples/oregonator/dydt.c b/examples/oregonator/dydt.c
new file mode 100644
index 0000000..93c2fa4
--- /dev/null
+++ b/examples/oregonator/dydt.c
@@ -0,0 +1,43 @@
+/**
+ * \file
+ * \brief An implementation of the Oregonator right hand side (y' = f(y)) function.
+ *
+ * Implements the evaluation of the derivative of the state vector with respect to time, \f$\dot{\vec{y}}\f$
+ *
+ */
+
+#include "header.h"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator {
+#endif
+
+/**
+ * \brief An implementation of the RHS of the Oregonator equation
+ * \param[in]        t         The current system time
+ * \param[in]        mu        Dummy parameter, needed for compatibility only
+ * \param[in]        y         The state vector
+ * \param[out]       dy        The output RHS (dydt) vector
+ *
+ * The `y` and `dy` vectors supplied here are local versions of the global state vectors.
+ * They have been transformed from the global Column major (Fortan) ordering to a local 1-D vector
+ * Hence the vector accesses can be done in a simple manner below, i.e. y[0] -> \f$y_1\f$, y[1] -> \f$y_2\f$, etc.
+ * @see solver_generic.c
+ */
+void dydt (const double t, const double mu, const double * __restrict__ y, double * __restrict__ dy) {
+
+  double s = 77.27;
+  double q = 8.375E-6;
+  double w = 0.161;
+
+  dy[0] = s * (y[0] - y[0] * y[1] + y[1] - (q * y[0]* y[0]));
+  dy[1] = (y[2] - y[1] - y[0] * y[1]) / s;
+  dy[2] = w * (y[0] - y[2]);
+
+} // end dydt
+
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/dydt.cu b/examples/oregonator/dydt.cu
new file mode 100644
index 0000000..41838fc
--- /dev/null
+++ b/examples/oregonator/dydt.cu
@@ -0,0 +1,47 @@
+/**
+ * \file
+ * \brief A CUDA implementation of the Oregonator right hand side (y' = f(y)) function.
+ *
+ * Implements the CUDA evaluation of the derivative of the state vector with respect to time, \f$\dot{\vec{y}}\f$
+ *
+ */
+
+#include "header.cuh"
+#include "gpu_macros.cuh"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+/**
+ * \brief An implementation of the RHS of the Oregonator equation
+ * \param[in]        t         The current system time
+ * \param[in]        mu        Dummy parameter, needed for compatibility only
+ * \param[in]        y         The state vector
+ * \param[out]       dy        The output RHS (dydt) vector
+ * \param[in]        d_mem     The mechanism_memory struct.  In future versions, this will be used to access the \f$\mu\f$ parameter to have a consistent interface.
+ *
+ * The `y` and `dy` vectors supplied here are the global state vectors.
+ * Hence the vector accesses must be done with the global thread ID.
+ * The gpu_macros.cuh file defines some useful macros to simplify indexing
+ * @see gpu_macros.cuh
+ */
+ __device__
+void dydt (const double t, const double mu, const double * __restrict__ y, double * __restrict__ dy,
+           const mechanism_memory * __restrict__ d_mem) {
+
+   double s = 77.27;
+   double q = 8.375E-6;
+   double w = 0.161;
+
+   dy[INDEX(0)] = s * (y[INDEX(0)] - y[INDEX(0)] * y[INDEX(1)] + y[INDEX(1)] - (q * y[INDEX(0)]* y[INDEX(0)]));
+   dy[INDEX(1)] = (y[INDEX(2)] - y[INDEX(1)] - y[INDEX(0)] * y[INDEX(1)]) / s;
+   dy[INDEX(2)] = w * (y[INDEX(0)] - y[INDEX(2)]);
+
+} // end dydt
+
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/dydt.cuh b/examples/oregonator/dydt.cuh
new file mode 100644
index 0000000..8785d1c
--- /dev/null
+++ b/examples/oregonator/dydt.cuh
@@ -0,0 +1,34 @@
+/**
+ * \file
+ * \brief Contains header definitions for the CUDA RHS function for the Oregonator example
+ *
+ */
+
+#ifndef DYDT_CUH
+#define DYDT_CUH
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+
+
+/**
+ * \brief An implementation of the RHS of the Oregonator equation
+ * \param[in]        t         The current system time
+ * \param[in]        mu        Dummy parameter, needed for compatibility only
+ * \param[in]        y         The state vector
+ * \param[out]       dy        The output RHS (dydt) vector
+ * \param[in]        d_mem     The mechanism_memory struct.  In future versions, this will be used to access the \f$\mu\f$ parameter to have a consistent interface.
+ *
+ */
+__device__
+void dydt (const double t, const double mu, const double * __restrict__ y, double * __restrict__ dy,
+           const mechanism_memory * __restrict__ d_mem);
+
+#ifdef GENERATE_DOCS
+}
+#endif
+
+#endif
diff --git a/examples/oregonator/dydt.h b/examples/oregonator/dydt.h
new file mode 100644
index 0000000..b9eae10
--- /dev/null
+++ b/examples/oregonator/dydt.h
@@ -0,0 +1,30 @@
+/**
+ * \file
+ * \brief Contains header definitions for the RHS function for the Oregonator example
+ *
+ */
+
+#ifndef DYDT_H
+#define DYDT_H
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator {
+#endif
+
+
+
+/**
+ * \brief An implementation of the RHS of the Oregonator equation
+ * \param[in]        t         The current system time
+ * \param[in]        mu        Dummy parameter, needed for compatibility only
+ * \param[in]        y         The state vector
+ * \param[out]       dy        The output RHS (dydt) vector
+ */
+void dydt (const double t, const double mu, const double * __restrict__ y, double * __restrict__ dy);
+
+#ifdef GENERATE_DOCS
+}
+#endif
+
+#endif
diff --git a/examples/oregonator/gpu_macros.cuh b/examples/oregonator/gpu_macros.cuh
new file mode 100644
index 0000000..0bd4fac
--- /dev/null
+++ b/examples/oregonator/gpu_macros.cuh
@@ -0,0 +1,41 @@
+/**
+ * \file
+ * \brief Defines some simple macros to simplify GPU indexing
+ *
+ */
+
+#ifndef GPU_MACROS_CUH
+#define GPU_MACROS_CUH
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+//! The total number of threads in the Grid, provides an offset between vector entries
+#define GRID_DIM (blockDim.x * gridDim.x)
+//! The global CUDA thread index
+#define T_ID (threadIdx.x + blockIdx.x * blockDim.x)
+//! Convenience macro to get the value of a vector at index i, calculated as i * #GRID_DIM + #T_ID
+#define INDEX(i) (T_ID + (i) * GRID_DIM)
+
+#define cudaErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+    if (code != cudaSuccess)
+    {
+        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+        if (abort) exit(code);
+    }
+}
+
+
+#ifdef GENERATE_DOCS
+}
+#endif
+
+#endif
diff --git a/examples/oregonator/gpu_memory.cu b/examples/oregonator/gpu_memory.cu
new file mode 100644
index 0000000..7cab7d6
--- /dev/null
+++ b/examples/oregonator/gpu_memory.cu
@@ -0,0 +1,68 @@
+/*!
+ * \file
+ * \brief Initializes and calculates required GPU memory
+ */
+
+#include "gpu_memory.cuh"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+/**
+ * \brief Calculates and returns the total memory size (in bytes) required by an individual thread for the
+ *        mechanism_memory struct.
+ */
+size_t required_mechanism_size() {
+  //returns the total required size for the mechanism per thread
+  size_t mech_size = 0;
+  //state vector y
+  mech_size += NSP;
+  //dydt vector
+  mech_size += NSP;
+  //Jacobian
+  mech_size += NSP * NSP;
+  //and mu parameter
+  mech_size += 1;
+  return mech_size * sizeof(double);
+}
+/**
+ * \brief Initializes the host and device mechanism_memory structs.  This is required in order to enable
+ *        passing the struct to CUDA
+ * \param[in]               padded              The padded number of threads to be used by the CUDA solver
+ * \param[in,out]           h_mem               The host version of the mechanism_memory struct to initialize.
+ * \param[in,out]           d_mem               The device version of the mechanism_memory struct to copy the resulting host mechanism_memory struct to.
+ */
+void initialize_gpu_memory(int padded, mechanism_memory** h_mem, mechanism_memory** d_mem)
+{
+  // Allocate storage for the device struct
+  cudaErrorCheck( cudaMalloc(d_mem, sizeof(mechanism_memory)) );
+  //allocate the device arrays on the host pointer
+  cudaErrorCheck( cudaMalloc(&((*h_mem)->y), NSP * padded * sizeof(double)) );
+  cudaErrorCheck( cudaMalloc(&((*h_mem)->dy), NSP * padded * sizeof(double)) );
+  cudaErrorCheck( cudaMalloc(&((*h_mem)->var), 1 * padded * sizeof(double)) );
+  cudaErrorCheck( cudaMalloc(&((*h_mem)->jac), NSP * NSP * padded * sizeof(double)) );
+  // set non-initialized values to zero
+  cudaErrorCheck( cudaMemset((*h_mem)->dy, 0, NSP * padded * sizeof(double)) );
+  cudaErrorCheck( cudaMemset((*h_mem)->jac, 0, NSP * NSP * padded * sizeof(double)) );
+  // and copy to device pointer
+  cudaErrorCheck( cudaMemcpy(*d_mem, *h_mem, sizeof(mechanism_memory), cudaMemcpyHostToDevice) );
+}
+/**
+ * \brief Frees the host and device mechanism_memory structs
+ * \param[in,out]           h_mem               The host version of the mechanism_memory struct.
+ * \param[in,out]           d_mem               The device version of the mechanism_memory struct.
+ */
+void free_gpu_memory(mechanism_memory** h_mem, mechanism_memory** d_mem)
+{
+  cudaErrorCheck(cudaFree((*h_mem)->y));
+  cudaErrorCheck(cudaFree((*h_mem)->dy));
+  cudaErrorCheck(cudaFree((*h_mem)->var));
+  cudaErrorCheck(cudaFree((*h_mem)->jac));
+  cudaErrorCheck(cudaFree(*d_mem));
+}
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/gpu_memory.cuh b/examples/oregonator/gpu_memory.cuh
new file mode 100644
index 0000000..edac909
--- /dev/null
+++ b/examples/oregonator/gpu_memory.cuh
@@ -0,0 +1,66 @@
+/*!
+ * \file
+ * \brief Headers for GPU memory initialization
+ */
+
+<<<<<<< HEAD
+=======
+#ifndef GPU_MEM_CUH
+#define GPU_MEM_CUH
+
+>>>>>>> master
+#include "header.cuh"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+/**
+ * \brief This struct is used to store memory for the CUDA RHS and Jacobian evaluation.
+ *        Along with the solver_memory struct, this must be initialized and passed to the solvers
+ *        the run.  @see solver_memory, initialize_gpu_memory, required_mechanism_size
+ *
+ * \param           y               The global state vector arrays
+ * \param           dy              The global dydt vector arrays
+ * \param           var             The global param array [Used for \f$\mu\f$]
+ * \param           jac             The global Jacobian arrays
+ *
+ * This is also heavily used when using with @pyJac to hold additional arrays
+ * for evaluating chemical kinetic reactions.
+ */
+struct mechanism_memory {
+  double * y;
+  double * dy;
+  double * var;
+  double * jac;
+};
+
+/**
+ * \brief Calculates and returns the total memory size (in bytes) required by an individual thread for the
+ *        mechanism_memory struct.
+ */
+size_t required_mechanism_size();
+/**
+ * \brief Initializes the host and device mechanism_memory structs.  This is required in order to enable
+ *        passing the struct to CUDA
+ * \param[in]               padded              The padded number of threads to be used by the CUDA solver
+ * \param[in,out]           h_mem               The host version of the mechanism_memory struct to initialize.
+ * \param[in,out]           d_mem               The device version of the mechanism_memory struct to copy the resulting host mechanism_memory struct to.
+ */
+void initialize_gpu_memory(int padded, mechanism_memory** h_mem, mechanism_memory** d_mem);
+/**
+ * \brief Frees the host and device mechanism_memory structs
+ * \param[in,out]           h_mem               The host version of the mechanism_memory struct.
+ * \param[in,out]           d_mem               The device version of the mechanism_memory struct.
+ */
+void free_gpu_memory(mechanism_memory** h_mem, mechanism_memory** d_mem);
+
+#ifdef GENERATE_DOCS
+}
+#endif
+<<<<<<< HEAD
+=======
+
+#endif
+>>>>>>> master
diff --git a/examples/oregonator/header.cuh b/examples/oregonator/header.cuh
new file mode 100644
index 0000000..6189255
--- /dev/null
+++ b/examples/oregonator/header.cuh
@@ -0,0 +1,56 @@
+/*!
+ * \file
+ * \brief An example header file that defines system size, memory functions and other required methods
+ *        for integration of the Oregonator's equation with the CUDA solvers
+ */
+
+#ifndef HEADER_GUARD_CUH
+#define HEADER_GUARD_CUH
+
+#include <stdlib.h>
+#include "gpu_macros.cuh"
+#include "gou_memory.cuh"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+//! The IVP system size
+#define NSP (3)
+//! Input vector size (in read_initial_conditions)
+#define NN (NSP)
+
+/*!
+ *
+ * \brief Set same ICs for all problems
+ * \param NUM       The number of initial value problems
+ * \param y_host    The state vectors to initialize
+ * \param var_host  The vector of \f$mu\f$ parameters for the Oregonator equation, not working now
+ *
+ */
+void set_same_initial_conditions(int NUM, double** y_host, double** var_host);
+
+//dummy definitions that are used for pyJac
+
+/**
+ * \brief Not needed for Oregonator
+ *
+ *  In pyJac, these are used to transform the input/output vectors to deal with moving the
+ *         last species mass fraction
+ */
+void apply_mask(double* y_host);
+/**
+ * \brief Not needed for Oregonator
+ *
+ *  In pyJac, these are used to transform the input/output vectors to deal with moving the
+ *         last species mass fraction
+ */
+void apply_reverse_mask(double* y_host);
+
+
+#ifdef GENERATE_DOCS
+}
+#endif
+
+#endif
diff --git a/examples/oregonator/header.h b/examples/oregonator/header.h
new file mode 100644
index 0000000..737388a
--- /dev/null
+++ b/examples/oregonator/header.h
@@ -0,0 +1,62 @@
+/*!
+ * \file
+ * \brief An example header file that defines system size and other required methods
+ *        for integration of the Oregonator's equation.
+ */
+
+#ifndef HEADER_GUARD_H
+#define HEADER_GUARD_H
+
+#include <stdlib.h>
+#include <math.h>
+
+//include OpenMP if available
+#ifdef _OPENMP
+ #include <omp.h>
+#else
+ #define omp_get_max_threads() 1
+ #define omp_get_num_threads() 1
+#endif
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator {
+#endif
+
+//! The IVP system size
+#define NSP (3)
+//! Input vector size (in read_initial_conditions)
+#define NN (NSP)
+
+/*!
+ *
+ * \brief Set same ICs for all problems
+ * \param NUM       The number of initial value problems
+ * \param y_host    The state vectors to initialize
+ * \param var_host  The vector of \f$mu\f$ parameters for the Oregonator equation, not currently working
+ *
+ */
+void set_same_initial_conditions(int NUM, double** y_host, double** var_host);
+
+//dummy definitions that are used for pyJac
+
+/**
+ * \brief Not needed for Oregonator
+ *
+ *  In pyJac, these are used to transform the input/output vectors to deal with moving the
+ *         last species mass fraction
+ */
+void apply_mask(double* y_host);
+/**
+ * \brief Not needed for Oregonator
+ *
+ *  In pyJac, these are used to transform the input/output vectors to deal with moving the
+ *         last species mass fraction
+ */
+void apply_reverse_mask(double* y_host);
+
+#ifdef GENERATE_DOCS
+}
+#endif
+
+#endif
diff --git a/examples/oregonator/ics.c b/examples/oregonator/ics.c
new file mode 100644
index 0000000..26a86dc
--- /dev/null
+++ b/examples/oregonator/ics.c
@@ -0,0 +1,58 @@
+/**
+ * \file
+ * \brief Sets same Initial Conditions (ICs) for all problems
+ *
+ * This provides simple definitions of the set_same_initial_conditions function
+ * used when SAME_ICS is defined
+ */
+
+#include "header.h"
+
+#ifdef GENERATE_DOCS
+//put this in the van der Pol namespace for documentation
+namespace oregonator {
+#endif
+
+
+/*!
+ *
+ * \brief Set same ICs for all problems
+ * \param NUM       The number of initial value problems
+ * \param y_host    The state vectors to initialize
+ * \param var_host  The vector of \f$\mu\f$ parameters for the Oregonator equation, not currently working
+ *
+ */
+void set_same_initial_conditions(int NUM, double** y_host, double** var_host)
+{
+    //init vectors
+    (*y_host) = (double*)malloc(NUM * NSP * sizeof(double));
+    (*var_host) = (double*)malloc(NUM * sizeof(double));
+    //now set the values
+    for (int i = 0; i < NUM; ++i){
+      //Dummy value, this won't do anything important for the Oregonator
+      (*var_host)[i] = 1000;
+      //set y values
+      (*y_host)[i] = 1;
+      (*y_host)[i + NUM] = 1;
+      (*y_host)[i + 2 * NUM] = 2;
+    }
+}
+
+/**
+ * \brief Not needed for Oregonator
+ *
+ *  In pyJac, these are used to transform the input/output vectors to deal with moving the
+ *         last species mass fraction
+ */
+void apply_mask(double* y_host) {}
+/**
+ * \brief Not needed for Oregonator
+ *
+ *  In pyJac, these are used to transform the input/output vectors to deal with moving the
+ *         last species mass fraction
+ */
+void apply_reverse_mask(double* y_host) {}
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/ics.cu b/examples/oregonator/ics.cu
new file mode 100644
index 0000000..10e0310
--- /dev/null
+++ b/examples/oregonator/ics.cu
@@ -0,0 +1,58 @@
+/**
+ * \file
+ * \brief Sets same Initial Conditions (ICs) for all problems
+ *
+ * This provides simple definitions of the set_same_initial_conditions function
+ * used when SAME_ICS is defined
+ */
+
+#include "header.h"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+
+/*!
+ *
+ * \brief Set same ICs for all problems
+ * \param NUM       The number of initial value problems
+ * \param y_host    The state vectors to initialize
+ * \param var_host  The vector of \f$\mu\f$ parameters for the Oregonator equation
+ *
+ */
+void set_same_initial_conditions(int NUM, double** y_host, double** var_host)
+{
+    //init vectors
+    (*y_host) = (double*)malloc(NUM * NSP * sizeof(double));
+    (*var_host) = (double*)malloc(NUM * sizeof(double));
+    //now set the values
+    for (int i = 0; i < NUM; ++i){
+      //Dummy value, this won't do anything important for the Oregonator
+      (*var_host)[i] = 1000;
+      //set y values
+      (*y_host)[i] = 1;
+      (*y_host)[i + NUM] = 1;
+      (*y_host)[i + 2 * NUM] = 2;
+    }
+}
+
+/**
+ * \brief Not needed for Oregonator
+ *
+ *  In pyJac, these are used to transform the input/output vectors to deal with moving the
+ *         last species mass fraction
+ */
+void apply_mask(double* y_host) {}
+/**
+ * \brief Not needed for Oregonator
+ *
+ *  In pyJac, these are used to transform the input/output vectors to deal with moving the
+ *         last species mass fraction
+ */
+void apply_reverse_mask(double* y_host) {}
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/jacob.c b/examples/oregonator/jacob.c
new file mode 100644
index 0000000..96a66ad
--- /dev/null
+++ b/examples/oregonator/jacob.c
@@ -0,0 +1,82 @@
+/**
+ * \file
+ * \brief An implementation of the Oregonator jacobian \f$\frac{\partial \dot{\vec{y}}}{\partial \vec{y}}\f$
+ *
+ * Implementes the evaluation of Oregonator Jacobian
+ *
+ */
+
+#include "header.h"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator {
+#endif
+
+#define TRANSPOSE
+
+/**
+ * \brief An implementation of the Oregonator jacobian
+ *
+ * \param[in]           t               The current system time
+ * \param[in]           mu              Dummy parameter, needed for compatibility only
+ * \param[in]           y               The state vector at time t
+ * \param[out]          jac             The jacobian to populate
+ *
+ *  The Jacobian is in a local Column-major (Fortran) order.  As with dydt(), this function operates on local
+ *  copies of the global state vector and jacobian.  Hence simple linear indexing can be used here.
+ *  @see solver_generic.c
+ *
+ */
+void eval_jacob (const double t, const double mu, const double * __restrict__ y, double * __restrict__ jac)
+{
+
+  double s = 77.27;
+  double q = 8.375E-6;
+  double w = 0.161;
+
+#ifndef TRANSPOSE
+  // dydot_0 / dy0
+  jac[0] = s * (-1 * y[1] + 1 - q * 2 * y[0]);
+  // dydot_0 / dy1
+  jac[1] = s * (1 - y[0]);
+  // dydot_0 / dy2
+  jac[2] = 0;
+  // dydot_1 / dy0
+  jac[3] = -1 * y[1] / s;
+  // dydot_1 / dy1
+  jac[4] = (-1 - y[0]) / s;
+  // dydot_1 / dy2
+  jac[5] = 1 / s;
+  // dydot_2 / dy0
+  jac[6] = w;
+  // dydot_2 / dy1
+  jac[7] = 0;
+  // dydot_2 / dy2
+  jac[8] = -1 * w;
+#else
+  // dydot_0 / dy0
+  jac[0] = s * (-1 * y[1] + 1 - q * 2 * y[0]);
+  // dydot_1 / dy0
+  jac[1] = -1 * y[1] / s;
+  // dydot_2 / dy0
+  jac[2] = w;
+  // dydot_0 / dy1
+  jac[3] = s * (1 - y[0]);
+  // dydot_1 / dy1
+  jac[4] = (-1 - y[0]) / s;
+  // dydot_2 / dy1
+  jac[5] = 0;
+  // dydot_0 / dy2
+  jac[6] = 0;
+  // dydot_1 / dy2
+  jac[7] = 1 / s;
+  // dydot_2 / dy2
+  jac[8] = -1 * w;
+#endif
+
+}
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/jacob.cu b/examples/oregonator/jacob.cu
new file mode 100644
index 0000000..7c55b13
--- /dev/null
+++ b/examples/oregonator/jacob.cu
@@ -0,0 +1,47 @@
+/**
+ * \file
+ * \brief A CUDA implementation of the Oregonator jacobian \f$\frac{\partial \dot{\vec{y}}}{\partial \vec{y}}\f$
+ *
+ * Implementes the CUDA evaluation of Oregonator Jacobian
+ *
+ */
+
+#include "header.cuh"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+/**
+ * \brief An implementation of the Oregonator jacobian
+ *
+ * \param[in]           t               The current system time
+ * \param[in]           mu              Dummy parameter, needed for compatibility only
+ * \param[in]           y               The state vector at time t
+ * \param[out]          jac             The jacobian to populate
+ * \param[in]           d_mem           The mechanism_memory struct.  In future versions, this will be used to access the \f$\mu\f$ parameter to have a consistent interface.
+ *
+ *  The Jacobian is in Column-major (Fortran) order.  As with dydt(), this function operates directly on
+ *  global state vector and jacobian.  Hence we use the #INDEX macro defined in gpu_macros.cuh here.
+ *  @see dydt()
+ *
+ */
+__device__
+void eval_jacob (const double t, const double mu, const double * __restrict__ y, double * __restrict__ jac, const mechanism_memory * __restrict__ d_mem)
+{
+  jac[INDEX(0)] = s * (-1 * y[INDEX(1)] + 1 - q * 2 * y[INDEX(0)]);
+  jac[INDEX(1)] = s * (1 - y[INDEX(0)]);
+  jac[INDEX(2)] = 0;
+  jac[INDEX(3)] = -1 * y[INDEX(1)] / s;
+  jac[INDEX(4)] = (-1 - y[INDEX(0)]) / s;
+  jac[INDEX(5)] = 1 / s;
+  jac[INDEX(6)] = w;
+  jac[INDEX(7)] = 0;
+  jac[INDEX(8)] = -1 * w;
+
+}
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/jacob.cuh b/examples/oregonator/jacob.cuh
new file mode 100644
index 0000000..0b642aa
--- /dev/null
+++ b/examples/oregonator/jacob.cuh
@@ -0,0 +1,28 @@
+/**
+ * \file
+ * \brief Contains a header definition for the CUDA van der Pol Jacobian evaluation
+ *
+ */
+
+#ifdef GENERATE_DOCS
+//put this in the van der Pol namespace for documentation
+namespace van_der_pol_cu {
+#endif
+
+
+/**
+ * \brief An implementation of the van der Pol jacobian
+ *
+ * \param[in]           t               The current system time
+ * \param[in]           mu              The van der Pol parameter
+ * \param[in]           y               The state vector at time t
+ * \param[out]          jac             The jacobian to populate
+ * \param[in]           d_mem           The mechanism_memory struct.  In future versions, this will be used to access the \f$\mu\f$ parameter to have a consistent interface.
+ *
+ */
+__device__
+void eval_jacob (const double t, const double mu, const double * __restrict__ y, double * __restrict__ jac, const mechanism_memory * __restrict__ d_mem);
+
+#ifdef GENERATE_DOCS
+}
+#endif
\ No newline at end of file
diff --git a/examples/oregonator/jacob.h b/examples/oregonator/jacob.h
new file mode 100644
index 0000000..9fcd8d1
--- /dev/null
+++ b/examples/oregonator/jacob.h
@@ -0,0 +1,32 @@
+/**
+ * \file
+ * \brief Contains a header definition for the Oregonator Jacobian evaluation
+ *
+ */
+
+#ifndef JACOB_H
+#define JACOB_H
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator {
+#endif
+
+
+/**
+ * \brief An implementation of the Oregonator jacobian
+ *
+ * \param[in]           t               The current system time
+ * \param[in]           mu              Dummy parameter, needed for compatibility only
+ * \param[in]           y               The state vector at time t
+ * \param[out]          jac             The jacobian to populate
+ *
+ */
+void eval_jacob (const double t, const double mu, const double * __restrict__ y, double * __restrict__ jac);
+
+#ifdef GENERATE_DOCS
+}
+
+#endif
+
+#endif
diff --git a/examples/oregonator/launch_bounds.cuh b/examples/oregonator/launch_bounds.cuh
new file mode 100644
index 0000000..cce4de4
--- /dev/null
+++ b/examples/oregonator/launch_bounds.cuh
@@ -0,0 +1,16 @@
+/**
+ *  \file
+ *  \brief A number of definitions that control CUDA kernel launches
+ */
+
+#ifndef LAUNCH_BOUNDS_CUH
+#define LAUNCH_BOUNDS_CUH
+
+//! The target number of threads per block
+#define TARGET_BLOCK_SIZE (64)
+//! Shared memory per thread
+#define SHARED_SIZE (0)
+//! Large L1 cache active
+#define PREFERL1
+
+#endif
\ No newline at end of file
diff --git a/examples/oregonator/oregonator.dox b/examples/oregonator/oregonator.dox
new file mode 100644
index 0000000..b8ede8a
--- /dev/null
+++ b/examples/oregonator/oregonator.dox
@@ -0,0 +1,133 @@
+/**
+\page vdp The van der Pol problem
+
+The van der Pol problem is a commonly used stiff test problem for ODE solvers.
+The stiffness is 'tunable' via modification of parameter \f$\mu\f$
+A complete reference can be found in:
+
+        Hairer, Noersett, Wanner: Solving Ordinary Differential Equations II, p 157. Springer-Verlag 1991.
+or for a practical example in Matlab, [see here](http://www.ece.northwestern.edu/local-apps/matlabhelp/techdoc/math_anal/diffeq6.html).
+
+\section defn Problem Definition
+The van der Pol problem can be written as a set of two first-order of non-linear ODE equations:
+
+        \f{align}{
+            & y_1^\prime =&& y_2 \, & y_1(0) =&& y_1^\circ \nonumber \\
+            & y_2^\prime =&& \mu \left(1 - y_1^2y_2\right) - y_1 \, & y_2(0) =&&  y_2^\circ \nonumber
+          \f}
+
+For our purposes, we shall use:
+
+        \f{align}{
+           y_1^\circ &= 2 \nonumber \\
+           y_2^\circ &= 0 \nonumber \\
+           \mu&= 1000 \nonumber
+          \f}
+
+for [comparison purposes](http://www.ece.northwestern.edu/local-apps/matlabhelp/techdoc/math_anal/diffeq6.html#37714)
+
+\section accelerint_indx A Note on Indexing
+
+The format of arrays expected by `accelerInt` is of note.
+
+\attention In all cases, the C-solvers operate on local copies of the state vectors and Jacobian matrix.
+This means that the problem index (i.e. which IVP is being solved) does not enter into vector or matrix indexing.
+The memory layout is still a Column-major format for the Jacobian matrix.
+To see an example of proper Jacobian layout and indexing for C-solvers, see jacob.c
+@see jacob.c
+
+\attention In the CUDA case however, the state vectors and Jacobian matrix remain in global memory by necessity due to limitations
+on the size of local memory per-thread.  Here, the IVP index does factor into indexing concerns, and can be accessed using the built-in CUDA thread indexing parameters (e.g. `threadIdx.x`, etc.).
+Convienience macros are defined in gpu_macros.cuh to aid indexing; for an example of Jacobian layout and indexing for the CUDA-case, see jacob.cu.
+@see #INDEX, gpu_macros.cuh, jacob.cu
+
+\section impl Implementation
+
+This section details the implementation files required for the C and CUDA solvers.
+
+\subsection rhs RHS implementation
+
+First, both languages need a dydt() function implementation.
+Both a header and implementation file are required.
+- For the C-solvers, the implementation is in dydt.c and the header definition in dydt.h
+- For the CUDA-sovlers, the implementation and headers are dydt.cu and dydt.cuh respectively.
+
+\subsection jac Jacobian
+
+If #FINITE_DIFFERENCE is not specified in the \ref scons_opts "SCons Options", an analytical Jacobian implementation must be must be specified.
+Both a header and implementation file are required:
+- See jacob.h and jacob.c for the C-solvers
+- See jacob.cuh and jacob.cu for the C-solvers
+
+\subsection header System Header
+
+The ODE header.h file contains several important macros and definitions.
+- First, the macro #NSP defines the number of variables in the state vector, i.e. 2 for this problem.
+- Second the macro #NN should be defined to be equal to #NSP.
+See \ref ics for more information.
+
+The header.h file also contains header definitions for several functions dealing with initial conditions, set_same_initial_conditions(), apply_mask() and apply_reverse_mask().
+
+The CUDA header.cuh file is similar to the C-version, with a few additional methods regarding \ref GPU_mem GPU Memory initialization.
+It contains the mechanism_memory struct, which defines memory needed for RHS and Jacobian evaluation, as well as the initialize_gpu_memory(), free_gpu_memory(), and required_mechanism_size() methods, which initialize, free
+and calculate the required GPU memory respectively.
+
+\subsection GPU_mem GPU Memory Initialization
+
+In order to operate the CUDA solvers, the user must implement some additional functions designed to properly initialize the required GPU memory.
+`accelerInt` pre-allocates the required memory for the solver and ODE model in the GPU's global memory to get around use of local per-thread (stack) memory allocation which becomes limiting for large ODE systems.
+
+- required_mechanism_size() determines the amount of memory (in bytes) required per each individual CUDA thread.  This is used to determine the maximum number of threads that can be launched per GPU and to split integration into batches if necessary.  This is similar to the various required_solver_size() implementations for the solvers.
+- initialize_gpu_memory() initializes the host and device versions of the mechanism_memory structs, similar to the various initialize_solver() implementations for the GPU solvers
+- free_gpu_memory() cleans up the host and device mechanism_memory structs as with the cleanup_solver() implementation for each GPU solver.
+
+\subsection launch GPU Launch Bounds
+
+The launch_bounds.cuh file contains definitions for a few variables that control CUDA kernel launches:
+- #TARGET_BLOCK_SIZE defines the number of threads per CUDA block
+- #SHARED_SIZE defines the size of shared memory per CUDA block.  This is not currently used by the solvers, but could potentially be used by a used defined RHS / Jacobian implementation.
+- #PREFERL1 tells CUDA to prefer a larger L1 cache over more shared memory.  It is recommended to keep this on.
+
+\subsection ics Initial Conditions
+
+The initial condition reader was designed with @pyJac in mind, it is currently quite cumbersome to use for other data formats.
+
+- Although this will be upgraded in future releases, currently the #NN macro should be defined to be equal to #NSP.
+- Additionally, the initial conditions reader expects the following data format:
+
+        \f{align}{ \nonumber
+           \text{time}, y_1, \text{parameter}, y_2,& y_3, ... y_{NSP} \text{ (State 1)}\\ \nonumber
+           \text{time}, y_1, \text{parameter}, y_2,& y_3, ... y_{NSP} \text{ (State 2)}\\ \nonumber
+           \vdots&
+        \f}
+An example of how to write a file in this format can be found in generate_ics.py
+
+The set_same_initial_conditions(), defined in ics.c and ics.cu defines a simple function to set
+the initial conditions of the state vector if the \ref scons_opts "SCons option" #SAME_IC is used.
+- In this case, we use this method to set the initial conditions described in \ref defn.
+
+Additionally, ics.c / ics.cu defines the dummy methods apply_mask() and apply_reverse_mask(), which are not used for the van der Pol problem.
+
+@see read_initial_conditions.c, read_initial_conditions.cu
+
+\subsection mult Matrix-Vector Multiplier
+
+Finally, the sparse_multiplier.c, sparse_multiplier.cu, sparse_multiplier.h and sparse_multiplier.cuh provide headers and definitions for a "sparse"
+(unrolled) matrix multiplier.
+This is used by the exponential solvers (and may provide speedups if a sparse Jacobian is used).
+
+\section compilation Compiling the van der Pol problem.
+
+We compile the problem with the SCons call:
+
+        scons SAME_IC=True LOG_OUTPUT=True LOG_END_ONLY=False t_end=2000.0 t_step=1.0 mechanism_dir=examples/van_der_pol/ -j 2
+
+This turns on the initial conditions discussed in \ref defn.
+
+- The `j` parameter allows scons to use 2 threads during compilation (vary as necessary)
+- The `mechanism_dir` option tells scons to look in the `van_der_pol` example directory for \ref impl implementation of the required functions described above.
+- We log the output to a file (generated by default in `accelerInt-root/log/solvername.bin`), for plotting.\n
+- The time step #t_step is set to 1 second, while the end time #end_time is set to 2000 seconds.
+- The default error tolerances #ATOL/#RTOL = \f$\num{e-10}\text{ and }\num{e-6}\f$ respectively are used.
+- Additionaly the #PRINT=True option tuns on logging to the screen (if so desired)
+*/
diff --git a/examples/oregonator/plotter.py b/examples/oregonator/plotter.py
new file mode 100644
index 0000000..71ccdda
--- /dev/null
+++ b/examples/oregonator/plotter.py
@@ -0,0 +1,39 @@
+# imports
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+
+# load log files
+files = [os.path.join('log', file) for file in os.listdir('./log/')
+         if file.endswith('.bin')]
+files = [file for file in files if os.path.isfile(file)]
+
+linestyles = ['-', '--', '-.']
+colorwheel = ['r', 'g', 'b', 'k']
+
+# load data
+for i, file in enumerate(files):
+    arr = np.fromfile(file)
+    # reshape to a 2001 x 3 matrix (time, y1, y2)
+    arr = np.reshape(arr, (2001, 3))
+
+    # get the solver name
+    label = file.split('-')[0]
+    label = label[label.index('/') + 1:]
+    label += '-GPU' if 'gpu' in file else ''
+
+    # plot y1
+    plt.plot(arr[:, 0][::10], arr[:, 1][::10],
+             linestyle=linestyles[i % len(linestyles)],
+             label=label, color=colorwheel[i % len(colorwheel)])
+
+# make legend
+plt.legend(loc=0)
+
+# title and labels
+plt.title('Oregonator equation')
+plt.xlabel('Time(s)')
+plt.ylabel('$y_1$')
+
+# and save fig
+plt.savefig('oregonator.png', dpi=300, size=(5, 3))
diff --git a/examples/oregonator/sparse_multiplier.c b/examples/oregonator/sparse_multiplier.c
new file mode 100644
index 0000000..864956c
--- /dev/null
+++ b/examples/oregonator/sparse_multiplier.c
@@ -0,0 +1,31 @@
+/**
+ * \file
+ * \brief Implementation for Jacobian vector multiplication, used in exponential integrators
+ *
+ */
+
+#include "sparse_multiplier.h"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator {
+#endif
+
+
+/**
+ * \brief Implements Jacobian \ vector multiplication in sparse (or unrolled) form
+ * \param[in]           A           The (NSP x NSP) Jacobian matrix, see eval_jacob() for details on layout
+ * \param[in]           Vm          The (NSP x 1) vector to multiply by
+ * \param[out]          w           The (NSP x 1) vector to store the result in, \f$w := A * Vm\f$
+ */
+ //Needs to be modified for Oregonator
+void sparse_multiplier(const double * A, const double * Vm, double* w) {
+  w[0] =  A[0] * Vm[0] +  A[3] * Vm[1] + A[6] * Vm[2];
+  w[1] =  A[1] * Vm[0] +  A[4] * Vm[1] + A[7] * Vm[2];
+  w[2] =  A[2] * Vm[0] +  A[5] * Vm[1] + A[8] * Vm[2];
+}
+
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/sparse_multiplier.cu b/examples/oregonator/sparse_multiplier.cu
new file mode 100644
index 0000000..37cad4d
--- /dev/null
+++ b/examples/oregonator/sparse_multiplier.cu
@@ -0,0 +1,31 @@
+/**
+ * \file
+ * \brief Implementation for CUDA Jacobian vector multiplication, used in exponential integrators
+ *
+ */
+
+#include "sparse_multiplier.cuh"
+
+#ifdef GENERATE_DOCS
+//put this in the Oregonator namespace for documentation
+namespace oregonator_cu {
+#endif
+
+
+/**
+ * \brief Implements Jacobian \ vector multiplication in sparse (or unrolled) form
+ * \param[in]           A           The (NSP x NSP) Jacobian matrix, see eval_jacob() for details on layout
+ * \param[in]           Vm          The (NSP x 1) vector to multiply by
+ * \param[out]          w           The (NSP x 1) vector to store the result in, \f$w := A * Vm\f$
+ */
+__device__
+void sparse_multiplier(const double * A, const double * Vm, double* w) {
+  w[INDEX(0)] =  A[INDEX(0)] * Vm[INDEX(0)] +  A[INDEX(3)] * Vm[INDEX(1)] + A[INDEX(6)] * Vm[INDEX(2)];
+  w[INDEX(1)] =  A[INDEX(1)] * Vm[INDEX(0)] +  A[INDEX(4)] * Vm[INDEX(1)] + A[INDEX(7)] * Vm[INDEX(2)];
+  w[INDEX(2)] =  A[INDEX(2)] * Vm[INDEX(0)] +  A[INDEX(5)] * Vm[INDEX(1)] + A[INDEX(8)] * Vm[INDEX(2)];
+}
+
+
+#ifdef GENERATE_DOCS
+}
+#endif
diff --git a/examples/oregonator/sparse_multiplier.cuh b/examples/oregonator/sparse_multiplier.cuh
new file mode 100644
index 0000000..71181b3
--- /dev/null
+++ b/examples/oregonator/sparse_multiplier.cuh
@@ -0,0 +1,32 @@
+/**
+ * \file
+ * \brief Header definition for CUDA Jacobian vector multiplier, used in exponential integrators
+ *
+ */
+
+#ifndef SPARSE_HEAD_CUH
+#define SPARSE_HEAD_CUH
+
+#include "header.cuh"
+
+
+#ifdef GENERATE_DOCS
+//put this in the van der Pol namespace for documentation
+namespace van_der_pol_cu {
+#endif
+
+/**
+ * \brief Implements Jacobian \ vector multiplication in sparse (or unrolled) form
+ * \param[in]           A           The (NSP x NSP) Jacobian matrix, see eval_jacob() for details on layout
+ * \param[in]           Vm          The (NSP x 1) vector to multiply by
+ * \param[out]          w           The (NSP x 1) vector to store the result in, \f$w := A * Vm\f$
+ */
+__device__
+void sparse_multiplier (const double * A, const double * Vm, double* w);
+
+
+#ifdef GENERATE_DOCS
+}
+#endif
+
+#endif
diff --git a/examples/oregonator/sparse_multiplier.h b/examples/oregonator/sparse_multiplier.h
new file mode 100644
index 0000000..1db49a7
--- /dev/null
+++ b/examples/oregonator/sparse_multiplier.h
@@ -0,0 +1,25 @@
+/**
+ * \file
+ * \brief Header definition for Jacobian vector multiplier, used in exponential integrators
+ *
+ */
+
+#ifndef SPARSE_HEAD
+#define SPARSE_HEAD
+
+#include "header.h"
+
+
+#ifdef GENERATE_DOCS
+//put this in the van der Pol namespace for documentation
+namespace van_der_pol {
+#endif
+
+void sparse_multiplier (const double *, const double *, double*);
+
+
+#ifdef GENERATE_DOCS
+}
+#endif
+
+#endif

From debbfe1a13e16a36082e8e522f35fa4e4cb80ee4 Mon Sep 17 00:00:00 2001
From: Andrew Alferman <alfermaa@oregonstate.edu>
Date: Wed, 2 May 2018 11:30:00 -0700
Subject: [PATCH 2/2] Cleared the dox file.

---
 examples/oregonator/oregonator.dox | 132 +----------------------------
 1 file changed, 2 insertions(+), 130 deletions(-)

diff --git a/examples/oregonator/oregonator.dox b/examples/oregonator/oregonator.dox
index b8ede8a..0159883 100644
--- a/examples/oregonator/oregonator.dox
+++ b/examples/oregonator/oregonator.dox
@@ -1,133 +1,5 @@
 /**
-\page vdp The van der Pol problem
+\page oregonator The Oregonator problem
 
-The van der Pol problem is a commonly used stiff test problem for ODE solvers.
-The stiffness is 'tunable' via modification of parameter \f$\mu\f$
-A complete reference can be found in:
-
-        Hairer, Noersett, Wanner: Solving Ordinary Differential Equations II, p 157. Springer-Verlag 1991.
-or for a practical example in Matlab, [see here](http://www.ece.northwestern.edu/local-apps/matlabhelp/techdoc/math_anal/diffeq6.html).
-
-\section defn Problem Definition
-The van der Pol problem can be written as a set of two first-order of non-linear ODE equations:
-
-        \f{align}{
-            & y_1^\prime =&& y_2 \, & y_1(0) =&& y_1^\circ \nonumber \\
-            & y_2^\prime =&& \mu \left(1 - y_1^2y_2\right) - y_1 \, & y_2(0) =&&  y_2^\circ \nonumber
-          \f}
-
-For our purposes, we shall use:
-
-        \f{align}{
-           y_1^\circ &= 2 \nonumber \\
-           y_2^\circ &= 0 \nonumber \\
-           \mu&= 1000 \nonumber
-          \f}
-
-for [comparison purposes](http://www.ece.northwestern.edu/local-apps/matlabhelp/techdoc/math_anal/diffeq6.html#37714)
-
-\section accelerint_indx A Note on Indexing
-
-The format of arrays expected by `accelerInt` is of note.
-
-\attention In all cases, the C-solvers operate on local copies of the state vectors and Jacobian matrix.
-This means that the problem index (i.e. which IVP is being solved) does not enter into vector or matrix indexing.
-The memory layout is still a Column-major format for the Jacobian matrix.
-To see an example of proper Jacobian layout and indexing for C-solvers, see jacob.c
-@see jacob.c
-
-\attention In the CUDA case however, the state vectors and Jacobian matrix remain in global memory by necessity due to limitations
-on the size of local memory per-thread.  Here, the IVP index does factor into indexing concerns, and can be accessed using the built-in CUDA thread indexing parameters (e.g. `threadIdx.x`, etc.).
-Convienience macros are defined in gpu_macros.cuh to aid indexing; for an example of Jacobian layout and indexing for the CUDA-case, see jacob.cu.
-@see #INDEX, gpu_macros.cuh, jacob.cu
-
-\section impl Implementation
-
-This section details the implementation files required for the C and CUDA solvers.
-
-\subsection rhs RHS implementation
-
-First, both languages need a dydt() function implementation.
-Both a header and implementation file are required.
-- For the C-solvers, the implementation is in dydt.c and the header definition in dydt.h
-- For the CUDA-sovlers, the implementation and headers are dydt.cu and dydt.cuh respectively.
-
-\subsection jac Jacobian
-
-If #FINITE_DIFFERENCE is not specified in the \ref scons_opts "SCons Options", an analytical Jacobian implementation must be must be specified.
-Both a header and implementation file are required:
-- See jacob.h and jacob.c for the C-solvers
-- See jacob.cuh and jacob.cu for the C-solvers
-
-\subsection header System Header
-
-The ODE header.h file contains several important macros and definitions.
-- First, the macro #NSP defines the number of variables in the state vector, i.e. 2 for this problem.
-- Second the macro #NN should be defined to be equal to #NSP.
-See \ref ics for more information.
-
-The header.h file also contains header definitions for several functions dealing with initial conditions, set_same_initial_conditions(), apply_mask() and apply_reverse_mask().
-
-The CUDA header.cuh file is similar to the C-version, with a few additional methods regarding \ref GPU_mem GPU Memory initialization.
-It contains the mechanism_memory struct, which defines memory needed for RHS and Jacobian evaluation, as well as the initialize_gpu_memory(), free_gpu_memory(), and required_mechanism_size() methods, which initialize, free
-and calculate the required GPU memory respectively.
-
-\subsection GPU_mem GPU Memory Initialization
-
-In order to operate the CUDA solvers, the user must implement some additional functions designed to properly initialize the required GPU memory.
-`accelerInt` pre-allocates the required memory for the solver and ODE model in the GPU's global memory to get around use of local per-thread (stack) memory allocation which becomes limiting for large ODE systems.
-
-- required_mechanism_size() determines the amount of memory (in bytes) required per each individual CUDA thread.  This is used to determine the maximum number of threads that can be launched per GPU and to split integration into batches if necessary.  This is similar to the various required_solver_size() implementations for the solvers.
-- initialize_gpu_memory() initializes the host and device versions of the mechanism_memory structs, similar to the various initialize_solver() implementations for the GPU solvers
-- free_gpu_memory() cleans up the host and device mechanism_memory structs as with the cleanup_solver() implementation for each GPU solver.
-
-\subsection launch GPU Launch Bounds
-
-The launch_bounds.cuh file contains definitions for a few variables that control CUDA kernel launches:
-- #TARGET_BLOCK_SIZE defines the number of threads per CUDA block
-- #SHARED_SIZE defines the size of shared memory per CUDA block.  This is not currently used by the solvers, but could potentially be used by a used defined RHS / Jacobian implementation.
-- #PREFERL1 tells CUDA to prefer a larger L1 cache over more shared memory.  It is recommended to keep this on.
-
-\subsection ics Initial Conditions
-
-The initial condition reader was designed with @pyJac in mind, it is currently quite cumbersome to use for other data formats.
-
-- Although this will be upgraded in future releases, currently the #NN macro should be defined to be equal to #NSP.
-- Additionally, the initial conditions reader expects the following data format:
-
-        \f{align}{ \nonumber
-           \text{time}, y_1, \text{parameter}, y_2,& y_3, ... y_{NSP} \text{ (State 1)}\\ \nonumber
-           \text{time}, y_1, \text{parameter}, y_2,& y_3, ... y_{NSP} \text{ (State 2)}\\ \nonumber
-           \vdots&
-        \f}
-An example of how to write a file in this format can be found in generate_ics.py
-
-The set_same_initial_conditions(), defined in ics.c and ics.cu defines a simple function to set
-the initial conditions of the state vector if the \ref scons_opts "SCons option" #SAME_IC is used.
-- In this case, we use this method to set the initial conditions described in \ref defn.
-
-Additionally, ics.c / ics.cu defines the dummy methods apply_mask() and apply_reverse_mask(), which are not used for the van der Pol problem.
-
-@see read_initial_conditions.c, read_initial_conditions.cu
-
-\subsection mult Matrix-Vector Multiplier
-
-Finally, the sparse_multiplier.c, sparse_multiplier.cu, sparse_multiplier.h and sparse_multiplier.cuh provide headers and definitions for a "sparse"
-(unrolled) matrix multiplier.
-This is used by the exponential solvers (and may provide speedups if a sparse Jacobian is used).
-
-\section compilation Compiling the van der Pol problem.
-
-We compile the problem with the SCons call:
-
-        scons SAME_IC=True LOG_OUTPUT=True LOG_END_ONLY=False t_end=2000.0 t_step=1.0 mechanism_dir=examples/van_der_pol/ -j 2
-
-This turns on the initial conditions discussed in \ref defn.
-
-- The `j` parameter allows scons to use 2 threads during compilation (vary as necessary)
-- The `mechanism_dir` option tells scons to look in the `van_der_pol` example directory for \ref impl implementation of the required functions described above.
-- We log the output to a file (generated by default in `accelerInt-root/log/solvername.bin`), for plotting.\n
-- The time step #t_step is set to 1 second, while the end time #end_time is set to 2000 seconds.
-- The default error tolerances #ATOL/#RTOL = \f$\num{e-10}\text{ and }\num{e-6}\f$ respectively are used.
-- Additionaly the #PRINT=True option tuns on logging to the screen (if so desired)
+More to come!
 */