v4.2.0
Yidasvc committed Dec 13, 2024
1 parent 7a8798d commit a5ab648
Showing 22 changed files with 1,801 additions and 821 deletions.
Binary file added PanguLU_Users_s_Guide.pdf
34 changes: 19 additions & 15 deletions README.md
@@ -154,32 +154,36 @@ In this example, 6 processes are used to test, the block_size is 4, matrix name

## Release versions

#### <p align='left'>Version 4.1.0 (Sep. 01, 2024) </p>
#### <p align='left'>Version 4.2.0 (Dec. 13, 2024) </p>

* Optimized memory usage of numeric factorisation and solving.
* Updated preprocessing phase to distributed data structure.

#### <p align='left'>Version 4.1.0 (Sep. 1, 2024) </p>

* Optimized memory usage of numeric factorisation and solving;
* Added parallel building support.

#### <p align='left'>Version 4.0.0 (Jul. 24, 2024) </p>

* Optimized user interfaces of solver routines.
* Optimized performance of numeric factorisation phase on CPU platforms.
* Added support on complex matrix solving.
* Optimized pre-processing performance.
* Optimized user interfaces of solver routines;
* Optimized performance of numeric factorisation phase on CPU platform;
* Added support on complex matrix solving;
* Optimized preprocessing performance;

#### <p align='left'>Version 3.5.0 (Aug. 06, 2023) </p>

* Updated the pre-processing phase with OpenMP.
* Updated the compilation method, compiling libpangulu.so and libpangulu.a at the same time.
* Updated timing for the reordering phase, the symbolic factorisation phase, and the pre-processing phase.
* Computed GFLOPS for the numeric factorisation phase.
* Updated the compilation method of PanguLU, compiling libpangulu.so and libpangulu.a at the same time.
* Updated timing for the reorder phase, the symbolic factorisation phase, the pre-processing phase.
* Added GFLOPS for the numeric factorisation phase.

#### <p align='left'>Version 3.0.0 (Apr. 02, 2023) </p>

* Used an adaptive method for selecting sparse BLAS in the numeric factorisation phase.
* Added the reordering phase.
* Added the symbolic factorisation phase.
* Added the MC64 algorithm in the reordering phase.
* Added an interface for 64-bit METIS package in the reordering phase.
* Used adaptive selection of sparse BLAS in the numeric factorisation phase.
* Added the reorder phase.
* Added the symbolic factorisation phase.
* Added the mc64 sorting algorithm in the reorder phase.
* Added an interface for the 64-bit metis package in the reorder phase.


#### <p align='left'> Version 2.0.0 (Jul. &thinsp;22, 2022) </p>
@@ -191,7 +195,7 @@ In this example, 6 processes are used to test, the block_size is 4, matrix name
#### <p align='left'>Version 1.0.0 (Oct. 19, 2021) </p>

* Used a rule-based 2D LU factorisation scheduling strategy.
* Used sparse BLAS for floating point calculations on GPUs.
* Used Sparse BLAS for floating point calculations on GPUs.
* Added the pre-processing phase.
* Added the numeric factorisation phase.
* Added the triangular solve phase.
1 change: 1 addition & 0 deletions examples/mmio_highlevel.h
@@ -136,6 +136,7 @@ int mmio_info(sparse_index_t *m, sparse_index_t *n, sparse_pointer_t *nnz, spars
// free tmp space
free(csr_colIdx_tmp);
free(csr_rowIdx_tmp);
free(csr_row_ptr_counter);

return 0;
}
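The added free() releases the temporary row-count buffer that mmio_info allocates while assembling the CSR row pointer, closing a small memory leak. A minimal sketch of the allocate-count-free pattern (hypothetical names, not the actual mmio_highlevel.h code):

```c
#include <stdlib.h>

/* Hypothetical illustration of the pattern: count entries per row into a
 * scratch buffer, prefix-sum it into row_ptr, then release the scratch. */
static int build_csr_row_ptr(size_t m, const size_t *row_idx, size_t nnz,
                             size_t *row_ptr /* length m + 1 */)
{
    size_t *row_counter = (size_t *)calloc(m, sizeof(size_t));
    if (row_counter == NULL)
        return -1;

    for (size_t k = 0; k < nnz; k++)
        row_counter[row_idx[k]]++;

    row_ptr[0] = 0;
    for (size_t i = 0; i < m; i++)
        row_ptr[i + 1] = row_ptr[i] + row_counter[i];

    free(row_counter); /* the scratch buffer leaks if this is omitted */
    return 0;
}
```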
4 changes: 2 additions & 2 deletions examples/run.sh
@@ -7,6 +7,6 @@ if [ ! -f $1 ];then
exit
fi

echo mpirun -np $[$NP] ./$numeric_file -nb $nb -f $Smatrix_name
echo mpirun -np $NP ./$numeric_file -nb $nb -f $Smatrix_name

mpirun -np $[$NP] ./$numeric_file -nb $nb -f $Smatrix_name
mpirun -np $NP ./$numeric_file -nb $nb -f $Smatrix_name
14 changes: 7 additions & 7 deletions make.inc
@@ -1,18 +1,18 @@
COMPILE_LEVEL = -O3

#0201000,GPU_CUDA
CUDA_PATH = /path/to/cuda
CUDA_PATH = /usr/local/cuda
CUDA_INC = -I$(CUDA_PATH)/include
CUDA_LIB = -L$(CUDA_PATH)/lib64 -lcudart -lcusparse
NVCC = nvcc $(COMPILE_LEVEL)
NVCCFLAGS = $(PANGULU_FLAGS) -w -Xptxas -dlcm=cg -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61 $(CUDA_INC) $(CUDA_LIB)

#general
CC = gcc $(COMPILE_LEVEL)
MPICC = mpicc $(COMPILE_LEVEL)
OPENBLAS_INC = -I/path/to/OpenBLAS/include
OPENBLAS_LIB = -L/path/to/OpenBLAS/lib -lopenblas
CC = gcc $(COMPILE_LEVEL) #-fsanitize=address
MPICC = mpicc $(COMPILE_LEVEL) #-fsanitize=address
OPENBLAS_INC = -I$(PATH_TO_OPENBLAS_INC)
OPENBLAS_LIB = -L$(PATH_TO_OPENBLAS_LIB) -lopenblas
MPICCFLAGS = $(OPENBLAS_INC) $(CUDA_INC) $(OPENBLAS_LIB) -fopenmp -lpthread -lm
MPICCLINK = $(OPENBLAS_LIB)
METISFLAGS = -I/path/to/GKlib/include -I/path/to/METIS/include
PANGULU_FLAGS = -DPANGULU_LOG_INFO -DCALCULATE_TYPE_R64 -DPANGULU_MC64 -DMETIS #-DGPU_OPEN -DHT_IS_OPEN
METISFLAGS = -I$(PATH_TO_GKLIB_INC) -I$(PATH_TO_METIS_I64_INC)
PANGULU_FLAGS = -DPANGULU_LOG_INFO -DCALCULATE_TYPE_R64 -DMETIS -DPANGULU_MC64 #-DGPU_OPEN -DHT_IS_OPEN
62 changes: 62 additions & 0 deletions src/languages/pangulu_en.h
@@ -0,0 +1,62 @@
#ifdef PANGULU_EN

#ifdef PANGULU_LOG_ERROR
#define PANGULU_E_NB_IS_ZERO "[PanguLU Error] nb is zero.\n"
#define PANGULU_E_INVALID_HEAP_SELECT "[PanguLU Error] Invalid heap comparing strategy.\n"
#define PANGULU_E_HEAP_FULL "[PanguLU Error] The heap is full on rank " FMT_PANGULU_INT32_T ".\n", rank
#define PANGULU_E_HEAP_EMPTY "[PanguLU Error] The heap is empty on rank " FMT_PANGULU_INT32_T ".\n", rank
#define PANGULU_E_CPU_MEM "[PanguLU Error] Failed to allocate " FMT_PANGULU_INT64_T " byte(s). CPU memory is not enough. %s:" FMT_PANGULU_INT64_T "\n", size, file, line
#define PANGULU_E_ISEND_CSR "[PanguLU Error] pangulu_isend_whole_pangulu_smatrix_csr error. value != s->value.\n"
#define PANGULU_E_ISEND_CSC "[PanguLU Error] pangulu_isend_whole_pangulu_smatrix_csc error. value != s->value_csc.\n"
#define PANGULU_E_ROW_IS_NULL "[PanguLU Error] The matrix has zero row(s).\n"
#define PANGULU_E_ROW_DONT_HAVE_DIA "[PanguLU Error] Row[" FMT_PANGULU_EXBLOCK_IDX "] don't have diagonal element.\n", i
#define PANGULU_E_ERR_IN_RRCL "[PanguLU Error] Invalid numeric factorization task on rank " FMT_PANGULU_INT32_T ". row=" FMT_PANGULU_INT64_T " col=" FMT_PANGULU_INT64_T " level=" FMT_PANGULU_INT64_T "\n", rank, row, col, level
#define PANGULU_E_K_ID "[PanguLU Error] Invalid kernel id " FMT_PANGULU_INT64_T " for numeric factorization.\n", kernel_id
#define PANGULU_E_ASYM "[PanguLU Error] MPI_Barrier_asym error.\n"
#define PANGULU_E_ADD_DIA "[PanguLU Error] pangulu_add_diagonal_element error\n"
#define PANGULU_E_CUDA_MALLOC "[PanguLU Error] Failed to cudaMalloc %lu byte(s). GPU memory is not enough.\n", size
#define PANGULU_E_ROW_IS_ZERO "[PanguLU Error] Invalid input matrix.\n"
#define PANGULU_E_MAX_NULL "[PanguLU Error] pangulu_mc64 internal error. (now_row_max==0)\n"
#define PANGULU_E_WORK_ERR "[PanguLU Error] Invalid kernel id " FMT_PANGULU_INT64_T " for sptrsv.\n", kernel_id
#define PANGULU_E_BIP_PTR_INVALID "[PanguLU Error] Invalid pangulu_block_info pointer.\n"
#define PANGULU_E_BIP_INVALID "[PanguLU Error] Invalid pangulu_block_info.\n"
#define PANGULU_E_BIP_NOT_EMPTY "[PanguLU Error] Block info pool is not empty.\n"
#define PANGULU_E_BIP_OUT_OF_RANGE "[PanguLU Error] PANGULU_BIP index out of range.\n"
#define PANGULU_E_OPTION_IS_NULLPTR "[PanguLU Error] Option struct pointer is NULL. (pangulu_init)\n"
#define PANGULU_E_GSTRF_OPTION_IS_NULLPTR "[PanguLU Error] Option struct pointer is NULL. (pangulu_gstrf)\n"
#define PANGULU_E_GSTRS_OPTION_IS_NULLPTR "[PanguLU Error] Option struct pointer is NULL. (pangulu_gstrs)\n"
#endif // PANGULU_LOG_ERROR

#ifdef PANGULU_LOG_WARNING
#define PANGULU_W_RANK_HEAP_DONT_NULL "[PanguLU Warning] " FMT_PANGULU_INT64_T " task remaining on rank " FMT_PANGULU_INT32_T ".\n", heap->length, rank
#define PANGULU_W_ERR_RANK "[PanguLU Warning] Receiving message error on rank " FMT_PANGULU_INT32_T ".\n", rank
#define PANGULU_W_BIP_INCREASE_SPEED_TOO_SMALL "[PanguLU Warning] PANGULU_BIP_INCREASE_SPEED too small.\n"
#define PANGULU_W_GPU_BIG_BLOCK "[PanguLU Warning] When GPU is open, init_options->nb > 256 and pangulu_inblock_idx isn't pangulu_uint32_t, performance will be limited.\n"
#define PANGULU_W_COMPLEX_FALLBACK "[PanguLU Warning] Calculating complex value on GPU is not supported. Fallback to CPU.\n"
#endif // PANGULU_LOG_WARNING

#ifdef PANGULU_LOG_INFO
#define PANGULU_I_VECT2NORM_ERR "[PanguLU Info] || Ax - B || / || Ax || = %12.4le.\n", error
#define PANGULU_I_CHECK_PASS "[PanguLU Info] Check ------------------------------------- pass\n"
#define PANGULU_I_CHECK_ERROR "[PanguLU Info] Check ------------------------------------ error\n"
#define PANGULU_I_DEV_IS "[PanguLU Info] Device is %s.\n", prop.name
#define PANGULU_I_TASK_INFO "[PanguLU Info] Info of inserting task is: row=" FMT_PANGULU_INT64_T " col=" FMT_PANGULU_INT64_T " level=" FMT_PANGULU_INT64_T " kernel=" FMT_PANGULU_INT64_T ".\n", row, col, task_level, kernel_id
#define PANGULU_I_HEAP_LEN "[PanguLU Info] heap.length=" FMT_PANGULU_INT64_T " heap.capacity=" FMT_PANGULU_INT64_T "\n", heap->length, heap->max_length
#define PANGULU_I_ADAPTIVE_KERNEL_SELECTION_ON "[PanguLU Info] ADAPTIVE_KERNEL_SELECTION ------------- ON\n"
#define PANGULU_I_ADAPTIVE_KERNEL_SELECTION_OFF "[PanguLU Info] ADAPTIVE_KERNEL_SELECTION ------------- OFF\n"
#define PANGULU_I_SYNCHRONIZE_FREE_ON "[PanguLU Info] SYNCHRONIZE_FREE ---------------------- ON\n"
#define PANGULU_I_SYNCHRONIZE_FREE_OFF "[PanguLU Info] SYNCHRONIZE_FREE ---------------------- OFF\n"
#ifdef METIS
#define PANGULU_I_BASIC_INFO "[PanguLU Info] n=" FMT_PANGULU_INT64_T " nnz=" FMT_PANGULU_EXBLOCK_PTR " nb=" FMT_PANGULU_INT32_T " mpi_process=" FMT_PANGULU_INT32_T " preprocessing_thread=%d METIS:%s\n", n, origin_smatrix->rowpointer[n], nb, size, init_options->nthread, (sizeof(idx_t) == 4) ? ("i32") : ((sizeof(idx_t) == 8) ? ("i64") : ("?"))
#else
#define PANGULU_I_BASIC_INFO "[PanguLU Info] n=" FMT_PANGULU_INT64_T " nnz=" FMT_PANGULU_EXBLOCK_PTR " nb=" FMT_PANGULU_INT32_T " mpi_process=" FMT_PANGULU_INT32_T " preprocessing_thread=%d\n", n, origin_smatrix->rowpointer[n], nb, size, init_options->nthread
#endif
#define PANGULU_I_TIME_REORDER "[PanguLU Info] Reordering time is %lf s.\n", elapsed_time
#define PANGULU_I_TIME_SYMBOLIC "[PanguLU Info] Symbolic factorization time is %lf s.\n", elapsed_time
#define PANGULU_I_TIME_PRE "[PanguLU Info] Preprocessing time is %lf s.\n", elapsed_time
#define PANGULU_I_TIME_NUMERICAL "[PanguLU Info] Numeric factorization time is %lf s.\n", elapsed_time //, flop / pangulu_get_spend_time(common) / 1000000000.0
#define PANGULU_I_TIME_SPTRSV "[PanguLU Info] Solving time is %lf s.\n", elapsed_time
#define PANGULU_I_SYMBOLIC_NONZERO "[PanguLU Info] Symbolic nonzero count is " FMT_PANGULU_EXBLOCK_PTR ".\n",*symbolic_nnz
#endif // PANGULU_LOG_INFO

#endif // #ifdef PANGULU_EN
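Each macro in this new language header bundles a printf format string with the trailing arguments it expects in scope, so a call such as printf(PANGULU_I_TIME_REORDER) in pangulu.c expands into a complete formatted statement. A minimal self-contained sketch of the idiom (the DEMO_* names and the FMT_ definition are stand-ins, not part of PanguLU):

```c
#include <stdio.h>

/* Stand-in for the project's format-specifier macros. */
#define FMT_DEMO_INT32_T "%d"

/* Same idiom as pangulu_en.h: the message macro carries both the format
 * string and the variable names it expects to find at the call site. */
#define DEMO_E_HEAP_FULL "[PanguLU Error] The heap is full on rank " FMT_DEMO_INT32_T ".\n", rank

int main(void)
{
    int rank = 3;
    /* Expands to: printf("[PanguLU Error] The heap is full on rank %d.\n", rank); */
    printf(DEMO_E_HEAP_FULL);
    return 0;
}
```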
65 changes: 34 additions & 31 deletions src/pangulu.c
@@ -17,7 +17,7 @@ pangulu_int64_t INDEX_NUM = 0;
pangulu_int32_t pangu_omp_num_threads = 1;

pangulu_int64_t flop = 0;
double time_transport = 0.0;
double time_transpose = 0.0;
double time_isend = 0.0;
double time_receive = 0.0;
double time_getrf = 0.0;
@@ -46,6 +46,9 @@ void pangulu_init(pangulu_exblock_idx pangulu_n, pangulu_exblock_ptr pangulu_nnz
{
MPI_Comm_rank(MPI_COMM_WORLD, &rank);

struct timeval time_start;
double elapsed_time;

pangulu_int32_t size;
MPI_Comm_size(MPI_COMM_WORLD, &size);
pangulu_common *common = (pangulu_common *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_common));
Expand Down Expand Up @@ -111,7 +114,6 @@ void pangulu_init(pangulu_exblock_idx pangulu_n, pangulu_exblock_ptr pangulu_nnz
pangulu_int32_t q = common->q;
pangulu_int32_t nb = common->nb;
MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_start(common);
common->n = pangulu_bcast_n(origin_smatrix->row, 0);
pangulu_int64_t n = common->n;
omp_set_num_threads(init_options->nthread);
@@ -120,16 +122,16 @@ void pangulu_init(pangulu_exblock_idx pangulu_n, pangulu_exblock_ptr pangulu_nnz
#endif
if (rank == 0)
{
#ifdef ADAPTIVE_KERNEL_SELECTION
printf(PANGULU_I_ADAPTIVE_KERNEL_SELECTION_ON);
#else
printf(PANGULU_I_ADAPTIVE_KERNEL_SELECTION_OFF);
#endif
#ifdef SYNCHRONIZE_FREE
printf(PANGULU_I_SYNCHRONIZE_FREE_ON);
#else
printf(PANGULU_I_SYNCHRONIZE_FREE_OFF);
#endif
// #ifdef ADAPTIVE_KERNEL_SELECTION
// printf(PANGULU_I_ADAPTIVE_KERNEL_SELECTION_ON);
// #else
// printf(PANGULU_I_ADAPTIVE_KERNEL_SELECTION_OFF);
// #endif
// #ifdef SYNCHRONIZE_FREE
// printf(PANGULU_I_SYNCHRONIZE_FREE_ON);
// #else
// printf(PANGULU_I_SYNCHRONIZE_FREE_OFF);
// #endif
#ifdef PANGULU_GPU_COMPLEX_FALLBACK_FLAG
printf(PANGULU_W_COMPLEX_FALLBACK);
#endif
@@ -168,14 +170,14 @@ void pangulu_init(pangulu_exblock_idx pangulu_n, pangulu_exblock_ptr pangulu_nnz
block_common->rank_col_length = (block_common->block_length / q + (((block_common->block_length % q) > (rank % q)) ? 1 : 0));
block_common->every_level_length = PANGULU_MIN(block_common->every_level_length, block_common->block_length);
MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_start(common);
pangulu_time_start(&time_start);

pangulu_reorder(block_smatrix,
origin_smatrix,
reorder_matrix);

MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_stop(common);
elapsed_time = pangulu_time_stop(&time_start);
if (rank == 0)
{
printf(PANGULU_I_TIME_REORDER);
@@ -184,8 +186,7 @@ void pangulu_init(pangulu_exblock_idx pangulu_n, pangulu_exblock_ptr pangulu_nnz
calculate_time = 0;

MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_start(common);

pangulu_time_start(&time_start);
if (rank == 0)
{
pangulu_symbolic(block_common,
@@ -194,7 +195,7 @@ void pangulu_init(pangulu_exblock_idx pangulu_n, pangulu_exblock_ptr pangulu_nnz
}

MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_stop(common);
elapsed_time = pangulu_time_stop(&time_start);
if (rank == 0)
{
printf(PANGULU_I_TIME_SYMBOLIC);
@@ -203,30 +204,26 @@ void pangulu_init(pangulu_exblock_idx pangulu_n, pangulu_exblock_ptr pangulu_nnz
pangulu_init_heap_select(0);

MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_start(common);

pangulu_time_start(&time_start);
pangulu_preprocessing(
block_common,
block_smatrix,
reorder_matrix,
init_options->nthread);

#ifdef PANGULU_SPTRSV

#endif
MPI_Barrier(MPI_COMM_WORLD);

pangulu_time_stop(common);
elapsed_time = pangulu_time_stop(&time_start);
if (rank == 0)
{
printf(PANGULU_I_TIME_PRE);
}

pangulu_free(__FILE__, __LINE__, block_smatrix->symbolic_rowpointer);
block_smatrix->symbolic_rowpointer = NULL;
// pangulu_free(__FILE__, __LINE__, block_smatrix->symbolic_rowpointer);
// block_smatrix->symbolic_rowpointer = NULL;

pangulu_free(__FILE__, __LINE__, block_smatrix->symbolic_columnindex);
block_smatrix->symbolic_columnindex = NULL;
// pangulu_free(__FILE__, __LINE__, block_smatrix->symbolic_columnindex);
// block_smatrix->symbolic_columnindex = NULL;

pangulu_free(__FILE__, __LINE__, origin_smatrix);
origin_smatrix = NULL;
Expand All @@ -249,6 +246,9 @@ void pangulu_gstrf(pangulu_gstrf_options *gstrf_options, void **pangulu_handle)
pangulu_block_smatrix *block_smatrix = (*(pangulu_handle_t **)pangulu_handle)->block_smatrix;
pangulu_common *common = (*(pangulu_handle_t **)pangulu_handle)->commmon;

struct timeval time_start;
double elapsed_time;

if (rank == 0)
{
if (gstrf_options == NULL)
@@ -270,13 +270,13 @@ void pangulu_gstrf(pangulu_gstrf_options *gstrf_options, void **pangulu_handle)

pangulu_time_init();
MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_start(common);
pangulu_time_start(&time_start);

pangulu_numeric(block_common,
block_smatrix);

MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_stop(common);
elapsed_time = pangulu_time_stop(&time_start);

if (rank == 0)
{
@@ -306,6 +306,9 @@ void pangulu_gstrs(calculate_type *rhs, pangulu_gstrs_options *gstrs_options, vo
pangulu_block_smatrix *block_smatrix = (*(pangulu_handle_t **)pangulu_handle)->block_smatrix;
pangulu_common *common = (*(pangulu_handle_t **)pangulu_handle)->commmon;

struct timeval time_start;
double elapsed_time;

if (rank == 0)
{
if (gstrs_options == NULL)
@@ -340,14 +343,14 @@ void pangulu_gstrs(calculate_type *rhs, pangulu_gstrs_options *gstrs_options, vo
#ifdef PANGULU_SPTRSV

MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_start(common);
pangulu_time_start(&time_start);

pangulu_sptrsv_L(block_common, block_smatrix);
pangulu_init_heap_select(4);
pangulu_sptrsv_U(block_common, block_smatrix);

MPI_Barrier(MPI_COMM_WORLD);
pangulu_time_stop(common);
elapsed_time = pangulu_time_stop(&time_start);

if (rank == 0)
{
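Throughout pangulu.c, the timing calls now take a caller-owned struct timeval and pangulu_time_stop returns the elapsed seconds, instead of stashing timestamps in pangulu_common. A minimal sketch of what such gettimeofday-based helpers could look like (the demo_* signatures are assumptions inferred from this diff, not the actual PanguLU implementation):

```c
#include <stdio.h>
#include <sys/time.h>

/* Assumed pattern: start() records a timestamp into the caller's struct,
 * stop() returns the seconds elapsed since that timestamp. */
static void demo_time_start(struct timeval *start)
{
    gettimeofday(start, NULL);
}

static double demo_time_stop(struct timeval *start)
{
    struct timeval end;
    gettimeofday(&end, NULL);
    return (double)(end.tv_sec - start->tv_sec) +
           (double)(end.tv_usec - start->tv_usec) * 1e-6;
}

int main(void)
{
    struct timeval time_start;
    demo_time_start(&time_start);
    /* ... phase to be timed, e.g. reordering or numeric factorisation ... */
    double elapsed_time = demo_time_stop(&time_start);
    printf("[PanguLU Info] Phase time is %lf s.\n", elapsed_time);
    return 0;
}
```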