From 7b19829c7f25456c555b9b367c1f596aa07daba9 Mon Sep 17 00:00:00 2001 From: Damien Delforge Date: Sat, 21 Dec 2024 23:34:50 +0100 Subject: [PATCH] Add initial JOSS paper and GitHub workflow for draft PDF generation This commit introduces the initial JOSS paper `paper.md` along with its associated bibliography file `paper.bib`. Additionally, a GitHub Actions workflow has been added to generate draft PDFs using the Open Journals drafting action, facilitating review and iteration of the paper. --- .github/workflows/draft-pdf.yml | 29 +++++ paper.bib | 201 ++++++++++++++++++++++++++++++++ paper.md | 112 ++++++++++++++++++ 3 files changed, 342 insertions(+) create mode 100644 .github/workflows/draft-pdf.yml create mode 100644 paper.bib create mode 100644 paper.md diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 0000000..fff32ae --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,29 @@ +name: Draft PDF +on: + push: + paths: + - paper.md + - paper.bib + - .github/workflows/draft-pdf.yml + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. 
Note, this should be the same directory as the input + # paper.md + path: paper.pdf \ No newline at end of file diff --git a/paper.bib b/paper.bib new file mode 100644 index 0000000..c9420ba --- /dev/null +++ b/paper.bib @@ -0,0 +1,201 @@ + +@book{elsner_singular_1996, + address = {Boston, MA}, + title = {Singular {Spectrum} {Analysis}: {A} {New} {Tool} in {Time} {Series} {Analysis}}, + copyright = {http://www.springer.com/tdm}, + isbn = {978-1-4419-3266-2 978-1-4757-2514-8}, + url = {http://link.springer.com/10.1007/978-1-4757-2514-8}, + language = {en}, + urldate = {2024-12-20}, + publisher = {Springer US}, + author = {Elsner, James B. and Tsonis, Anastasios A.}, + year = {1996}, + doi = {10.1007/978-1-4757-2514-8}, + keywords = {foundation, multivariate statistics, noise, statistics, time series}, +} + +@book{golyandina_singular_2020, + address = {Berlin, Heidelberg}, + series = {{SpringerBriefs} in {Statistics}}, + title = {Singular {Spectrum} {Analysis} for {Time} {Series}}, + copyright = {http://www.springer.com/tdm}, + isbn = {978-3-662-62435-7 978-3-662-62436-4}, + url = {http://link.springer.com/10.1007/978-3-662-62436-4}, + language = {en}, + urldate = {2024-12-20}, + publisher = {Springer}, + author = {Golyandina, Nina and Zhigljavsky, Anatoly}, + year = {2020}, + doi = {10.1007/978-3-662-62436-4}, + keywords = {time series, forecasting, singular value decomposition, Multivariate Singular Spectrum Analysis, signal extraction ., signal processing}, +} + +@article{vautard_singular_1989, + title = {Singular spectrum analysis in nonlinear dynamics, with applications to paleoclimatic time series}, + volume = {35}, + issn = {0167-2789}, + url = {https://www.sciencedirect.com/science/article/pii/0167278989900778}, + doi = {10.1016/0167-2789(89)90077-8}, + abstract = {We distinguish between two dimensions of a dynamical system given by experimental time series. 
Statistical dimension gives a theoretical upper bound for the minimal number of degrees of freedom required to describe the attractor up to the accuracy of the data, taking into account sampling and noise problems. The dynamical dimension is the intrinsic dimension of the attractor and does not depend on the quality of the data. Singular Spectrum Analysis (SSA) provides estimates of the statistical dimension. SSA also describes the main physical phenomena reflected by the data. It gives adaptive spectral filters associated with the dominant oscillations of the system and clarifies the noise characteristics of the data. We apply SSA to four paleoclimatic records. The principal climatic oscillations, and the regime changes in their amplitude are detected. About 10 degrees of freedom are statistically significant in the data. Large noise and insufficient sample length do not allow reliable estimates of the dynamical dimension.}, + number = {3}, + urldate = {2024-12-21}, + journal = {Physica D: Nonlinear Phenomena}, + author = {Vautard, R. and Ghil, M.}, + month = may, + year = {1989}, + pages = {395--424}, +} + +@article{broomhead_extracting_1986, + title = {Extracting qualitative dynamics from experimental data}, + volume = {20}, + issn = {0167-2789}, + url = {https://www.sciencedirect.com/science/article/pii/016727898690031X}, + doi = {10.1016/0167-2789(86)90031-X}, + abstract = {We consider the notion of qualitative information and the practicalities of extracting it from experimental data. Our approach, based on a theorem of Takens, draws on ideas from the generalized theory of information known as singular system analysis due to Bertero, Pike and co-workers. We illustrate our technique with numerical data from the chaotic regime of the Lorenz model.}, + number = {2}, + urldate = {2024-12-21}, + journal = {Physica D: Nonlinear Phenomena}, + author = {Broomhead, D. S. 
and King, Gregory P.}, + month = jun, + year = {1986}, + pages = {217--236}, +} + +@article{allen_monte_1996, + title = {Monte {Carlo} {SSA}: {Detecting} irregular oscillations in the {Presence} of {Colored} {Noise}}, + issn = {1520-0442}, + shorttitle = {Monte {Carlo} {SSA}}, + url = {https://journals.ametsoc.org/view/journals/clim/9/12/1520-0442_1996_009_3373_mcsdio_2_0_co_2.xml}, + abstract = {Singular systems (or singular spectrum) analysis (SSA) was originally proposed for noise reduction in the analysis of experimental data and is now becoming widely used to identify intermittent or modulated oscillations in geophysical and climatic time series. Progress has been hindered by a lack of effective statistical tests to discriminate between potential oscillations and anything but the simplest form of noise, that is, “white” (independent, identically distributed) noise, in which power is independent of frequency. The authors show how the basic formalism of SSA provides a natural test for modulated oscillations against an arbitrary “colored noise” null hypothesis. This test, Monte Carlo SSA, is illustrated using synthetic data in three situations: (i) where there is prior knowledge of the power-spectral characteristics of the noise, a situation expected in some laboratory and engineering applications, or when the “noise” against which the data is being tested consists of the output of an independently specified model, such as a climate model; (ii) where a simple hypothetical noise model is tested, namely, that the data consists only of white or colored noise; and (iii) where a composite hypothetical noise model is tested, assuming some deterministic components have already been found in the data, such as a trend or annual cycle, and it needs to be established whether the remainder may be attributed to noise. 
The authors examine two historical temperature records and show that the strength of the evidence provided by SSA for interannual and interdecadal climate oscillations in such data has been considerably overestimated. In contrast, multiple inter- and subannual oscillatory components are identified in an extended Southern Oscillation index at a high significance level. The authors explore a number of variations on the Monte Carlo SSA algorithm and note that it is readily applicable to multivariate series, covering standard empirical orthogonal functions and multichannel SSA.}, + language = {en}, + urldate = {2024-12-21}, + author = {Allen, Myles R. and Smith, Leonard A.}, + month = dec, + year = {1996}, + journal = {Journal of Climate}, +} + +@article{faouzi_pyts_2020, + title = {pyts: {A} {Python} {Package} for {Time} {Series} {Classification}}, + volume = {21}, + url = {http://jmlr.org/papers/v21/19-763.html}, + number = {46}, + journal = {Journal of Machine Learning Research}, + author = {Faouzi, Johann and Janati, Hicham}, + year = {2020}, + pages = {1--6}, +} + +@misc{hammad_pyactigraphy_2024, + title = {{pyActigraphy}: {Open}-source python package for actigraphy data visualization and analysis}, + url = {https://doi.org/10.5281/zenodo.12163161}, + publisher = {Zenodo}, + author = {Hammad, Grégory and Reyt, Mathilde and Beliy, Nikita and Baillet, Marion and Deantoni, Michele and Lesoinne, Alexia and Muto, Vincenzo and Schmidt, Christina}, + month = jun, + year = {2024}, + doi = {10.5281/zenodo.12163161}, +} + +@misc{khider_pyleoclim_2023, + title = {Pyleoclim: {A} {Python} package for the analysis and visualization of paleoclimate data}, + url = {https://doi.org/10.5281/zenodo.7523617}, + publisher = {Zenodo}, + author = {Khider, Deborah and Emile-Geay, Julien and Zhu, Feng and James, Alexander and Landers, Jordan and Kwan, Myron and Athreya, Pratheek}, + month = jan, + year = {2023}, + doi = {10.5281/zenodo.7523617}, +} + +@misc{halko_finding_2010, + 
title = {Finding structure with randomness: {Probabilistic} algorithms for constructing approximate matrix decompositions}, + shorttitle = {Finding structure with randomness}, + url = {http://arxiv.org/abs/0909.4061}, + doi = {10.48550/arXiv.0909.4061}, + abstract = {Low-rank matrix approximations, such as the truncated singular value decomposition and the rank-revealing QR decomposition, play a central role in data analysis and scientific computing. This work surveys and extends recent research which demonstrates that randomization offers a powerful tool for performing low-rank matrix approximation. These techniques exploit modern computational architectures more fully than classical methods and open the possibility of dealing with truly massive data sets. This paper presents a modular framework for constructing randomized algorithms that compute partial matrix decompositions. These methods use random sampling to identify a subspace that captures most of the action of a matrix. The input matrix is then compressed---either explicitly or implicitly---to this subspace, and the reduced matrix is manipulated deterministically to obtain the desired low-rank factorization. In many cases, this approach beats its classical competitors in terms of accuracy, speed, and robustness. 
These claims are supported by extensive numerical experiments and a detailed error analysis.}, + urldate = {2024-12-21}, + publisher = {arXiv}, + author = {Halko, Nathan and Martinsson, Per-Gunnar and Tropp, Joel A.}, + month = dec, + year = {2010}, + note = {arXiv:0909.4061 [math]}, + keywords = {Mathematics - Numerical Analysis, Mathematics - Probability}, +} + +@book{golyandina_singular_2018, + address = {Berlin, Heidelberg}, + series = {Use {R}!}, + title = {Singular {Spectrum} {Analysis} with {R}}, + copyright = {http://www.springer.com/tdm}, + isbn = {978-3-662-57378-5 978-3-662-57380-8}, + url = {http://link.springer.com/10.1007/978-3-662-57380-8}, + urldate = {2024-12-21}, + publisher = {Springer}, + author = {Golyandina, Nina and Korobeynikov, Anton and Zhigljavsky, Anatoly}, + year = {2018}, + doi = {10.1007/978-3-662-57380-8}, + keywords = {37M10, 68U10, forecasting, image processing, signal processing, singular spectrum analysis, singular value decomposition, time series}, +} + +@inproceedings{seabold_statsmodels_2010, + title = {statsmodels: {Econometric} and statistical modeling with python}, + booktitle = {9th {Python} in {Science} {Conference}}, + author = {Seabold, Skipper and Perktold, Josef}, + year = {2010}, +} + +@inproceedings{mckinney_data_2010, + title = {Data {Structures} for {Statistical} {Computing} in {Python}}, + doi = {10.25080/Majora-92bf1922-00a}, + booktitle = {Proceedings of the 9th {Python} in {Science} {Conference}}, + author = {McKinney, Wes}, + editor = {Walt, Stéfan van der and Millman, Jarrod}, + year = {2010}, + pages = {56 -- 61}, +} + +@article{harris_array_2020, + title = {Array programming with {NumPy}}, + volume = {585}, + url = {https://doi.org/10.1038/s41586-020-2649-2}, + doi = {10.1038/s41586-020-2649-2}, + number = {7825}, + journal = {Nature}, + author = {Harris, Charles R. and Millman, K. Jarrod and Walt, Stéfan J. 
van der and Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J. and Kern, Robert and Picus, Matti and Hoyer, Stephan and Kerkwijk, Marten H. van and Brett, Matthew and Haldane, Allan and Río, Jaime Fernández del and Wiebe, Mark and Peterson, Pearu and Gérard-Marchant, Pierre and Sheppard, Kevin and Reddy, Tyler and Weckesser, Warren and Abbasi, Hameer and Gohlke, Christoph and Oliphant, Travis E.}, + month = sep, + year = {2020}, + note = {Publisher: Springer Science and Business Media LLC}, + pages = {357--362}, +} + +@article{virtanen_scipy_2020, + title = {{SciPy} 1.0: {Fundamental} {Algorithms} for {Scientific} {Computing} in {Python}}, + volume = {17}, + doi = {10.1038/s41592-019-0686-2}, + journal = {Nature Methods}, + author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and Haberland, Matt and Reddy, Tyler and Cournapeau, David and Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and Bright, Jonathan and van der Walt, Stéfan J. and Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and Kern, Robert and Larson, Eric and Carey, C J and Polat, İlhan and Feng, Yu and Moore, Eric W. and VanderPlas, Jake and Laxalde, Denis and Perktold, Josef and Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and Harris, Charles R. and Archibald, Anne M. and Ribeiro, Antônio H. and Pedregosa, Fabian and van Mulbregt, Paul and {SciPy 1.0 Contributors}}, + year = {2020}, + pages = {261--272}, +} + +@article{pedregosa_scikit-learn_2011, + title = {Scikit-learn: {Machine} {Learning} in {Python}}, + volume = {12}, + journal = {Journal of Machine Learning Research}, + author = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. 
and Perrot, M. and Duchesnay, E.}, + year = {2011}, + pages = {2825--2830}, +} + +@book{durbin_time_2012, + title = {Time {Series} {Analysis} by {State} {Space} {Methods}}, + isbn = {978-0-19-964117-8}, + url = {https://doi.org/10.1093/acprof:oso/9780199641178.001.0001}, + abstract = {This book presents a comprehensive treatment of the state space approach to time series analysis. The distinguishing feature of state space time series models is that observations are regarded as being made up of distinct components such as trend, seasonal, regression elements and disturbance elements, each of which is modelled separately. The techniques that emerge from this approach are very flexible. Part I presents a full treatment of the construction and analysis of linear Gaussian state space models. The methods are based on the Kalman filter and are appropriate for a wide range of problems in practical time series analysis. The analysis can be carried out from both classical and Bayesian perspectives. Part I then presents illustrations to real series and exercises are provided for a selection of chapters. Part II discusses approximate and exact approaches for handling broad classes of non-Gaussian and nonlinear state space models. Approximate methods include the extended Kalman filter and the more recently developed unscented Kalman filter. The book shows that exact treatments become feasible when simulation-based methods such as importance sampling and particle filtering are adopted. 
Bayesian treatments based on simulation methods are also explored.}, + publisher = {Oxford University Press}, + author = {Durbin, James and Koopman, Siem Jan}, + month = may, + year = {2012}, + doi = {10.1093/acprof:oso/9780199641178.001.0001}, + edition = {2}, +} diff --git a/paper.md b/paper.md new file mode 100644 index 0000000..970b759 --- /dev/null +++ b/paper.md @@ -0,0 +1,112 @@ +--- +title: 'SSALib: a Python Library for Timeseries Decomposition using Singular Spectrum Analysis' +tags: + - Python + - time series + - singular spectrum analysis + - singular value decomposition + - time series decomposition +authors: + - name: Damien Delforge + orcid: 0000-0002-3552-9444 + corresponding: true + affiliation: "1, 2" + - name: Alice Alonso + orcid: 0000-0001-8869-6759 + affiliation: "1, 3" + - name: Niko Speybroeck + affiliation: "2" + +affiliations: + - name: AD Scientific Consulting & Environmental Systems Analytics (ADSCIAN), Brussels, Belgium. + index: 1 + - name: University of Louvain (UCLouvain), Institute of Health & Society, Brussels, Belgium. + index: 2 + ror: 02495e989 + - name: University of Louvain (UCLouvain), Earth & Life Institute, Louvain-la-Neuve, Belgium. + index: 3 + ror: 02495e989 +date: 21 December 2024 +bibliography: paper.bib +--- + +# Summary & Statement of Needs + +Singular Spectrum Analysis (SSA) is a method developed in the 1980s for +analyzing and decomposing time-series data +[@broomhead_extracting_1986; @vautard_singular_1989]. Using time-delayed +trajectories or covariance matrices, SSA takes advantage of temporal +dependencies to identify structured components such as trends and cycles +[@elsner_singular_1996; @golyandina_singular_2020]. Time-series decomposition +has various applications, including denoising, filtering, signal modeling, +interpolation (or gap filling), and extrapolation (or forecasting). 
+ +SSA is a non-parametric method that allows for the decomposition and analysis of +time series without prior knowledge of their underlying dynamics. Another +advantage of SSA is the ability to extract nonlinear trends and phase- or +amplitude-modulated cycles. The Singular Spectrum Analysis Library (`ssalib`) +is a Python package that simplifies SSA implementation and visualization +through an easy-to-use API, operating on time series as `numpy.ndarray` +[@harris_array_2020] or `pandas.Series` [@mckinney_data_2010] objects, and +requiring minimal knowledge of linear algebra. It uses decomposition algorithms +from robust Python scientific packages like `numpy` [@harris_array_2020], +`scipy` [@virtanen_scipy_2020], and `sklearn` [@pedregosa_scikit-learn_2011]. +SSALib also incorporates the Monte Carlo SSA approach [@allen_monte_1996] for +identifying significant components by comparison to randomly generated data +(i.e., surrogate data), relying on `statsmodels` [@seabold_statsmodels_2010] for +fitting autoregressive processes and generating the surrogate data. + +The basic Singular Spectrum Analysis (SSA) algorithm for univariate time series, +as described by @broomhead_extracting_1986 and @vautard_singular_1989, requires +a linear algebra library and only a few lines of code for implementation. +However, developing dedicated SSA software offers several advantages. As time +series analysis has become increasingly common across various fields, SSA +software needs to address the needs of a broader audience focused on practical +applications rather than just technical implementation. In addition, SSA has +evolved beyond being a single method and has transformed into a modular +analytical framework, consisting of interchangeable steps that can be combined +into multiple variants. Consequently, both experts and newcomers would benefit +from SSA software that allows for configurable analyses, saving time in the +process. 
Moreover, SSA's empirical nature relies heavily on data visualization. +This makes the implementation of software essential for providing users with +established visualization features. + +@golyandina_singular_2020 mention some existing software dedicated to +SSA, such as the GUI-based SSA-MTM toolkit, Caterpillar-SSA software, and the +Rssa R package. In Python, most SSA implementations are basic and part of large +software packages, including `pyts` [@faouzi_pyts_2020], `pyleoclim` +[@khider_pyleoclim_2023], or `pyactigraphy` [@hammad_pyactigraphy_2024], or are +available primarily as unmaintained and untested projects. To address this gap, +`ssalib` was developed as a fully dedicated SSA Python package +that is both tested and suitable for teaching and research purposes. + +# Technical Details + +The Singular Spectrum Analysis (SSA) approach consists of three major steps +[@golyandina_singular_2020]: (1) Time-Delayed Matrix Construction, (2) +Matrix Decomposition, and (3) Components Grouping and Reconstruction. +The `ssalib` package implements the approaches of @broomhead_extracting_1986, referred +to as BK, and @vautard_singular_1989, referred to as VG. These variants differ +in the matrices they utilize during the first step. The BK approach is based on +a time-delayed trajectory matrix with dimensions depending on the window +parameter and the number of unit lags. This matrix consists of lagged copies of +time series segments of a specified length, forming a Hankel matrix in which the +values along each anti-diagonal are equal. In contrast, the VG approach captures time +dependencies by constructing a special type of covariance matrix that has a +Toeplitz structure, meaning that the values along each of its diagonals are identical. + +Regarding Step 2, `ssalib` relies on Singular Value Decomposition (SVD) with +methods implemented in the NumPy, SciPy, and Scikit-learn libraries. In +particular, scikit-learn features a randomized SVD algorithm for efficient +decomposition [@halko_finding_2010]. 
Step 3 involves visualizations created with +Matplotlib, drawing inspiration from the Rssa R package +[@golyandina_singular_2018]. + +Significance testing is based on the work of @allen_monte_1996. In +`ssalib`, an autoregressive (AR) process of a specified maximum order is fitted +relying on a state space modeling framework [@durbin_time_2012] and utilizing +the `statsmodels` library [@seabold_statsmodels_2010]. The AR random surrogates +are also generated using `statsmodels`, and their time-delayed matrices are +projected onto the singular system of the original time series. This comparison +of the original distribution of singular values with the many random +projections allows for the inference of significance. \ No newline at end of file