Skip to content

Commit

Permalink
added seriate for data.frame
Browse files Browse the repository at this point in the history
  • Loading branch information
mhahsler committed Jun 30, 2021
1 parent 24dc16e commit 3b785a9
Show file tree
Hide file tree
Showing 9 changed files with 109 additions and 33 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: seriation
Type: Package
Title: Infrastructure for Ordering Objects Using Seriation
Version: 1.2-9.1
Date: 20xx-xx-xx
Version: 1.3.0
Date: 2021-06-29
Authors@R: c(
person("Michael", "Hahsler", role = c("aut", "cre", "cph"),
email = "mhahsler@lyle.smu.edu"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ S3method(permute, dendrogram)
S3method(seriate, dist)
S3method(seriate, matrix)
S3method(seriate, array)
S3method(seriate, data.frame)

S3method(reorder, hclust)

Expand Down
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# seriation 1.2-9.1 (00/00/0000)
# seriation 1.3-0 (06/29/2021)

## Changes
* Plotting
Expand All @@ -22,6 +22,7 @@
- colors are now more consistent and all have bias and power.
* Seriation methods
- seriate for matrix has now method "Heatmap".
- seriate now accepts data.frames and used method "heatmap" as the default.
- added seriation method "Reverse" for reverse identity order.
* Permutation
- permute for matrix-like objects gained parameter margin.
Expand Down
14 changes: 11 additions & 3 deletions R/permute.R
Original file line number Diff line number Diff line change
Expand Up @@ -138,18 +138,26 @@ permute.hclust <- function(x, order, ...) {
if (!inherits(order, "ser_permutation"))
order <- ser_permutation(order)

# DEPRECATED: Compatibility with old permutation for data.frame
if (is.data.frame(x) && is.null(margin) && length(order) == 1) {
message("permute for data.frames with a single seriation order is now deprecated. Specify the margin as follows: 'permute(x, order, margin = 1)'")
margin <- 1
}

# create complete order object for margin
if (!is.null(margin)) {
if (length(margin) != 1 || !(margin %in% seq(ndim(x))))
stop("margin needs to be a single numeric index.")
stop("margin needs to be a single integer index indicating the dimension to permute.")

margin <- as.integer(margin)

if (length(order) != 1 && length(order) != ndim(x))
stop("order needs to contain either orders for all dimensions or just a single order.")
stop("order needs to contain either orders for all dimensions or just a single order for the selected margin.")

if (length(order) == 1)
if (length(order) == 1) {
length(order) <- ndim(x)
order[[margin]] <- order[[1]]
}

# set all other dimensions to identity.
for (i in seq(ndim(x))) {
Expand Down
2 changes: 0 additions & 2 deletions R/seriate.array.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
control = NULL,
margin = seq(ndim(x)),
datatype = "array",
defmethod,
...) {
## add ... to control
control <- c(control, list(...))
Expand Down Expand Up @@ -63,7 +62,6 @@ seriate.array <- function(x,
control,
margin,
datatype = "array",
defmethod = NA,
...)
## we currently have no method and therefore also no default method!

Expand Down
31 changes: 31 additions & 0 deletions R/seriate.data.frame.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#######################################################################
# seriation - Infrastructure for seriation
# Copyright (C) 2011 Michael Hahsler, Christian Buchta and Kurt Hornik
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

## seriate data.frame

seriate.data.frame <- function(x,
method = "Heatmap",
control = NULL,
margin = c(1, 2),
...)
.seriate_array_helper(as.matrix(x),
method,
control,
margin,
datatype = "matrix",
...)
2 changes: 0 additions & 2 deletions R/seriate.matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,8 @@ seriate.matrix <- function(x,
control,
margin,
datatype = "matrix",
defmethod = "BEA_TSP",
...)


seriate_matrix_identity <- function(x, control) {
control <- .get_parameters(control, NULL)
lapply(dim(x), seq)
Expand Down
52 changes: 31 additions & 21 deletions man/seriate.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
\alias{seriate.matrix}
\alias{seriate.dist}
\alias{seriate.array}
\alias{seriate.data.frame}
\alias{LS_insert}
\alias{LS_swap}
\alias{LS_reverse}
Expand All @@ -16,39 +17,46 @@ data array (k-way k-mode data).
\usage{
\method{seriate}{dist}(x, method = "Spectral", control = NULL, \ldots)

\method{seriate}{array}(x, method = "PCA", control = NULL,
margin = seq(length(dim(x))), \ldots)

\method{seriate}{matrix}(x, method = "PCA", control = NULL,
margin = c(1,2), \ldots)

\method{seriate}{array}(x, method = "PCA", control = NULL,
margin = seq(length(dim(x))), \ldots)
\method{seriate}{data.frame}(x, method = "Heatmap", control = NULL,
margin = c(1,2), \ldots)
}
\arguments{
\item{x}{the data.}
\item{method}{ a character string with the name of the seriation method
(default: varies by data type).}
\item{control}{ a list of control options passed on to the seriation
algorithm.}
\item{margin}{ a vector giving the margins to be seriated. For matrix,
\item{margin}{ a vector giving the margin indices (dimensions) to be seriated. For example, for a matrix,
\code{1} indicates rows, \code{2} indicates columns, \code{c(1,2)}
indicates rows and columns. For array, margin gets a vector with
the dimensions to seriate.}
indicates rows and columns.}
\item{\ldots}{ further arguments are added to the \code{control} list.}
}
\details{
Seriation methods are available via a registry.
See \code{\link{list_seriation_methods}} for help.
See \code{\link{list_seriation_methods}} for help. Data frames are just a different reprentation of a
matrix and all seriation methods for matrix can be used for data frames. The default method for data frames is
heatmap seriation which calculates distances between rows and between columns and then applies seriation on these using
hierarchical clustering and optimal leaf ordering (OLO).

Many seriation methods (heuristically) optimize (minimize or maximize)
an objective function.
The value of the function for a given seriation can be calculated using
\code{\link{criterion}}. In this manual page we only state the measure
which is optimized (using \bold{bold font}).
A definition of the measures can be found in the
\code{\link{criterion}}. In this manual page, we only include the measure
which is optimized by each method using \bold{bold font}.
A definition of these measures can be found in the
\code{\link{criterion}} manual page.

Two-way two-mode data has to be provided as a dist object (not as a symmetric
matrix). Similarities have to be transformed in a suitable way into
dissimilarities. Currently the following methods are implemented for dist
\bold{Seriation methods by type of data:}

\bold{dist:} One-mode two-way data has to be provided as a dist object (not as a symmetric
matrix). Similarities have to be transformed into
dissimilarities. Currently, the following methods are implemented
(for a more detailed description and an experimental comparison see
\href{https://michael.hahsler.net/research/misc/EJOR_seriation_2016.pdf}{Hahsler (2017)}):

Expand Down Expand Up @@ -270,7 +278,8 @@ The DendSer code has to be first registered. A detailed description can be found

}

Two-way two mode data are general positive matrices.
\bold{matrix or data.frame:} Two-mode two-way data are general matrices.
Some methods also require that the matrix is positive.
Currently the following methods are implemented for matrix:
\describe{
\item{\code{"BEA"}}{Bond Energy Algorithm (BEA; McCormick 1972).
Expand Down Expand Up @@ -304,6 +313,8 @@ In \code{control} as element \code{"method"} a TSP solver method can be
specified (see package \pkg{TSP}).
}
\item{\code{"Heatmap"}}{ Heatmap seriation calculates distances between rows and between columns and then applies seriation on these using hierarchical clustering and optimal leaf ordering (method \code{"OLO"} for \code{dist}). }
\item{\code{"PCA"}, \code{"PCA_angle"}}{ Principal component analysis.
Uses the projection of the data on its first principal component to
Expand Down Expand Up @@ -342,7 +353,7 @@ Fast Optimal Leaf Ordering for Hierarchical Clustering.
Barnard, S. T., A. Pothen, and H. D. Simon (1993): A Spectral Algorithm for Envelope Reduction of Sparse Matrices.
\emph{In Proceedings of the 1993
ACM/IEEE Conference on Supercomputing,} 493--502. Supercomputing '93.
New York, NY, USA: ACM. \doi{10.1109/SUPERC.1993.1263497}
New York, NY, USA: ACM. \url{https://ieeexplore.ieee.org/document/1263497}

Bezdek, J.C. and Hathaway, R.J. (2002): VAT: a tool for visual assessment of (cluster) tendency. \emph{Proceedings of the 2002 International Joint
Conference on Neural Networks (IJCNN '02)}, Volume: 3, 2225--2230.
Expand Down Expand Up @@ -418,8 +429,7 @@ Tsafrir, D., Tsafrir, I., Ein-Dor, L., Zuk, O., Notterman, D.A. and Domany, E. (
\author{Michael Hahsler}
\examples{
# Show available seriation methods (for dist and matrix)
show_seriation_methods("dist")
show_seriation_methods("matrix")
list_seriation_methods()

# Seriate as distance matrix (for the iris dataset)
data("iris")
Expand Down Expand Up @@ -448,20 +458,20 @@ x <- as.matrix(iris[-5])
x <- scale(x, center = FALSE)

# The iris flowers are ordered by species in the data set
pimage(x, main = "original data")
pimage(x, main = "original data", prop = FALSE)
criterion(x)

# Apply some methods
order <- seriate(x, method = "BEA_TSP")
pimage(x, order, main = "TSP to optimize ME")
pimage(x, order, main = "TSP to optimize ME", prop = FALSE)
criterion(x, order)

order <- seriate(x, method = "PCA")
pimage(x, order, main = "First principal component")
pimage(x, order, main = "First principal component", prop = FALSE)
criterion(x, order)

order <- seriate(x, method = "heatmap")
pimage(x, order, main = "Heatmap seriation")
pimage(x, order, main = "Heatmap seriation", prop = FALSE)
criterion(x, order)

# create a heatmap seriation manually by calculating
Expand All @@ -470,7 +480,7 @@ order <- c(
seriate(dist(x), method = "OLO"),
seriate(dist(t(x)), method = "OLO")
)
pimage(x, order, main = "heatmap")
pimage(x, order, main = "Heatmap seriation", prop = FALSE)
criterion(x, order)
}
\keyword{optimize}
Expand Down
33 changes: 31 additions & 2 deletions tests/testthat/test-seriate.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ x <- matrix(c(
), byrow = TRUE, ncol = 5,
dimnames = list(1:4, LETTERS[1:5]))



d <- dist(x)

context("seriate_dist")
Expand Down Expand Up @@ -50,7 +48,38 @@ os <- sapply(methods, function(m) {
expect_true(all(sapply(os, length) == 2L))
expect_true(all(sapply(os, FUN = function(o2) sapply(o2, length)) == c(4L, 5L)))

x_p <- permute(x, os[[1]])
expect_equal(x_p, x[get_order(os[[1]], 1), get_order(os[[1]], 2)])

# TODO: check labels
#get_order(os$Identity, 1)
#get_order(os$Identity, 2)
#get_order(os$Reverse, 2)

context("seriate with margin")

methods <- list_seriation_methods(kind = "matrix")
os <- sapply(methods, function(m) {
cat("Doing ", m, " ... ")
tm <- system.time(o <- seriate(x, method = m, margin = 2))
cat("took ", tm[3],"s.\n")
o
}, simplify = FALSE)
expect_true(all(sapply(os, length) == 1L))
expect_true(all(sapply(os, FUN = function(o2) sapply(o2, length)) == c(5L)))

x_p <- permute(x, os[[1]], margin = 2)
expect_equal(x_p, x[, get_order(os[[1]])])

context("seriate data.frame")
df <- as.data.frame(x)
o <- seriate(df)
permute(df, o)

seriate(df, method = "PCA")

o <- seriate(df, margin = 1)
## DEPRECATED: results in a message
permute(df, o)

permute(df, o, margin = 1)

0 comments on commit 3b785a9

Please sign in to comment.