% File src/library/stats/man/reshape.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2012 R Core Team
% Distributed under GPL 2 or later

\name{reshape}
\alias{reshape}
\title{Reshape Grouped Data}
\description{
  This function reshapes a data frame between \sQuote{wide} format (with
  repeated measurements in separate columns of the same row) and
  \sQuote{long} format (with the repeated measurements in separate
  rows).
}
\usage{
reshape(data, varying = NULL, v.names = NULL, timevar = "time",
        idvar = "id", ids = 1:NROW(data),
        times = seq_along(varying[[1]]),
        drop = NULL, direction, new.row.names = NULL,
        sep = ".",
        split = if (sep == "") {
            list(regexp = "[A-Za-z][0-9]", include = TRUE)
        } else {
            list(regexp = sep, include = FALSE, fixed = TRUE)}
        )

### Typical usage for converting from long to wide format:

# reshape(data, direction = "wide",
#         idvar = "___", timevar = "___", # mandatory
#         v.names = c(___),    # time-varying variables
#         varying = list(___)) # auto-generated if missing

### Typical usage for converting from wide to long format:

### If names of wide-format variables are in a 'nice' format

# reshape(data, direction = "long",
#         varying = c(___), # vector 
#         sep)              # to help guess 'v.names' and 'times'

### To specify long-format variable names explicitly

# reshape(data, direction = "long",
#         varying = ___,  # list / matrix / vector (use with care)
#         v.names = ___,  # vector of variable names in long format
#         timevar, times, # name / values of constructed time variable
#         idvar, ids)     # name / values of constructed id variable

}
\arguments{
  \item{data}{a data frame}
  \item{varying}{names of sets of variables in the wide format that
    correspond to single variables in long format
    (\sQuote{time-varying}).  This is canonically a list of vectors of
    variable names, but it can optionally be a matrix of names, or a
    single vector of names.  In each case, when \code{direction =
    "long"}, the names can be replaced by indices which are interpreted
    as referring to \code{names(data)}.  See \sQuote{Details} for more
    details and options.}
  \item{v.names}{names of variables in the long format that correspond
    to multiple variables in the wide format.  See \sQuote{Details}.}
  \item{timevar}{the variable in long format that differentiates multiple
    records from the same group or individual.  If more than one record
    has the same combination of \code{idvar} and \code{timevar} values,
    the first will be taken (with a warning).}
  \item{idvar}{Names of one or more variables in long format that
    identify multiple records from the same group/individual.  These
    variables may also be present in wide format.}
  \item{ids}{the values to use for a newly created \code{idvar}
    variable in long format.}
  \item{times}{the values to use for a newly created \code{timevar}
    variable in long format.  See \sQuote{Details}.}
  \item{drop}{a vector of names of variables to drop before reshaping.}
  \item{direction}{character string, partially matched to either
    \code{"wide"} to reshape to wide format, or \code{"long"} to reshape
    to long format.}
  \item{new.row.names}{character or \code{NULL}: a non-null value will be
    used for the row names of the result.}
  \item{sep}{A character vector of length 1, indicating a separating
    character in the variable names in the wide format.  This is used for
    guessing \code{v.names} and \code{times} arguments based on the
    names in \code{varying}.  If \code{sep == ""}, the split is just before
    the first numeral that follows an alphabetic character.  This is
    also used to create variable names when reshaping to wide format.}
  \item{split}{A list with three components, \code{regexp},
    \code{include}, and (optionally) \code{fixed}.  This allows an
    extended interface to variable name splitting.  See \sQuote{Details}.}
}
\details{
  Although \code{reshape()} can be used in a variety of contexts, the
  motivating application is data from longitudinal studies, and the
  arguments of this function are named and described in those terms. A
  longitudinal study is characterized by repeated measurements of the
  same variable(s), e.g., height and weight, on each unit being studied
  (e.g., individual persons) at different time points (which are assumed
  to be the same for all units). These variables are called time-varying
  variables. The study may include other variables that are measured
  only once for each unit and do not vary with time (e.g., gender and
  race); these are called time-constant variables.

  A \sQuote{wide} format representation of a longitudinal dataset will
  have one record (row) for each unit, typically with some time-constant
  variables that occupy single columns, and some time-varying variables
  that occupy multiple columns (one column for each time point).  A
  \sQuote{long} format representation of the same dataset will have
  multiple records (rows) for each individual, with the time-constant
  variables being constant across these records and the time-varying
  variables varying across the records.  The \sQuote{long} format
  dataset will have two additional variables: a \sQuote{time} variable
  identifying which time point each record comes from, and an
  \sQuote{id} variable showing which records refer to the same unit.

  The type of conversion (long to wide or wide to long) is determined by
  the \code{direction} argument, which is mandatory unless the
  \code{data} argument is the result of a previous call to
  \code{reshape}.  In that case, the operation can be reversed simply
  using \code{reshape(data)} (the other arguments are stored as
  attributes on the data frame).

  Conversion from long to wide format with \code{direction = "wide"} is
  the simpler operation, and is mainly useful in the context of
  multivariate analysis where data is often expected as a wide-format
  matrix. In this case, the time variable \code{timevar} and id variable
  \code{idvar} must be specified. All other variables are assumed to be
  time-varying, unless the time-varying variables are explicitly
  specified via the \code{v.names} argument.  A warning is issued if
  time-constant variables are not actually constant.

  Each time-varying variable is expanded into multiple variables in the
  wide format.  The names of these expanded variables are generated
  automatically, unless they are specified as the \code{varying}
  argument in the form of a list (or matrix) with one component (or row)
  for each time-varying variable. If \code{varying} is a vector of
  names, it is implicitly converted into a matrix, with one row for each
  time-varying variable. Use this option with care if there are multiple
  time-varying variables, as the ordering (by column, the default in the
  \code{\link{matrix}} constructor) may be unintuitive, whereas the
  explicit list or matrix form is unambiguous.

  %% (FIXME: delete?) If 'direction = "wide"' and no \code{varying} or
  %% \code{v.names} arguments are supplied it is assumed that all variables
  %% except \code{idvar} and \code{timevar} are time-varying.

  Conversion from wide to long with \code{direction = "long"} is the
  more common operation as most (univariate) statistical modeling
  functions expect data in the long format. In the simpler case where
  there is only one time-varying variable, the corresponding columns in
  the wide format input can be specified as the \code{varying} argument,
  which can be either a vector of column names or the corresponding
  column indices. The name of the corresponding variable in the long
  format output combining these columns can be optionally specified as
  the \code{v.names} argument, and the name of the time variable as the
  \code{timevar} argument. The values to use as the time values
  corresponding to the different columns in the wide format can be
  specified as the \code{times} argument.  If \code{v.names} is
  unspecified, the function will attempt to guess \code{v.names} and
  \code{times} from \code{varying} (an explicitly specified \code{times}
  argument is unused in that case).  The default expects variable names
  like \code{x.1}, \code{x.2}, where \code{sep = "."} causes each name
  to be split at the dot, which is dropped.  For names consisting of
  an alphabetic part followed by numeric times, such as \code{x1} and
  \code{x2}, use \code{sep = ""}.

  Multiple time-varying variables can be specified in two ways, either
  with \code{varying} as an atomic vector as above, or as a list (or a
  matrix). The first form is useful (and mandatory) if the automatic
  variable name splitting as described above is used; this requires the
  names of all time-varying variables to be suitably formatted in the
  same manner, and \code{v.names} to be unspecified. If \code{varying}
  is a list (with one component for each time-varying variable) or a
  matrix (one row for each time-varying variable), variable name
  splitting is not attempted, and \code{v.names} and \code{times} will
  generally need to be specified, although they will default to,
  respectively, the first variable name in each set, and sequential
  times.

  Also, guessing is not attempted if \code{v.names} is given explicitly,
  even if \code{varying} is an atomic vector. In that case, the number
  of time-varying variables is taken to be the length of \code{v.names},
  and \code{varying} is implicitly converted into a matrix, with one row
  for each time-varying variable. As in the case of long to wide
  conversion, the matrix is filled up by column, so careful attention needs
  to be paid to the order of variable names (or indices) in
  \code{varying}, which is taken to be like \code{x.1}, \code{y.1},
  \code{x.2}, \code{y.2} (i.e., variables corresponding to the same time
  point need to be grouped together).

  The \code{split} argument should not usually be necessary.  The
  \code{split$regexp} component is passed to either
  \code{\link{strsplit}} or \code{\link{regexpr}}, where the latter is
  used if \code{split$include} is \code{TRUE}, in which case the
  splitting occurs after the first character of the matched string.  In
  the \code{\link{strsplit}} case, the separator is not included in the
  result, and it is possible to specify fixed-string matching using
  \code{split$fixed}.
}
\value{
  The reshaped data frame with added attributes to simplify reshaping
  back to the original form.
}
\seealso{\code{\link{stack}}, \code{\link{aperm}};
  \code{\link{relist}} for reshaping the result of
  \code{\link{unlist}}. \code{\link{xtabs}} and
  \code{\link{as.data.frame.table}} for creating contingency tables and
  converting them back to data frames.
}
\examples{
summary(Indometh) # data in long format

## long to wide (direction = "wide") requires idvar and timevar at a minimum
reshape(Indometh, direction = "wide", idvar = "Subject", timevar = "time")

## can also explicitly specify name of combined variable
wide <- reshape(Indometh, direction = "wide", idvar = "Subject",
                timevar = "time", v.names = "conc", sep= "_")
wide

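## reverse transformation: first using the attributes stored by the
## previous call, then specifying the variables explicitly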
reshape(wide, direction = "long")
reshape(wide, idvar = "Subject", varying = list(2:12),
        v.names = "conc", direction = "long")

## times need not be numeric
df <- data.frame(id = rep(1:4, rep(2,4)),
                 visit = I(rep(c("Before","After"), 4)),
                 x = rnorm(4), y = runif(4))
df
reshape(df, timevar = "visit", idvar = "id", direction = "wide")
## warns that y is really varying
reshape(df, timevar = "visit", idvar = "id", direction = "wide", v.names = "x")


##  unbalanced 'long' data leads to NA fill in 'wide' form
df2 <- df[1:7, ]
df2
reshape(df2, timevar = "visit", idvar = "id", direction = "wide")

## Alternative regular expressions for guessing names:
## sep = "" splits "dose1" into variable "dose" and time 1
df3 <- data.frame(id = 1:4, age = c(40,50,60,50), dose1 = c(1,2,1,2),
                  dose2 = c(2,1,2,1), dose4 = c(3,3,3,3))
reshape(df3, direction = "long", varying = 3:5, sep = "")


## an example that isn't longitudinal data
state.x77 <- as.data.frame(state.x77)
long <- reshape(state.x77, idvar = "state", ids = row.names(state.x77),
                times = names(state.x77), timevar = "Characteristic",
                varying = list(names(state.x77)), direction = "long")

reshape(long, direction = "wide")

reshape(long, direction = "wide", new.row.names = unique(long$state))

## multiple id variables
df3 <- data.frame(school = rep(1:3, each = 4), class = rep(9:10, 6),
                  time = rep(c(1,1,2,2), 3), score = rnorm(12))
wide <- reshape(df3, idvar = c("school", "class"), direction = "wide")
wide
## transform back
reshape(wide)

}
\keyword{manip}