% File src/library/stats/man/aggregate.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2023 R Core Team
% Distributed under GPL 2 or later

\name{aggregate}
\alias{aggregate}
\alias{aggregate.default}
\alias{aggregate.data.frame}
\alias{aggregate.formula}
\alias{aggregate.ts}
\title{Compute Summary Statistics of Data Subsets}
\usage{
aggregate(x, \dots)

\method{aggregate}{default}(x, \dots)

\method{aggregate}{data.frame}(x, by, FUN, \dots, simplify = TRUE, drop = TRUE)

\method{aggregate}{formula}(x, data, FUN, \dots,
          subset, na.action = na.omit)

\method{aggregate}{ts}(x, nfrequency = 1, FUN = sum, ndeltat = 1,
          ts.eps = getOption("ts.eps"), \dots)
}
\description{
  Splits the data into subsets, computes summary statistics for each,
  and returns the result in a convenient form.
}
\arguments{
  \item{x}{an \R object.  For the \code{formula} method a \code{\link{formula}},
    such as \code{y ~ x} or \code{cbind(y1, y2) ~ x1 + x2}, where the
    \code{y} variables are numeric data to be split into groups according
    to the grouping \code{x} variables (usually factors).}
  \item{by}{a list of grouping elements, each as long as the variables
    in the data frame \code{x}, or a formula.  The elements are coerced to factors
    before use.}
  \item{FUN}{a function to compute the summary statistics which can be
    applied to all data subsets.}
  \item{simplify}{a logical indicating whether results should be
    simplified to a vector or matrix if possible.}
  \item{drop}{a logical indicating whether to drop unused combinations
    of grouping values.  The non-default case \code{drop=FALSE} has been
    amended for \R 3.5.0 to drop unused combinations.}
  \item{data}{a data frame (or list) from which the variables in the formula
    should be taken.}
  \item{subset}{an optional vector specifying a subset of observations
    to be used.}
  \item{na.action}{a function which indicates what should happen when
    the data contain \code{NA} values. The default is to only consider
    \emph{complete cases} % != complete.cases (c2991 in R-0-63-patches)
    with respect to the given variables.}
  \item{nfrequency}{new number of observations per unit of time; must
    be a divisor of the frequency of \code{x}.}
  \item{ndeltat}{new fraction of the sampling period between
    successive observations; must be a divisor of the sampling
    interval of \code{x}.}
  \item{ts.eps}{tolerance used to decide if \code{nfrequency} is a
    sub-multiple of the original frequency.}
  \item{\dots}{further arguments passed to or used by methods.}
}
\details{
  \code{aggregate} is a generic function with methods for data frames
  and time series.

  The default method, \code{aggregate.default}, uses the time series
  method if \code{x} is a time series, and otherwise coerces \code{x}
  to a data frame and calls the data frame method.

  \code{aggregate.data.frame} is the data frame method.  If \code{x} is
  not a data frame, it is coerced to one, which must have a non-zero
  number of rows.  Then, each of the variables (columns) in \code{x} is
  split into subsets of cases (rows) of identical combinations of the
  components of \code{by}, and \code{FUN} is applied to each such subset
  with further arguments in \code{\dots} passed to it.  The result is
  reformatted into a data frame containing the variables in \code{by}
  and \code{x}.  The ones arising from \code{by} contain the unique
  combinations of grouping values used for determining the subsets, and
  the ones arising from \code{x} the corresponding summaries for the
  subset of the respective variables in \code{x}.  If \code{simplify} is
  true, summaries are simplified to vectors or matrices if they have a
  common length of one or greater than one, respectively; otherwise,
  lists of summary results according to subsets are obtained.  Rows with
  missing values in any of the \code{by} variables will be omitted from
  the result.  (Note that versions of \R prior to 2.11.0 required
  \code{FUN} to be a scalar function.)

  The formula method provides a standard formula interface to
  \code{aggregate.data.frame}.
  The latter invokes the formula method if \code{by} is a formula,
  in which case \code{aggregate(x, by, FUN)} is the same as
  \code{aggregate(by, x, FUN)} for a data frame \code{x}.

  \code{aggregate.ts} is the time series method, and requires \code{FUN}
  to be a scalar function.  If \code{x} is not a time series, it is
  coerced to one.  Then, the variables in \code{x} are split into
  appropriate blocks of length \code{frequency(x) / nfrequency}, and
  \code{FUN} is applied to each such block, with further (named)
  arguments in \code{\dots} passed to it.  The result returned is a time
  series with frequency \code{nfrequency} holding the aggregated values.
  Note that this makes most sense for a quarterly or yearly result when
  the original series covers a whole number of quarters or years: in
  particular aggregating a monthly series to quarters starting in
  February does not give a conventional quarterly series.

  \code{FUN} is passed to \code{\link{match.fun}}, and hence it can be a
  function or a symbol or character string naming a function.
}
\value{
  For the time series method, a time series of class \code{"ts"} or
  class \code{c("mts", "ts")}.

  For the data frame method, a data frame with columns
  corresponding to the grouping variables in \code{by} followed by
  aggregated columns from \code{x}.  If \code{by} has names, the
  non-empty names are used to label the columns in the results, with
  unnamed grouping variables being named \code{Group.\var{i}} for
  \code{by[[\var{i}]]}.
}

\section{Warning}{
  The first argument of the \code{"formula"} method was named
  \code{formula} rather than \code{x} prior to \R 4.2.0.  Portable uses
  should not name that argument.
}

\author{
  Kurt Hornik, with contributions by Arni Magnusson.
}
\seealso{
  \code{\link{apply}}, \code{\link{lapply}}, \code{\link{tapply}}.
}
\references{
  Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
  \emph{The New S Language}.
  Wadsworth & Brooks/Cole.
}
\examples{
## Compute the averages for the variables in 'state.x77', grouped
## according to the region (Northeast, South, North Central, West) that
## each state belongs to.
aggregate(state.x77, list(Region = state.region), mean)

## Compute the averages according to region and the occurrence of more
## than 130 days of frost.
aggregate(state.x77,
          list(Region = state.region,
               Cold = state.x77[,"Frost"] > 130),
          mean)
## (Note that no state in 'South' is THAT cold.)


## example with character variables and NAs
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
                     v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <- c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")

# and if you want to treat NAs as a group
fby1 <- factor(by1, exclude = "")
fby2 <- factor(by2, exclude = "")
aggregate(x = testDF, by = list(fby1, fby2), FUN = "mean")


## Formulas, one ~ one, one ~ many, many ~ one, and many ~ many:
aggregate(weight ~ feed, data = chickwts, mean)
aggregate(breaks ~ wool + tension, data = warpbreaks, mean)
aggregate(cbind(Ozone, Temp) ~ Month, data = airquality, mean)
aggregate(cbind(ncases, ncontrols) ~ alcgp + tobgp, data = esoph, sum)

## "complete cases" vs. "available cases"
colSums(is.na(airquality))  # NAs in Ozone but not Temp
## the default is to summarize *complete cases*:
aggregate(cbind(Ozone, Temp) ~ Month, data = airquality, FUN = mean)
## to handle missing values *per variable*:
aggregate(cbind(Ozone, Temp) ~ Month, data = airquality, FUN = mean,
          na.action = na.pass, na.rm = TRUE)

## Dot notation:
aggregate(. ~ Species, data = iris, mean)
aggregate(len ~ ., data = ToothGrowth, mean)

## Often followed by xtabs():
ag <- aggregate(len ~ ., data = ToothGrowth, mean)
xtabs(len ~ ., data = ag)

## Formula interface via 'by' (for pipe operations)
ToothGrowth |> aggregate(len ~ ., FUN = mean)

## Compute the average annual approval ratings for American presidents.
aggregate(presidents, nfrequency = 1, FUN = mean)
## Give the summer less weight.
aggregate(presidents, nfrequency = 1,
          FUN = weighted.mean, w = c(1, 1, 0.5, 1))
}
\keyword{category}
\keyword{array}