% File src/library/stats/man/dist.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2022 R Core Team
% Distributed under GPL 2 or later

\name{dist}
\alias{dist}
\alias{print.dist}
\alias{format.dist}
\alias{labels.dist}
\alias{as.matrix.dist}
\alias{as.dist}
\alias{as.dist.default}
\concept{dissimilarity}
\title{Distance Matrix Computation}
\description{
  This function computes and returns the distance matrix obtained by
  applying the specified distance measure to the rows of a data
  matrix.
}
\usage{
dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2)

as.dist(m, diag = FALSE, upper = FALSE)
\method{as.dist}{default}(m, diag = FALSE, upper = FALSE)

\method{print}{dist}(x, diag = NULL, upper = NULL,
      digits = getOption("digits"), justify = "none",
      right = TRUE, \dots)

\method{as.matrix}{dist}(x, \dots)
}
\arguments{
  \item{x}{a numeric matrix, data frame or \code{"dist"} object.}
  \item{method}{the distance measure to be used.  This must be one of
    \code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
    \code{"canberra"}, \code{"binary"} or \code{"minkowski"}.
    Any unambiguous substring can be given.}
  \item{diag}{logical value indicating whether the diagonal of the
    distance matrix should be printed by \code{print.dist}.}
  \item{upper}{logical value indicating whether the upper triangle of the
    distance matrix should be printed by \code{print.dist}.}
  \item{p}{The power of the \I{Minkowski} distance.}
  \item{m}{An object with distance information to be converted to a
    \code{"dist"} object.  For the default method, a \code{"dist"}
    object, or a matrix (of distances) or an object which can be coerced
    to such a matrix using \code{\link{as.matrix}()}.  (Only the lower
    triangle of the matrix is used, the rest is ignored).}
  \item{digits, justify}{passed to \code{\link{format}} inside of
    \code{print()}.}
  \item{right, \dots}{further arguments, passed to other methods.}
}
\details{
  Available distance measures are (written for two vectors \eqn{x} and
  \eqn{y}):
  \describe{
    \item{\code{euclidean}:}{Usual distance between the two vectors (2
      norm aka \eqn{L_2}), \eqn{\sqrt{\sum_i (x_i - y_i)^2}}{sqrt(sum((x_i - y_i)^2))}.}

    \item{\code{maximum}:}{Maximum absolute difference between
      corresponding components of \eqn{x} and \eqn{y} (supremum norm).}

    \item{\code{manhattan}:}{Absolute distance between the two vectors (1 norm aka \eqn{L_1}).}

    \item{\code{canberra}:}{
      %% till 2017-07-15: \eqn{\sum_i |x_i - y_i| / |x_i + y_i|}{sum(|x_i - y_i| / |x_i + y_i|)}.
      \eqn{\sum_i |x_i - y_i| / (|x_i| + |y_i|)}{sum(|x_i - y_i| / (|x_i| + |y_i|))}.
      Terms with zero numerator and denominator are omitted from the sum
      and treated as if the values were missing.

      This is intended for non-negative values (e.g., counts), in which
      case the denominator can be written in various equivalent ways.
      Originally, \R used \eqn{x_i + y_i}, then from 1998 to 2017,
      \eqn{|x_i + y_i|}, and then the correct \eqn{|x_i| + |y_i|}.
    }

    \item{\code{binary}:}{(aka \emph{asymmetric binary}): The vectors
      are regarded as binary bits, so non-zero elements are \sQuote{on}
      and zero elements are \sQuote{off}.  The distance is the
      \emph{proportion} of bits in which only one is on amongst those in
      which at least one is on.
      This is also called \dQuote{\I{Jaccard}} distance in some contexts.
      Here, two all-zero observations have distance \code{0}, whereas in
      traditional \I{Jaccard} definitions, the distance would be undefined for
      that case and give \code{\link{NaN}} numerically.}

    \item{\code{minkowski}:}{The \eqn{p} norm, the \eqn{p}-th root of the
      sum of the \eqn{p}-th powers of the absolute differences of the
      components.}
  }

  Missing values are allowed, and are excluded from all computations
  involving the rows within which they occur.
  Further, when \code{Inf} values are involved, all pairs of values are
  excluded when their contribution to the distance gave \code{NaN} or
  \code{NA}.
  If some columns are excluded in calculating a Euclidean, Manhattan,
  Canberra or \I{Minkowski} distance, the sum is scaled up proportionally to
  the number of columns used.  If all pairs are excluded when
  calculating a particular distance, the value is \code{NA}.

  The \code{"dist"} method of \code{as.matrix()} and \code{as.dist()}
  can be used for conversion between objects of class \code{"dist"}
  and conventional distance matrices.

  \code{as.dist()} is a generic function.  Its default method handles
  objects inheriting from class \code{"dist"}, or coercible to matrices
  using \code{\link{as.matrix}()}.  Support for classes representing
  distances (also known as dissimilarities) can be added by providing an
  \code{\link{as.matrix}()} or, more directly, an \code{as.dist} method
  for such a class.
}
\value{
  \code{dist} returns an object of class \code{"dist"}.

  The object is the lower triangle of the distance matrix, stored by
  columns in a vector, say \code{do}.  If \code{n} is the number of
  observations, i.e., \code{n <- attr(do, "Size")}, then
  for \eqn{i < j \le n}, the dissimilarity between rows \eqn{i} and
  \eqn{j} is \code{do[n*(i-1) - i*(i-1)/2 + j-i]}.
  The length of the vector is \eqn{n*(n-1)/2}, i.e., of order \eqn{n^2}.

  The object has the following attributes (besides \code{"class"} equal
  to \code{"dist"}):
  \item{Size}{integer, the number of observations in the dataset.}
  \item{Labels}{optionally, contains the labels, if any, of the
    observations of the dataset.}
  \item{Diag, Upper}{logicals corresponding to the arguments \code{diag}
    and \code{upper} above, specifying how the object should be printed.}
  \item{call}{optionally, the \code{\link{call}} used to create the
    object.}
  \item{method}{optionally, the distance method used; resulting from
    \code{\link{dist}()}, the (\code{\link{match.arg}()}ed) \code{method}
    argument.}
}
\references{
  Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
  \emph{The New S Language}.
  Wadsworth & Brooks/Cole.

  Mardia, K. V., Kent, J. T. and Bibby, J. M. (1979)
  \emph{Multivariate Analysis.} Academic Press.

  Borg, I. and Groenen, P. (1997)
  \emph{Modern Multidimensional Scaling.  Theory and Applications.}
  Springer.
}
\seealso{
  \code{\link[cluster]{daisy}} in the \CRANpkg{cluster} package with more
  possibilities in the case of \emph{mixed} (continuous / categorical)
  variables.
  \code{\link{hclust}}.
}
\examples{
## plot() for the dendrogram further below comes from graphics
require(graphics)

## 5 observations (rows) on 20 variables (columns)
x <- matrix(rnorm(100), nrow = 5)
dist(x)
dist(x, diag = TRUE)
dist(x, upper = TRUE)
## Round trip: "dist" object -> full matrix -> "dist" object
m <- as.matrix(dist(x))
d <- as.dist(m)
stopifnot(d == dist(x))

## Use correlations between variables "as distance"
dd <- as.dist((1 - cor(USJudgeRatings))/2)
round(1000 * dd) # (prints more nicely)
plot(hclust(dd)) # to see a dendrogram of clustered variables

## example of binary and canberra distances.
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
dist(rbind(x, y), method = "binary")
## answer 0.4 = 2/5 :
## at least one bit is on in 5 positions, exactly one bit in 2 of them
dist(rbind(x, y), method = "canberra")
## answer 2 * (6/5) :
## the 0/0 term (2nd position) is omitted, and the sum 2 of the
## remaining 5 terms is scaled up by 6/5

## To find the names
labels(eurodist)

## Examples involving "Inf" :
## 1)
x[6] <- Inf
(m2 <- rbind(x, y))
dist(m2, method = "binary")   # warning, answer 0.5 = 2/4
## These all give "Inf":
stopifnot(Inf == dist(m2, method =  "euclidean"),
          Inf == dist(m2, method =  "maximum"),
          Inf == dist(m2, method =  "manhattan"))
##  "Inf" is same as very large number:
##  (note cbind(): here the 6 elements are the observations and the
##   two vectors are the variables; print() shows the compared value)
x1 <- x; x1[6] <- 1e100
stopifnot(dist(cbind(x, y), method = "canberra") ==
    print(dist(cbind(x1, y), method = "canberra")))

## 2)
y[6] <- Inf #-> 6-th pair is excluded
dist(rbind(x, y), method = "binary"  )   # warning; 0.5
dist(rbind(x, y), method = "canberra"  ) # 3
dist(rbind(x, y), method = "maximum")    # 1
dist(rbind(x, y), method = "manhattan")  # 2.4
}
\keyword{multivariate}
\keyword{cluster}