% File src/library/stats/man/optim.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2021 R Core Team
% Distributed under GPL 2 or later

\name{optim}
\alias{optim}
\alias{optimHess}
\concept{minimization}
\concept{maximization}
\title{General-purpose Optimization}
\description{
  General-purpose optimization based on \I{Nelder}--\I{Mead}, quasi-Newton
  and conjugate-gradient algorithms. It includes an option for
  box-constrained optimization and simulated annealing.
}
\usage{
optim(par, fn, gr = NULL, \dots,
      method = c("Nelder-Mead", "BFGS", "CG", "L-BFGS-B", "SANN", "Brent"),
      lower = -Inf, upper = Inf,
      control = list(), hessian = FALSE)

optimHess(par, fn, gr = NULL, \dots, control = list())
}
\arguments{
  \item{par}{Initial values for the parameters to be optimized over.}
  \item{fn}{A function to be minimized (or maximized), with first
    argument the vector of parameters over which minimization is to take
    place. It should return a scalar result.}
  \item{gr}{A function to return the gradient for the \code{"BFGS"},
    \code{"CG"} and \code{"L-BFGS-B"} methods. If it is \code{NULL}, a
    finite-difference approximation will be used.

    For the \code{"SANN"} method it specifies a function to generate a new
    candidate point. If it is \code{NULL} a default Gaussian Markov kernel
    is used.}
  \item{\dots}{Further arguments to be passed to \code{fn} and \code{gr}.}
  \item{method}{The method to be used. See \sQuote{Details}. Can be abbreviated.}
  \item{lower, upper}{Bounds on the variables for the \code{"L-BFGS-B"}
    method, or bounds in which to \emph{search} for method \code{"Brent"}.}
  \item{control}{a \code{\link{list}} of control parameters. See \sQuote{Details}.}
  \item{hessian}{Logical. Should a numerically differentiated Hessian
    matrix be returned?}
}
\details{
  Note that arguments after \code{\dots} must be matched exactly.

  By default \code{optim} performs minimization, but it will maximize if
  \code{control$fnscale} is negative. \code{optimHess} is an auxiliary
  function to compute the Hessian at a later stage if
  \code{hessian = TRUE} was forgotten.

  The default method is an implementation of that of
  \bibcite{Nelder and Mead (1965)}, which uses only function values and is
  robust but relatively slow. It will work reasonably well for
  non-differentiable functions.

  Method \code{"BFGS"} is a quasi-Newton method (also known as a variable
  metric algorithm), specifically that published simultaneously in 1970 by
  \I{Broyden}, \I{Fletcher}, \I{Goldfarb} and \I{Shanno}. This uses
  function values and gradients to build up a picture of the surface to be
  optimized.

  Method \code{"CG"} is a conjugate gradients method based on that by
  \bibcite{Fletcher and Reeves (1964)} (but with the option of
  \I{Polak}--\I{Ribiere} or \I{Beale}--\I{Sorenson} updates). Conjugate
  gradient methods will generally be more fragile than the BFGS method,
  but as they do not store a matrix they may be successful in much larger
  optimization problems.

  Method \code{"L-BFGS-B"} is that of Byrd \abbr{et al.}\sspace(1995) which
  allows \emph{box constraints}, that is, each variable can be given a
  lower and/or upper bound. The initial value must satisfy the
  constraints. This uses a limited-memory modification of the BFGS
  quasi-Newton method. If non-trivial bounds are supplied, this method
  will be selected, with a warning.

  \bibcite{Nocedal and Wright (1999)} is a comprehensive reference for the
  previous three methods.

  Method \code{"SANN"} is by default a variant of simulated annealing
  given in \bibcite{Belisle (1992)}.
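
  As an informal sketch of the minimization/maximization convention
  described above (the quadratic objective \code{qf} is purely
  illustrative and not part of the interface):

\preformatted{qf <- function(x) sum((x - c(1, 2))^2) # hypothetical objective, minimum at c(1, 2)
optim(c(0, 0), qf)$par                  # minimization, the default
optim(c(0, 0), function(x) -qf(x),      # the same problem posed as a maximization,
      control = list(fnscale = -1))$par # requested via a negative 'fnscale'
}

  Both calls should return \code{par} close to \code{c(1, 2)}.
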
Simulated-annealing belongs to the class of stochastic global optimization methods. It uses only function values but is relatively slow. It will also work for non-differentiable functions. This implementation uses the Metropolis function for the acceptance probability. By default the next candidate point is generated from a Gaussian Markov kernel with scale proportional to the actual temperature. If a function to generate a new candidate point is given, method \code{"SANN"} can also be used to solve combinatorial optimization problems. Temperatures are decreased according to the logarithmic cooling schedule as given in \bibcite{Belisle (1992, p.\sspace{}890)}; specifically, the temperature is set to \code{temp / log(((t-1) \%/\% tmax)*tmax + exp(1))}, where \code{t} is the current iteration step and \code{temp} and \code{tmax} are specifiable via \code{control}, see below. Note that the \code{"SANN"} method depends critically on the settings of the control parameters. It is not a general-purpose method but can be very useful in getting to a good value on a very rough surface. Method \code{"Brent"} is for one-dimensional problems only, using \code{\link{optimize}()}. It can be useful in cases where \code{optim()} is used inside other functions where only \code{method} can be specified, such as in \code{\link{mle}} from package \pkg{stats4}. Function \code{fn} can return \code{NA} or \code{Inf} if the function cannot be evaluated at the supplied value, but the initial value must have a computable finite value of \code{fn}. (Except for method \code{"L-BFGS-B"} where the values should always be finite.) \code{optim} can be used recursively, and for a single parameter as well as many. It also accepts a zero-length \code{par}, and just evaluates the function with that argument. The \code{control} argument is a list that can supply any of the following components: \describe{ \item{\code{trace}}{Non-negative integer. If positive, tracing information on the progress of the optimization is produced. Higher values may produce more tracing information: for method \code{"L-BFGS-B"} there are six levels of tracing. (To understand exactly what these do see the source code: higher levels give more detail.)} \item{\code{fnscale}}{An overall scaling to be applied to the value of \code{fn} and \code{gr} during optimization. If negative, turns the problem into a maximization problem. Optimization is performed on \code{fn(par)/fnscale}.} \item{\code{parscale}}{A vector of scaling values for the parameters. Optimization is performed on \code{par/parscale} and these should be comparable in the sense that a unit change in any element produces about a unit change in the scaled value. Not used (nor needed) for \code{method = "Brent"}.} \item{\code{ndeps}}{A vector of step sizes for the finite-difference approximation to the gradient, on \code{par/parscale} scale. Defaults to \code{1e-3}.} \item{\code{maxit}}{The maximum number of iterations. Defaults to \code{100} for the derivative-based methods, and \code{500} for \code{"Nelder-Mead"}. For \code{"SANN"} \code{maxit} gives the total number of function evaluations: there is no other stopping criterion. Defaults to \code{10000}. } \item{\code{abstol}}{The absolute convergence tolerance. Only useful for non-negative functions, as a tolerance for reaching zero.} \item{\code{reltol}}{Relative convergence tolerance. The algorithm stops if it is unable to reduce the value by a factor of \code{reltol * (abs(val) + reltol)} at a step. 
  Defaults to \code{sqrt(.Machine$double.eps)}, typically about \code{1e-8}.}
    \item{\code{alpha}, \code{beta}, \code{gamma}}{Scaling parameters for
      the \code{"Nelder-Mead"} method. \code{alpha} is the reflection
      factor (default 1.0), \code{beta} the contraction factor (0.5) and
      \code{gamma} the expansion factor (2.0).}
    \item{\code{REPORT}}{The frequency of reports for the \code{"BFGS"},
      \code{"L-BFGS-B"} and \code{"SANN"} methods if \code{control$trace}
      is positive. Defaults to every 10 iterations for \code{"BFGS"} and
      \code{"L-BFGS-B"}, or every 100 temperatures for \code{"SANN"}.}
    \item{\code{warn.1d.NelderMead}}{a \code{\link{logical}} indicating
      if the (default) \code{"Nelder-Mead"} method should signal a warning
      when used for one-dimensional minimization. As the warning is
      sometimes inappropriate, you can suppress it by setting this option
      to false.}
    \item{\code{type}}{for the conjugate-gradients method. Takes value
      \code{1} for the Fletcher--Reeves update, \code{2} for
      \I{Polak}--\I{Ribiere} and \code{3} for \I{Beale}--\I{Sorenson}.}
    \item{\code{lmm}}{is an integer giving the number of BFGS updates
      retained in the \code{"L-BFGS-B"} method. It defaults to \code{5}.}
    \item{\code{factr}}{controls the convergence of the \code{"L-BFGS-B"}
      method. Convergence occurs when the reduction in the objective is
      within this factor of the machine tolerance. Default is \code{1e7},
      that is a tolerance of about \code{1e-8}.}
    \item{\code{pgtol}}{helps control the convergence of the
      \code{"L-BFGS-B"} method. It is a tolerance on the projected
      gradient in the current search direction. This defaults to zero, in
      which case the check is suppressed.}
    \item{\code{temp}}{controls the \code{"SANN"} method. It is the
      starting temperature for the cooling schedule. Defaults to
      \code{10}.}
    \item{\code{tmax}}{is the number of function evaluations at each
      temperature for the \code{"SANN"} method. Defaults to \code{10}.}
  }

  Any names given to \code{par} will be copied to the vectors passed to
  \code{fn} and \code{gr}. Note that no other attributes of \code{par}
  are copied over.

  The parameter vector passed to \code{fn} has special semantics and may
  be shared between calls: the function should not change or copy it.
}
%% when numerical derivatives are used, fn is called repeatedly with
%% modified copies of the same object.
\value{
  For \code{optim}, a list with components:
  \item{par}{The best set of parameters found.}
  \item{value}{The value of \code{fn} corresponding to \code{par}.}
  \item{counts}{A two-element integer vector giving the number of calls
    to \code{fn} and \code{gr} respectively. This excludes those calls
    needed to compute the Hessian, if requested, and any calls to
    \code{fn} to compute a finite-difference approximation to the
    gradient.}
  \item{convergence}{An integer code. \code{0} indicates successful
    completion (which is always the case for \code{"SANN"} and
    \code{"Brent"}). Possible error codes are
    \describe{
      \item{\code{1}}{indicates that the iteration limit \code{maxit}
        had been reached.}
      \item{\code{10}}{indicates degeneracy of the \I{Nelder}--\I{Mead}
        simplex.}
      \item{\code{51}}{indicates a warning from the \code{"L-BFGS-B"}
        method; see component \code{message} for further details.}
      \item{\code{52}}{indicates an error from the \code{"L-BFGS-B"}
        method; see component \code{message} for further details.}
    }
  }
  \item{message}{A character string giving any additional information
    returned by the optimizer, or \code{NULL}.}
  \item{hessian}{Only if argument \code{hessian} is true.
A symmetric matrix giving an estimate of the Hessian at the solution found. Note that this is the Hessian of the unconstrained problem even if the box constraints are active.} For \code{optimHess}, the description of the \code{hessian} component applies. } \note{ \code{optim} will work with one-dimensional \code{par}s, but the default method does not work well (and will warn). Method \code{"Brent"} uses \code{\link{optimize}} and needs bounds to be available; \code{"BFGS"} often works well enough if not. } \source{ The code for methods \code{"Nelder-Mead"}, \code{"BFGS"} and \code{"CG"} was based originally on Pascal code in Nash (1990) that was translated by \code{p2c} and then hand-optimized. Dr Nash has agreed that the code can be made freely available. The code for method \code{"L-BFGS-B"} is based on Fortran code by Zhu, Byrd, Lu-Chen and Nocedal obtained from Netlib (file \file{opt/lbfgs_bcm.shar}: another version is in \file{toms/778}). The code for method \code{"SANN"} was contributed by A. Trapletti. } \references{ Belisle, C. J. P. (1992). Convergence theorems for a class of simulated annealing algorithms on \eqn{R^d}{Rd}. \emph{Journal of Applied Probability}, \bold{29}, 885--895. \doi{10.2307/3214721}. Byrd, R. H., Lu, P., Nocedal, J. and Zhu, C. (1995). A limited memory algorithm for bound constrained optimization. \emph{SIAM Journal on Scientific Computing}, \bold{16}, 1190--1208. \doi{10.1137/0916069}. Fletcher, R. and Reeves, C. M. (1964). Function minimization by conjugate gradients. \emph{Computer Journal} \bold{7}, 148--154. \doi{10.1093/comjnl/7.2.149}. Nash, J. C. (1990). \emph{Compact Numerical Methods for Computers. Linear Algebra and Function Minimisation}. Adam Hilger. Nelder, J. A. and Mead, R. (1965). A simplex algorithm for function minimization. \emph{Computer Journal}, \bold{7}, 308--313. \doi{10.1093/comjnl/7.4.308}. Nocedal, J. and Wright, S. J. (1999). \emph{Numerical Optimization}. Springer. } \seealso{ \code{\link{nlm}}, \code{\link{nlminb}}. \code{\link{optimize}} for one-dimensional minimization and \code{\link{constrOptim}} for constrained optimization. 
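
  As an informal one-dimensional sketch (the quadratic objective is purely
  illustrative): method \code{"Brent"} calls \code{optimize} internally and
  so needs finite \code{lower} and \code{upper} bounds, whereas
  \code{optimize} is given a search interval directly.

\preformatted{optimize(function(x) (x - pi)^2, c(0, 10))$minimum   # about pi
optim(1, function(x) (x - pi)^2, method = "Brent",
      lower = 0, upper = 10)$par                      # also about pi
}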
}
\examples{\donttest{
require(graphics)

fr <- function(x) {   ## Rosenbrock Banana function
    x1 <- x[1]
    x2 <- x[2]
    100 * (x2 - x1 * x1)^2 + (1 - x1)^2
}
grr <- function(x) { ## Gradient of 'fr'
    x1 <- x[1]
    x2 <- x[2]
    c(-400 * x1 * (x2 - x1 * x1) - 2 * (1 - x1),
       200 *      (x2 - x1 * x1))
}
optim(c(-1.2,1), fr)
(res <- optim(c(-1.2,1), fr, grr, method = "BFGS"))
optimHess(res$par, fr, grr)
optim(c(-1.2,1), fr, NULL, method = "BFGS", hessian = TRUE)
## These do not converge in the default number of steps
optim(c(-1.2,1), fr, grr, method = "CG")
optim(c(-1.2,1), fr, grr, method = "CG", control = list(type = 2))
optim(c(-1.2,1), fr, grr, method = "L-BFGS-B")

flb <- function(x) {
    p <- length(x)
    sum(c(1, rep(4, p-1)) * (x - c(1, x[-p])^2)^2)
}
## 25-dimensional box constrained
optim(rep(3, 25), flb, NULL, method = "L-BFGS-B",
      lower = rep(2, 25), upper = rep(4, 25)) # par[24] is *not* at boundary

## "wild" function, global minimum at about -15.81515
fw <- function (x)
    10*sin(0.3*x)*sin(1.3*x^2) + 0.00001*x^4 + 0.2*x + 80
plot(fw, -50, 50, n = 1000, main = "optim() minimising 'wild function'")
res <- optim(50, fw, method = "SANN",
             control = list(maxit = 20000, temp = 20, parscale = 20))
res
## Now improve locally {typically only by a small bit}:
(r2 <- optim(res$par, fw, method = "BFGS"))
points(r2$par, r2$value, pch = 8, col = "red", cex = 2)

## Combinatorial optimization: Traveling salesman problem
library(stats) # normally loaded
eurodistmat <- as.matrix(eurodist)

distance <- function(sq) {  # Target function
    sq2 <- embed(sq, 2)
    sum(eurodistmat[cbind(sq2[,2], sq2[,1])])
}

genseq <- function(sq) {  # Generate new candidate sequence
    idx <- seq(2, NROW(eurodistmat)-1)
    changepoints <- sample(idx, size = 2, replace = FALSE)
    tmp <- sq[changepoints[1]]
    sq[changepoints[1]] <- sq[changepoints[2]]
    sq[changepoints[2]] <- tmp
    sq
}

sq <- c(1:nrow(eurodistmat), 1)  # Initial sequence: alphabetic
distance(sq)
# rotate for conventional orientation
loc <- -cmdscale(eurodist, add = TRUE)$points
x <- loc[,1]; y <- loc[,2]
s <- seq_len(nrow(eurodistmat))
tspinit <- loc[sq,]

plot(x, y, type = "n", asp = 1, xlab = "", ylab = "",
     main = "initial solution of traveling salesman problem", axes = FALSE)
arrows(tspinit[s,1], tspinit[s,2], tspinit[s+1,1], tspinit[s+1,2],
       angle = 10, col = "green")
text(x, y, labels(eurodist), cex = 0.8)

set.seed(123) # chosen to get a good soln relatively quickly
res <- optim(sq, distance, genseq, method = "SANN",
             control = list(maxit = 30000, temp = 2000, trace = TRUE,
                            REPORT = 500))
res  # Near optimum distance around 12842

tspres <- loc[res$par,]
plot(x, y, type = "n", asp = 1, xlab = "", ylab = "",
     main = "optim() 'solving' traveling salesman problem", axes = FALSE)
arrows(tspres[s,1], tspres[s,2], tspres[s+1,1], tspres[s+1,2],
       angle = 10, col = "red")
text(x, y, labels(eurodist), cex = 0.8)

## 1-D minimization: "Brent" or optimize() are preferred, but Nelder-Mead may be OK and "unavoidable",
## ----------------  so we can suppress the check + warning :
system.time(rO <- optimize(function(x) (x-pi)^2, c(0, 10)))
system.time(ro <- optim(1, function(x) (x-pi)^2, control = list(warn.1d.NelderMead = FALSE)))
rO$minimum - pi # 0 (perfect), on one platform
ro$par - pi # ~= 1.9e-4 on one platform
utils::str(ro)
}}
\keyword{nonlinear}
\keyword{optimize}