/* * R : A Computer Language for Statistical Data Analysis * Copyright (C) 1998-2017 The R Core Team * Copyright (C) 2002-2017 The R Foundation * Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, a copy is available at * https://www.R-project.org/Licenses/ */ #ifdef HAVE_CONFIG_H # include #endif #include #include #include #include "stats.h" #ifdef _OPENMP # include #endif #define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b)) #ifdef R_160_and_older #define both_non_NA both_FINITE #else #define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b)) #endif static double R_euclidean(double *x, int nr, int nc, int i1, int i2) { double dev, dist; int count, j; count= 0; dist = 0; for(j = 0 ; j < nc ; j++) { if(both_non_NA(x[i1], x[i2])) { dev = (x[i1] - x[i2]); if(!ISNAN(dev)) { dist += dev * dev; count++; } } i1 += nr; i2 += nr; } if(count == 0) return NA_REAL; if(count != nc) dist /= ((double)count/nc); return sqrt(dist); } static double R_maximum(double *x, int nr, int nc, int i1, int i2) { double dev, dist; int count, j; count = 0; dist = -DBL_MAX; for(j = 0 ; j < nc ; j++) { if(both_non_NA(x[i1], x[i2])) { dev = fabs(x[i1] - x[i2]); if(!ISNAN(dev)) { if(dev > dist) dist = dev; count++; } } i1 += nr; i2 += nr; } if(count == 0) return NA_REAL; return dist; } static double R_manhattan(double *x, int nr, int nc, int i1, int i2) { double dev, dist; int count, j; count = 0; dist = 0; for(j = 0 ; j < nc ; j++) { if(both_non_NA(x[i1], x[i2])) { dev = fabs(x[i1] - x[i2]); if(!ISNAN(dev)) { dist += dev; count++; } } i1 += nr; i2 += nr; } if(count == 0) return NA_REAL; if(count != nc) dist /= ((double)count/nc); return dist; } static double R_canberra(double *x, int nr, int nc, int i1, int i2) { double dev, dist, sum, diff; int count, j; count = 0; dist = 0; for(j = 0 ; j < nc ; j++) { if(both_non_NA(x[i1], x[i2])) { sum = fabs(x[i1]) + fabs(x[i2]); diff = fabs(x[i1] - x[i2]); if (sum > DBL_MIN || diff > DBL_MIN) { dev = diff/sum; if(!ISNAN(dev) || (!R_FINITE(diff) && diff == sum && /* use Inf = lim x -> oo */ (dev = 1., TRUE))) { dist += dev; count++; } } } i1 += nr; i2 += nr; } if(count == 0) return NA_REAL; if(count != nc) dist /= ((double)count/nc); return dist; } static double R_dist_binary(double *x, int nr, int nc, int i1, int i2) { int total, count, dist; int j; total = 0; count = 0; dist = 0; for(j = 0 ; j < nc ; j++) { if(both_non_NA(x[i1], x[i2])) { if(!both_FINITE(x[i1], x[i2])) { warning(_("treating non-finite values as NA")); } else { if(x[i1] != 0. || x[i2] != 0.) { count++; if( ! (x[i1] != 0. && x[i2] != 0.) ) dist++; } total++; } } i1 += nr; i2 += nr; } if(total == 0) return NA_REAL; if(count == 0) return 0; return (double) dist / count; } static double R_minkowski(double *x, int nr, int nc, int i1, int i2, double p) { double dev, dist; int count, j; count= 0; dist = 0; for(j = 0 ; j < nc ; j++) { if(both_non_NA(x[i1], x[i2])) { dev = (x[i1] - x[i2]); if(!ISNAN(dev)) { dist += R_pow(fabs(dev), p); count++; } } i1 += nr; i2 += nr; } if(count == 0) return NA_REAL; if(count != nc) dist /= ((double)count/nc); return R_pow(dist, 1.0/p); } enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY, MINKOWSKI }; /* == 1,2,..., defined by order in the R function dist */ void R_distance(double *x, int *nr, int *nc, double *d, int *diag, int *method, double *p) { int i, j = 0; // suppress clang 9 warning size_t ij; /* can exceed 2^31 - 1 */ double (*distfun)(double*, int, int, int, int) = NULL; #ifdef _OPENMP int nthreads; #endif switch(*method) { case EUCLIDEAN: distfun = R_euclidean; break; case MAXIMUM: distfun = R_maximum; break; case MANHATTAN: distfun = R_manhattan; break; case CANBERRA: distfun = R_canberra; break; case BINARY: distfun = R_dist_binary; break; case MINKOWSKI: if(!R_FINITE(*p) || *p <= 0) error(_("distance(): invalid p")); // plus special case below because of extra argument break; default: error(_("distance(): invalid distance")); } int dc = (*diag) ? 0 : 1; /* diag=1: we do the diagonal */ #ifdef _OPENMP if (R_num_math_threads > 0) nthreads = R_num_math_threads; else nthreads = 1; /* for now */ if (nthreads == 1) { /* do the nthreads == 1 case without any OMP overhead to see if it matters on some platforms */ ij = 0; for(j = 0 ; j <= *nr ; j++) for(i = j+dc ; i < *nr ; i++) d[ij++] = (*method != MINKOWSKI) ? distfun(x, *nr, *nc, i, j) : R_minkowski(x, *nr, *nc, i, j, *p); } else /* This produces uneven thread workloads since the outer loop is over the subdiagonal portions of columns. An alternative would be to use a loop on ij and to compute the i and j values from ij. */ #pragma omp parallel for num_threads(nthreads) default(none) \ private(i, j, ij) \ firstprivate(nr, dc, d, method, distfun, nc, x, p) for(j = 0 ; j <= *nr ; j++) { ij = j * (*nr - dc) + j - ((1 + j) * j) / 2; for(i = j+dc ; i < *nr ; i++) d[ij++] = (*method != MINKOWSKI) ? distfun(x, *nr, *nc, i, j) : R_minkowski(x, *nr, *nc, i, j, *p); } #else ij = 0; for(j = 0 ; j <= *nr ; j++) for(i = j+dc ; i < *nr ; i++) d[ij++] = (*method != MINKOWSKI) ? distfun(x, *nr, *nc, i, j) : R_minkowski(x, *nr, *nc, i, j, *p); #endif } #include SEXP Cdist(SEXP x, SEXP smethod, SEXP attrs, SEXP p) { SEXP ans; int nr = nrows(x), nc = ncols(x), method = asInteger(smethod); int diag = 0; R_xlen_t N; double rp = asReal(p); N = (R_xlen_t)nr * (nr-1)/2; /* avoid int overflow for N ~ 50,000 */ PROTECT(ans = allocVector(REALSXP, N)); if(TYPEOF(x) != REALSXP) x = coerceVector(x, REALSXP); PROTECT(x); R_distance(REAL(x), &nr, &nc, REAL(ans), &diag, &method, &rp); /* tack on attributes */ SEXP names = getAttrib(attrs, R_NamesSymbol); for (int i = 0; i < LENGTH(attrs); i++) setAttrib(ans, install(translateChar(STRING_ELT(names, i))), VECTOR_ELT(attrs, i)); UNPROTECT(2); return ans; }