/* Clustering LARge Applications ~ ~~~ ~ Clustering program based upon the k-medoid approach, and suitable for data sets of at least 100 objects. (for smaller data sets, please use program pam.) */ /* original Id: clara.f,v 1.10 2002/08/27 15:43:58 maechler translated by * f2c (version 20010821) and run through f2c-clean,v 1.10 2002/03/28 */ #include #include /* for diagnostics */ #include /* when R's RNG is used */ #include /* for interrupting */ #include "cluster.h" #include "ind_2.h" void cl_clara(int *n, /* = number of objects */ int *jpp,/* = number of variables */ int *kk, /* = number of clusters, 1 <= kk <= n-1 */ double *x, /* Input: the data x[n, jpp] _rowwise_ (transposed) * Output: the first `n' values are the `clustering' * (integers in 1,2,..,kk) */ int *nran, /* = #{random samples} drawn (= `samples' in R)*/ int *nsam, /* = #{objects} drawn from data set (`sampsize' in R) */ double *dys,/* [1:(1 + (nsam * (nsam - 1))/2)] * Output: to contain the distances, -1. instead of NA */ int *mdata, /*= {0,1}; 1: min(x) is missing value (NA); 0: no NA */ double *valmd,/*[j]= missing value code (instead of NA) for x[,j]*/ int *jtmd, /* [j]= {-1,1}; -1: x[,j] has NA; 1: no NAs in x[,j] */ DISS_KIND *diss_kind, // aka 'metric' \in {EUCLIDEAN, MANHATTAN, JACCARD, GOWER} // NB: GOWER is *buggy* (<--> ../tests/clara-gower_valgrind.Rout ; ---*AND* disabled from R 2023-11-30 int/*logical*/ *rng_R,/*= {0,1}; 0 : use clara's internal weak RNG; * 1 : use R's RNG (and seed) */ int/*logical*/ *pam_like,/* if (1), we do "swap()" as in pam(), otherwise use the code as it was in clara() "forever" upto 2011-04 */ int *correct_d,/* option for dist.computation: if (0), use the "fishy" formula to update distances in the NA-case, if (1), use a dysta2()-compatible formula */ int *nrepr, /* logical (0/1): 1 = "is representative object" */ int *nsel, int *nbest,/* x[nbest[j],] : the j-th obs in the final sample */ int *nr, int *nrx,/* prov. and final "medoids" aka representatives */ double *radus, double *ttd, double *ratt, double *ttbes, double *rdbes, double *rabes, int *mtt, double *obj, double *avsyl, double *ttsyl, double *sylinf, int *jstop, int *trace_lev, double *tmp, /* = double [ 3 * nsam ] */ int *itmp /* = integer[ 6 * nsam ] */ ) { #define tmp1 tmp #define tmp2 &tmp[*nsam] #define ntmp1 itmp #define ntmp2 &itmp[*nsam] #define ntmp3 &itmp[nsamb] #define ntmp4 &itmp[nsamb+ *nsam] #define ntmp5 &itmp[2*nsamb] #define ntmp6 &itmp[2*nsamb+ *nsam] /* Local variables */ Rboolean has_NA = mdata[0]; int j, jk, js, l, nrun; *jstop = 0; double rnn = (double) (*n), zba = -1., sx = -1.;/* Wall */ /* n_dys := size of distance array dys[] (-1): dys[0] remains unused */ int n_dys = *nsam * (*nsam - 1) / 2;/* >= 0 */ int nsamb = *nsam * 2, n_sam; Rboolean full_sample = (*n == *nsam), /* only one sub sample == full data */ lrg_sam = (*n < nsamb); /* sample more than *n/2 */ if (lrg_sam)/* generate indices for the other, smaller half */ n_sam = *n - *nsam; else n_sam = *nsam; if(*trace_lev) Rprintf("C clara(): (nsam,nran,n) = (%d,%d,%d);%s\n", *nsam, *nran, *n, full_sample ? " 'full_sample',": (lrg_sam ? " 'large_sample',": "")); if(*rng_R && !full_sample) GetRNGstate(); else /* << initialize `random seed' of the very simple randm() below */ nrun = 0; #define NEW_rand_k_trace_print(_nr_) \ rand_k= 1+ (int)(rnn* ((*rng_R)? unif_rand(): randm(&nrun))); \ if (rand_k > *n) {/* should never happen */ \ warning(_("C level clara(): random k=%d > n **\n"), rand_k); \ rand_k = *n; \ } \ if(*trace_lev >= 4) { \ Rprintf("... {" #_nr_ "}"); \ if(*rng_R) Rprintf("R unif_rand()"); \ else Rprintf("nrun=%5d", nrun); \ Rprintf(" -> k{ran}=%d\n", rand_k); \ } /* __LOOP__ : random subsamples are drawn and partitioned into kk clusters */ Rboolean kall = FALSE; /* kall becomes TRUE iff we've found a "valid sample", i.e. one for which all d(j,k) can be computed */ Rboolean dyst_toomany_NA = FALSE; int nunfs = 0; for (int jran = 1; jran <= *nran; ++jran) { if (!full_sample) {/* `real' case: sample size < n */ if(*trace_lev) Rprintf("- clara sample %2d ", jran); int rand_k, ntt = 0; if (kall && nunfs+1 != jran && !lrg_sam) { /* Have had (at least) one valid sample; use its representatives * nrx[] : nsel[] := sort(nrx[]) for the first j=1:k */ if(*trace_lev >= 2) Rprintf(" if (kall && nunfs...): \n"); for (jk = 0; jk < *kk; ++jk) nsel[jk] = nrx[jk]; for (jk = 0; jk < *kk-1; ++jk) { /* sort(nsel[0:(kk-1)] */ /* FIXME: nsel[] is 0-indexed, but *contains* 1-indices*/ int nsm = nsel[jk], jsm = jk; for (int jkk = jk + 1; jkk < *kk; ++jkk) { if (nsm > nsel[jkk]) { nsm = nsel[jkk]; jsm = jkk; } } nsel[jsm] = nsel[jk]; nsel[jk] = nsm; } ntt = *kk; } else { /* no valid sample _OR_ lrg_sam */ if(*trace_lev >= 2) Rprintf(" finding 1st... new k{ran}:\n"); /* Loop finding random index `rand_k' not yet in nrx[0:(*kk-1)] : */ L180: NEW_rand_k_trace_print(180) if (kall) { for (jk = 0; jk < *kk; ++jk) if (rand_k == nrx[jk]) goto L180; } /* end Loop */ nsel[ntt] = rand_k; if (++ntt == n_sam) goto L295; } if(*trace_lev >= 2) { Rprintf(".. kall: %s, ", (kall) ? "T" : "FALSE"); if(*trace_lev == 2) { Rprintf("nsel[ntt=%d] = %d\n", ntt, nsel[ntt]); } else { /* trace_lev >= 3 */ Rprintf("\n... nrx [0:%d]= ",*kk-1); for (jk = 0; jk < *kk; jk++) Rprintf("%d ",nrx[jk]); Rprintf("\n... nsel[0:%d]= ",ntt-1); for (jk = 0; jk < ntt; jk++) Rprintf("%d ",nsel[jk]); Rprintf("\n"); } } do { /*------------------------------------------------------------ * Loop finding random index 'rand_k' in {1:n}, * not in nrx[0:(k-1)] nor nsel[1:ntt] : */ L210: NEW_rand_k_trace_print(210) if (kall && lrg_sam) { for (jk = 0; jk < *kk; ++jk) { if (rand_k == nrx[jk]) goto L210; } } /* insert rand_k into nsel[1:ntt] or after and increase ntt : */ for (int ka = 0; ka < ntt; ++ka) if (nsel[ka] >= rand_k) { if (nsel[ka] == rand_k) goto L210; else {// nsel[ka] > rand_k : for (int na = ntt; na > ka; --na) nsel[na] = nsel[na-1]; nsel[ka] = rand_k; /* continue _outer_ loop */ goto L290; } } // else: rand_k > nsel[ka] for all ka = 0:(ntt-1) : nsel[ntt] = rand_k; L290: ++ntt; } while (ntt < n_sam); L295: if(*trace_lev) Rprintf(" [ntt=%d, nunfs=%d] ", ntt, nunfs); if (lrg_sam) { /* have indices for smaller _nonsampled_ half; revert this: */ for (j = 1, jk = 0, js = 0; j <= *n; j++) { if (jk < n_sam && nsel[jk] == j) ++jk; else nrepr[js++] = j; } for (j = 0; j < *nsam; ++j) nsel[j] = nrepr[j]; } if(*trace_lev >= 3) { Rprintf(".. nsel[1:%d]= ", *nsam); for (jk = 0; jk < *nsam; jk++) Rprintf("%d ",nsel[jk]); } } else { /* full_sample : *n = *nsam -- one sample is enough ! */ for (j = 0; j < *nsam; ++j) nsel[j] = j+1;/* <- uses 1-indices for its *values*! */ } char nl[] = " "; if(*trace_lev) { Rprintf(" -> dysta2()"); if(*trace_lev >= 2) nl[0] = '\n'; } dysta2(*nsam, *jpp, nsel, x, *n, dys, *diss_kind, jtmd, valmd, has_NA, &dyst_toomany_NA); if(dyst_toomany_NA) { if(*trace_lev) Rprintf(" gave dyst_toomany_NA --> new sample.%s", nl); dyst_toomany_NA = FALSE; ++nunfs; continue;/* random sample*/ } else if(*trace_lev) Rprintf(";%s", nl); double s = 0., sky; for(l = 1; l <= n_dys; l++) /* dys[0] is not used here */ if (s < dys[l]) s = dys[l]; if(*trace_lev >= 2) Rprintf(" clara -> s:= max{dys[1..%d]} = %g;", n_dys,s); bswap2(*kk, *nsam, s, dys, *pam_like, *trace_lev, /* --> */ &sky, nrepr, /* dysma */tmp1, /*dysmb*/tmp2, /* beter[], only used here */&tmp[nsamb]); if(*trace_lev >= 2) Rprintf(" end{bswap2}: sky = %g\n", sky); Rboolean nafs; double zb; selec(*kk, *n, *jpp, *diss_kind, &zb, *nsam, has_NA, jtmd, valmd, *trace_lev, nrepr, nsel, dys, x, nr, &nafs, ttd, radus, ratt, ntmp1, ntmp2, ntmp3, ntmp4, ntmp5, ntmp6, tmp1, tmp2, *correct_d); if (nafs) { /* couldn't assign some observation (to a cluster) * because of too many NA s */ ++nunfs; if(*trace_lev >= 2) Rprintf(" selec() -> 'NAfs'"); } else if(!kall || zba > zb) { /* 1st proper sample or new best */ if(*trace_lev >= 2) Rprintf(kall ? "new best" : "1st proper sample"); kall = TRUE; zba = zb; for (jk = 0; jk < *kk; ++jk) { ttbes[jk] = ttd [jk]; rdbes[jk] = radus[jk]; rabes[jk] = ratt [jk]; nrx [jk] = nr [jk]; } for (js = 0; js < *nsam; ++js) nbest[js] = nsel[js]; sx = s; } /* if(*trace_lev) Rprintf("%sobj= %g\n", (*trace_lev < 2) ?", " :" ", zb/rnn); */ if(*trace_lev) Rprintf(" obj= %g\n", zb/rnn); if(full_sample) break; /* out of resampling */ } /* --- end random sampling loop */ if(*rng_R && !full_sample) PutRNGstate(); if(*trace_lev && !full_sample) { Rprintf("C clara() -> best sample _found_ "); if(*trace_lev >= 2) { Rprintf("; nbest[1:%d] =\n c(", *nsam); for (js = 0; js < *nsam; ) { Rprintf("%d", nbest[js]); if(++js < *nsam) Rprintf((js % 20) ? "," : ",\n "); } Rprintf(")\n"); } } *obj = zba / rnn; if (nunfs >= *nran) { *jstop = 1; return; } /* else */ if (!kall) { *jstop = 2; return; } /* for the best subsample, the objects of the entire data set are assigned to their clusters */ dysta2(*nsam, *jpp, nbest, x, *n, dys, *diss_kind, jtmd, valmd, has_NA, &dyst_toomany_NA); if(dyst_toomany_NA) { error(_("clara()'s C level dysta2(nsam=%d, p=%d, n=%d) gave 'toomany_NA'"), *nsam, *jpp, *n); } if(*trace_lev) Rprintf(" resul(), "); resul(*kk, *n, *jpp, *diss_kind, has_NA, jtmd, valmd, x, nrx, mtt, *correct_d); if (*kk > 1) { if(*trace_lev) Rprintf(" black()"); black(*kk, *jpp, *nsam, nbest, dys, sx, x, /* compute --> */ avsyl, ttsyl, sylinf, ntmp1, ntmp2, ntmp3, ntmp4, /* syl[] */ tmp1, tmp2); } if(*trace_lev) Rprintf(" and return() from C.\n"); return; } /* End clara() ---------------------------------------------------*/ #undef tmp1 #undef tmp2 #undef ntmp1 #undef ntmp2 #undef ntmp3 #undef ntmp4 #undef ntmp5 #undef ntmp6 /** * Compute Dissimilarities for the selected sub-sample ---> dys[,] */ void dysta2(int nsam, int jpp, int *nsel, double *x, int n, double *dys, DISS_KIND diss_kind, int *jtmd, double *valmd, Rboolean has_NA, Rboolean *toomany_NA) { int nlk = 0; int current_group = 0; // for GOWER only: Counter for overall index dys[0] = 0.;/* very first index; *is* used because ind_2(i,i) |-> 0 ! */ for (int l = 1; l < nsam; ++l) { int lsel = nsel[l]; if(lsel <= 0 || lsel > n) error(_("C level dysta2(): nsel[%s= %d] = %d is outside 0..n, n=%d"), "l", l, lsel, n); for (int k = 0; k < l; ++k) { /* compute d(nsel[l], nsel[k]) {if possible}*/ int ksel = nsel[k]; if(ksel <= 0 || ksel > n) error(_("C level dysta2(): nsel[%s= %d] = %d is outside 0..n, n=%d"), "k", k, ksel, n); ++nlk; int npres = 0, j, lj, kj, N_ones = 0; double clk = 0.; for (j = 0, lj = lsel-1, kj = ksel-1; j < jpp; ++j, lj += n, kj += n) { if (has_NA && jtmd[j] < 0) { /* x[,j] has some Missing (NA) */ /* in the following line (Fortran!), x[-2] ==> seg.fault {BDR to R-core, Sat, 3 Aug 2002} */ if (x[lj] == valmd[j] || x[kj] == valmd[j]) { continue /* next j */; } } ++npres; // number of present pairs if (diss_kind == EUCLIDEAN) clk += (x[lj] - x[kj]) * (x[lj] - x[kj]); else if (diss_kind == JACCARD) { if( x[lj] > 0.9 && x[kj] > 0.9) { // both "are 1" - increment numerator clk++ ; N_ones++ ; } else if( x[lj] > 0.9 || x[kj] > 0.9)// any is 1 - increment N_ones N_ones++ ; } else if (diss_kind == MANHATTAN) clk += fabs(x[lj] - x[kj]); // NB: GOWER is *buggy* (<--> ../tests/clara-gower_valgrind.Rout ; ---*AND* disabled from R 2023-11-30 else if (diss_kind == GOWER) { if (x[lj] == x[kj]) { continue /* next j */; } double diff; if (jtmd[j] >= 0) { // Numerical feature diff = fabs(x[lj] - x[kj]); // Find the maximum and minimum values of this feature double max_val = -1.0 / 0.0; double min_val = 1.0 / 0.0; for (int i = 0; i < nsam; ++i){ double val = x[i + nsam * current_group]; if (val > max_val) { max_val = val; } if (val < min_val) { min_val = val; } } current_group = (current_group + 1) % 2; // Toggle between 0 and 1 double range = max_val - min_val; diff /= range; } else { // Categorical feature diff = (x[lj] != x[kj]); } clk += diff; } } /* for( j ..) */ if (npres == 0) {/* cannot compute d(.,.) because of too many NA */ *toomany_NA = TRUE; dys[nlk] = -1.; } else { double d1 = clk * (jpp / (double) npres); dys[nlk] = (diss_kind == EUCLIDEAN) ? sqrt(d1) : (diss_kind == JACCARD) ? 1 - clk / (double) N_ones : (diss_kind == MANHATTAN) ? d1 : /* diss_kind == GOWER */ d1 / jpp; } } /* for( k ) */ } /* for( l ) */ return; } /* End dysta2() -----------------------------------------------------------*/ double randm(int *nrun) { /* we programmed this generator ourselves because we wanted it to be machine independent. it should run on most computers because the largest int used is less than 2^30 . the period is 2^16=65536, which is good enough for our purposes. */ /* MM: improved the original speed-wise only: */ *nrun = (*nrun * 5761 + 999) & 0177777; /* Masking off all but the last 16 bits is equivalent to % 65536 */ return ((double) (*nrun) / 65536.); } /* randm() */ /* bswap2() : called once [per random sample] from clara() : */ void bswap2(int kk, int n, /* == nsam == 'sampsize', here in clara */ double s, const double dys[], Rboolean pam_like, int trace_lev, // result (though *only* nrepr[] is really used in caller:) double *sky, int *nrepr, double *dysma, double *dysmb, double *beter) { int i, j, ij, k,h, hbest = -1, nbest = -1;/* init for -Wall */ double dzsky; /* Parameter adjustments */ --nrepr; --beter; --dysma; --dysmb; if(trace_lev >= 2) { if(trace_lev == 2) Rprintf("\n bswap2(), 1. BUILD:"); else Rprintf("\n bswap2(*, s=%g), 1. BUILD: ", s); } s = s * 1.1 + 1.;/* value larger than all dissimilarities */ /* ====== first algorithm: BUILD. ====== */ for (i = 1; i <= n; ++i) { nrepr[i] = 0; dysma[i] = s; } for(k = 0; k < kk; k++) { int nmax = -1; /* -Wall */ double ammax = 0.; for (i = 1; i <= n; ++i) { if (nrepr[i] == 0) { beter[i] = 0.; for (j = 1; j <= n; ++j) { double cmd = dysma[j] - dys[ ind_2(i, j)]; if (cmd > 0.) beter[i] += cmd; } if (ammax <= beter[i]) { /* does < (instead of <= ) work too? -- NO! */ ammax = beter[i]; nmax = i; } } } nrepr[nmax] = 1;/* = .true. : found new representative */ if(trace_lev >= 2) { if(trace_lev == 2) Rprintf(" %d", nmax); else Rprintf(" new repr. %d\n", nmax); } /* update dysma[] : dysma[j] = D(j, nearest_representative) */ for (j = 1; j <= n; ++j) { ij = ind_2(nmax, j); if (dysma[j] > dys[ij]) dysma[j] = dys[ij]; } } // output of the above loop: nrepr[], dysma[], ... *sky = 0.; for (j = 1; j <= n; ++j) *sky += dysma[j]; if(trace_lev >= 2) /* >= 2 (?) */ { Rprintf(" after build: medoids are"); for (i = 1; i <= n; ++i) if(nrepr[i] == 1) Rprintf(" %2d", i); if(trace_lev >= 3) { Rprintf("\n and min.dist dysma[1:n] are\n"); for (i = 1; i <= n; ++i) { Rprintf(" %6.3g", dysma[i]); if(i % 10 == 0) Rprintf("\n"); } if(n % 10 != 0) Rprintf("\n"); } else Rprintf("\n"); Rprintf(" --> sky = sum_j D_j= %g\n", *sky); } if (kk == 1) return; // asky = *sky / ((double) n); /* ====== second algorithm: SWAP. ====== */ /* Big LOOP : */ L60: for (j = 1; j <= n; ++j) { /* dysma[j] := D_j d(j, ) [KR p.102, 104] * dysmb[j] := E_j d(j, <2-nd cl.medi>) [p.103] */ dysma[j] = s; dysmb[j] = s; for (i = 1; i <= n; ++i) { if (nrepr[i]) { ij = ind_2(i, j); if (dysma[j] > dys[ij]) { dysmb[j] = dysma[j]; dysma[j] = dys[ij]; } else if (dysmb[j] > dys[ij]) { dysmb[j] = dys[ij]; } } } } dzsky = 1.; /* 1 is arbitrary > 0; only dzsky < 0 matters in the end */ for (h = 1; h <= n; ++h) if (!nrepr[h]) { for (i = 1; i <= n; ++i) if (nrepr[i]) { double dz = 0.; /* dz := T_{ih} := sum_j C_{jih} [p.104] : */ for (j = 1; j <= n; ++j) { int ij = ind_2(i, j), hj = ind_2(h, j); if (dys[ij] == dysma[j]) { double small; if(pam_like) small = dysmb[j] > dys[hj] ? dys[hj] : dysmb[j]; else // old clara code which differs from pam()'s // and seems a bit illogical: small = dysmb[j] > dys[ij] ? dys[hj] : dysmb[j]; dz += (- dysma[j] + small); } else if (dys[hj] < dysma[j]) dz += (- dysma[j] + dys[hj]); } if (dzsky > dz) { dzsky = dz; // dzsky := min_{i,h} T_{i,h} hbest = h; nbest = i; } } } /* once had some 64-bit compiler / data configuration that looped forever*/ R_CheckUserInterrupt(); if (dzsky < 0.) { /* found an improving swap */ if(trace_lev >= 3) Rprintf( " swp new %d <-> %d old; decreasing diss. by %g\n", hbest, nbest, dzsky); nrepr[hbest] = 1; nrepr[nbest] = 0; *sky += dzsky; goto L60; } if(trace_lev >= 2 && hbest != -1) // in my examples hbest == -1 and it does not print: Rprintf( " Last swap: new %d <-> %d old; decreasing diss. by %g\n", hbest, nbest, dzsky); } /* End of bswap2() -------------------------------------------------- */ /* selec() : called once [per random sample] from clara() */ void selec(int kk, int n, int jpp, DISS_KIND diss_kind, double *zb, int nsam, Rboolean has_NA, int *jtmd, double *valmd, int trace_lev, // currently unused -- FIXME !? int *nrepr, int *nsel, double *dys, double *x, int *nr, Rboolean *nafs, /* := TRUE if a distance cannot be calculated */ double *ttd, double *radus, double *ratt, // [i]tmp* for clara(), i.e. not used later! int *nrnew, int *nsnew, int *npnew, int *ns, int *np, int *new, double *ttnew, double *rdnew, int correct_d) { /* Parameter adjustments */ --nsel; --nrepr; --ratt; --radus; --ttd; --np; --nr; --ns; --rdnew; --ttnew; --npnew; --nrnew; --nsnew; --new; /* nafs := TRUE if a distance cannot be calculated (because of NA s)*/ *nafs = FALSE; /* identification of representative objects, and initializations */ for (int j = 1, jk = 0; j <= nsam; ++j) { if (nrepr[j] != 0) { ++jk; nr [jk] = nsel[j]; ns [jk] = 0; ttd [jk] = 0.; radus[jk] = -1.; np [jk] = j; } } /* - assignment of the objects of the entire data set to a cluster, * - computation of some statistics, * - determination of the new ordering of the clusters */ double pp = (double) (jpp); *zb = 0.; int newf = 0; for(int i = 1; i <= n; i++) { /* for obj i, find its cluster jk in 1:kk */ int jkabc = -1/* -Wall */; double dsum, dnull = -9./* -Wall */; if (!has_NA) { for (int jk = 1; jk <= kk; ++jk) { dsum = 0.; int nr_k = nr[jk]; if (nr_k != i) { int N_ones = 0; double tra = 0.; // init only for JACCARD for (int jp = 0; jp < jpp; ++jp) { int na = (nr_k - 1) + jp * n, nb = (i - 1) + jp * n; // d_j(x[na], x[nb]) : if (diss_kind == JACCARD) { if(x[na] > 0.9 && x[nb] > 0.9) { // both "are 1" - increment numerator (and denom.) tra += 1; N_ones ++; } else if( x[na] > 0.9 || x[nb] > 0.9) { // any is 1 - increment denominator N_ones N_ones ++; } } else { // Euclidean or Manhattan tra = fabs(x[na] - x[nb]); if (diss_kind == EUCLIDEAN) tra *= tra; dsum += tra; } } if (diss_kind == JACCARD) dsum = 1 - tra / (double)N_ones; if (jk != 1 && dsum >= dnull) continue /* next jk */; } /* (nr_k != i) */ // new best: dsum < "previous" dnull //---------------- FIXME for JACCARD ------------------------- dnull = dsum; jkabc = jk; } /* for (jk 1:kk) */ } else { // _has_ missing data Rboolean first = TRUE; for (int jk = 1; jk <= kk; ++jk) { dsum = 0.; int nr_k = nr[jk]; if (nr_k != i) { int nobs = 0, N_ones = 0; double tra = 0.; // init only for JACCARD for (int jp = 0; jp < jpp; ++jp) { int na = (nr_k - 1) + jp * n, nb = (i - 1) + jp * n; if (jtmd[jp] < 0) { if (x[na] == valmd[jp] || x[nb] == valmd[jp]) continue /* next jp */; } nobs++; if (diss_kind == JACCARD) { if(x[na] > 0.9 && x[nb] > 0.9) { // both "are 1" - increment numerator (and denom.) tra += 1; N_ones ++; } else if( x[na] > 0.9 || x[nb] > 0.9) { // any is 1 - increment denominator N_ones N_ones ++; } } else { // Euclidean or Manhattan tra = fabs(x[na] - x[nb]); if (diss_kind == EUCLIDEAN) tra *= tra; dsum += tra; } } if (nobs == 0) /* all pairs partially missing */ continue /* next jk */; if (diss_kind == JACCARD) dsum = 1 - tra / (double)N_ones; if(correct_d) // correct -- only since 2017-06 dsum *= (pp / nobs); else dsum *= (nobs / pp); } if (first) first = FALSE; else if (dnull <= dsum) continue /* next jk */; /* here : first was TRUE {i.e. 1st time} or * dnull > dsum {i.e. new best} */ dnull = dsum; jkabc = jk; }/* for(jk ..) */ if (first) { /* found nothing */ *nafs = TRUE; return; } } /* else: has_NA */ if (diss_kind == EUCLIDEAN) dnull = sqrt(dnull); *zb += dnull; ttd[jkabc] += dnull; if (radus[jkabc] < dnull) radus[jkabc] = dnull; ++ns[jkabc]; if (newf < kk) { if (newf != 0) { for (int jnew = 1; jnew <= newf; ++jnew) { if (jkabc == new[jnew]) goto L90;/* next i */ } } ++newf; new[newf] = jkabc; } L90: ; } /* for( i = 1..n ) -------------------------------------------------*/ /* a permutation is carried out on vectors nr,ns,np,ttd,radus using the information in vector new. */ for (int j = 1; j <= kk; ++j) { int nj = new[j]; nrnew[j] = nr[nj]; nsnew[j] = ns[nj]; npnew[j] = np[nj]; ttnew[j] = ttd[nj]; rdnew[j] = radus[nj]; } for (int j = 1; j <= kk; ++j) { nr[j] = nrnew[j]; ns[j] = nsnew[j]; np[j] = npnew[j]; ttd[j] = ttnew[j] / (double) ns[j]; radus[j] = rdnew[j]; } if (kk > 1) { /* computation of ratt[ka] := minimal distance of medoid ka to any other medoid for comparison with the radius of cluster ka. */ for (int ka = 1; ka <= kk; ++ka) { Rboolean first = TRUE; int npa = np[ka]; for (int kb = 1; kb <= kk; ++kb) { if (kb == ka) continue /* next kb */; int npb = np[kb], npab = ind_2(npa, npb); if (first) first = FALSE; else if (dys[npab] >= ratt[ka]) continue /* next kb */; ratt[ka] = dys[npab]; if (ratt[ka] == 0.) ratt[ka] = -1.; } if (ratt[ka] > -0.5) ratt[ka] = radus[ka] / ratt[ka]; } } return; } /* End selec() -----------------------------------------------------------*/ void resul(int kk, int n, int jpp, DISS_KIND diss_kind, Rboolean has_NA, int *jtmd, double *valmd, double *x, int *nrx, int *mtt, int correct_d) /* option for dist.computation: if (0), use the "fishy" formula to update distances in the NA-case, if (1), use a dysta2()-compatible formula */ { // __FIXME__ "Jaccard" not yet supported ! <<<<<<<<<<<<<<<<<<<<<<<<<<<<< /* clustering vector is incorporated into x, and ``printed''. */ for(int i = 0; i < n; i++) { for (int jk = 0; jk < kk; ++jk) { if (nrx[jk] == i + 1)/* 1-indexing */ goto L220; /* continue next i (i.e., outer loop) */ } int njnb = i, nrjk, /* = nrx[jk -1 */ jksky = -1/* Wall */; double dnull = -9./* Wall */; if (!has_NA) { for (int jk = 0; jk < kk; ++jk) { double dsum = 0.; nrjk = (nrx[jk] - 1); for (int j = 0; j < jpp; ++j) { double tra = fabs(x[nrjk + j * n] - x[njnb + j * n]); if (diss_kind == EUCLIDEAN) tra *= tra; dsum += tra; } if (diss_kind == EUCLIDEAN) dsum = sqrt(dsum); if (jk == 0 || dnull > dsum) { // have new best dnull = dsum; jksky = jk; } } } else { /* _has_ missing data */ double pp = (double) jpp; for (int jk = 0; jk < kk; ++jk) { double dsum = 0.; nrjk = (nrx[jk] - 1); int nobs = 0; for (int j = 0; j < jpp; ++j) { int na = nrjk + j * n, nb = njnb + j * n; if (jtmd[j] < 0) { if (x[na] == valmd[j] || x[nb] == valmd[j]) continue /* next j */; } nobs++; double tra = fabs(x[na] - x[nb]); if (diss_kind == EUCLIDEAN) tra *= tra; dsum += tra; } if (diss_kind == EUCLIDEAN) dsum = sqrt(dsum); if(correct_d) // correct -- only since 2016-04 dsum *= (pp / nobs); else dsum *= (nobs / pp); // MM: "fishy" (had note since r4321, 2007-05-01 !) if (jk == 0 || dnull > dsum) { // have new best dnull = dsum; jksky = jk; } } } x[njnb] = (double) jksky + 1;/* 1-indexing */ L220: ; } /* for(i = 0; i < n ..)*/ for (int jk = 0; jk < kk; ++jk) x[nrx[jk] - 1] = (double) jk + 1;/* 1-indexing */ /* mtt[k] := size(k-th cluster) : */ for (int ka = 0; ka < kk; ++ka) { mtt[ka] = 0; for(int i = 0; i < n; i++) { if (((int) x[i]) == ka + 1)/* 1-indexing */ ++mtt[ka]; } } return; } /* end resul() -----------------------------------------------------------*/ // called 'dark()' in ./pam.c void black(int kk, int jpp, int nsam, int *nbest, double *dys, double s, double *x, /* --> Output : */ double *avsyl, double *ttsyl, double *sylinf, /* but the following output vectors are never used by clara() : */ int *ncluv, int *nsend, int *nelem, int *negbr, double *syl, double *srank) { /* Silhouettes computation and "drawing" --> syl[] and sylinf[] */ /* System generated locals */ int sylinf_dim1 = nsam, sylinf_offset = 1 + sylinf_dim1 * 1; /* Parameter adjustments */ --avsyl; --srank; --syl; --negbr; --nelem; --nsend; --ncluv; --nbest; sylinf -= sylinf_offset; /* construction of clustering vector (ncluv) of selected sample (nbest). */ for (int l = 1; l <= nsam; ++l) { ncluv[l] = (int) x[nbest[l] - 1]; } /* "drawing" of the silhouettes */ int nsylr = 0; *ttsyl = 0.; for (int numcl = 1; numcl <= kk; ++numcl) { int ntt = 0, j, l; double dysa, dysb; for (j = 1; j <= nsam; ++j) { if (ncluv[j] == numcl) { ++ntt; nelem[ntt] = j; } } for (j = 1; j <= ntt; ++j) { int nj = nelem[j]; dysb = s * 1.1 + 1.; negbr[j] = -1; for (int nclu = 1; nclu <= kk; ++nclu) { if (nclu != numcl) { int nbb = 0; double db = 0.; for (l = 1; l <= nsam; ++l) { if (ncluv[l] == nclu) { ++nbb; db += dys[ind_2(nj, l)]; } } db /= (double) nbb; if (db < dysb) { dysb = db; negbr[j] = nclu; } } } if (ntt == 1) { syl[j] = 0.; continue /* j */; } dysa = 0.; for (l = 1; l <= ntt; ++l) { int nl = nelem[l]; dysa += dys[ind_2(nj, nl)]; } dysa /= (double) (ntt - 1); if (dysa <= 0.) { if (dysb > 0.) syl[j] = 1.; else syl[j] = 0.; continue /* j */; } if (dysb > 0.) { if (dysb > dysa) syl[j] = 1. - dysa / dysb; else if (dysb < dysa) syl[j] = dysb / dysa - 1.; else /* (dysb == dysa) */ syl[j] = 0.; if (syl[j] < -1.) syl[j] = -1.; else if (syl[j] > 1.) syl[j] = 1.; } else { syl[j] = -1.; } } /* for(j ..) */ avsyl[numcl] = 0.; for (j = 1; j <= ntt; ++j) { int lang = -1/* -Wall */; double symax = -2.; for (l = 1; l <= ntt; ++l) { if (syl[l] > symax) { symax = syl[l]; lang = l; } } nsend[j] = lang; srank[j] = syl[lang]; avsyl[numcl] += srank[j]; syl[lang] = -3.; } *ttsyl += avsyl[numcl]; avsyl[numcl] /= ntt; if (ntt >= 2) { for (l = 1; l <= ntt; ++l) { int lplac = nsend[l]; ++nsylr; sylinf[nsylr + sylinf_dim1 ] = (double) numcl; sylinf[nsylr + (sylinf_dim1 << 1)] = (double) negbr[lplac]; sylinf[nsylr + sylinf_dim1 * 3 ] = srank[l]; sylinf[nsylr + (sylinf_dim1 << 2)] = (double) nbest[nelem[lplac]]; } } else { ++nsylr; sylinf[nsylr + sylinf_dim1 ] = (double) numcl; sylinf[nsylr + (sylinf_dim1 << 1)] = (double) negbr[1]; sylinf[nsylr + sylinf_dim1 * 3 ] = 0.; sylinf[nsylr + (sylinf_dim1 << 2)] = (double) nbest[nelem[1]]; } } /* for( numcl ) */ *ttsyl /= (double) (nsam); return; } /* black */