/* * The routines for gini-classification */ #include #include "rpart.h" #include "rpartproto.h" static int numclass; static double *left, /* left branch n (weighted) */ *right, **ccnt; static double *prior, *aprior, /* altered priors */ *freq, *loss; /* loss matrix */ static int *tsplit, *countn; static double *awt, *rate; static double (*impurity) (); static double gini_impure1(double p) { return p * (1.0 - p); } static double gini_impure2(double p) { if (p == 0) return 0.0; else return -p * log(p); } int giniinit(int n, double **y, int maxcat, char **error, double *parm, int *size, int who, double *wt) { int i, j, k; double temp; /* allocate memory and setup losses */ if (who == 1) { numclass = 0; /* number of classes */ for (i = 0; i < n; i++) if (*y[i] > numclass) numclass = (int) *y[i]; if (parm[numclass + numclass * numclass] == 2) impurity = gini_impure2; else impurity = gini_impure1; left = (double *) ALLOC(numclass * 2, sizeof(double)); right = left + numclass; tsplit = (int *) ALLOC(maxcat * 2, sizeof(int)); countn = tsplit + maxcat; awt = (double *) ALLOC(maxcat * 2, sizeof(double)); rate = awt + maxcat; if (maxcat > 0) { graycode_init0(maxcat); ccnt = (double **) ALLOC(numclass, sizeof(double *)); ccnt[0] = (double *) ALLOC(numclass * maxcat, sizeof(double)); for (i = 1; i < numclass; i++) ccnt[i] = ccnt[i - 1] + maxcat; } i = 3 * numclass + numclass * numclass; prior = (double *) ALLOC(i, sizeof(double)); aprior = prior + numclass; freq = aprior + numclass; loss = freq + numclass; for (i = 0; i < numclass; i++) freq[i] = 0; temp = 0; for (i = 0; i < n; i++) { j = (int) *y[i] - 1; freq[j] += wt[i]; temp += wt[i]; /* sum total of weights */ } for (i = 0; i < numclass; i++) freq[i] /= temp; /* relative frequency */ temp = 0; for (i = 0; i < numclass; i++) { prior[i] = parm[i]; aprior[i] = 0; for (j = 0; j < numclass; j++) { k = numclass * j + i; loss[k] = parm[numclass + k]; temp += loss[k] * prior[i]; aprior[i] += loss[k] * prior[i]; } } for (i = 0; i < numclass; i++) { if (freq[i] > 0) { /* watch out for a missing class */ prior[i] /= freq[i]; aprior[i] /= (temp * freq[i]); /* pi_i / n_i */ } } } *size = 2 + numclass; return 0; } /* * Compute the predicted response and the classification error * This is R(T) (this node's contribution) in the paper. */ void ginidev(int n, double **y, double *value, double *risk, double *wt) { int i, j, max = 0; double temp, dev = 0; double prob; /* * count up number in each class, * and P(T), the probability of reaching this branch of the tree */ for (i = 0; i < numclass; i++) freq[i] = 0; temp = 0; for (i = 0; i < n; i++) { j = (int) y[i][0] - 1; freq[j] += wt[i]; temp += wt[i] * prior[j]; } prob = temp; /* this is actually P(T)*n; R code will fix * it up */ /* * Now compute best class and its error */ for (i = 0; i < numclass; i++) { /* assume class i were the prediction */ temp = 0; for (j = 0; j < numclass; j++) temp += freq[j] * loss[i * numclass + j] * prior[j]; if (i == 0 || temp < dev) { max = i; dev = temp; } } value[0] = max + 1; /* remember: external groups start at 1 */ for (i = 0; i < numclass; i++) value[i + 1] = freq[i]; value[numclass + 1] = prob; *risk = dev; } /* * return the error for a particular point */ double ginipred(double *y, double *pred) { int i, j; double temp; i = (int) y[0] - 1; j = (int) *pred - 1; temp = prior[i] * loss[i * numclass + j]; return temp; } /* * The gini splitting function. Find that split point in x such that * the rss within the two groups is decreased as much * as possible. */ void gini(int n, double *y[], double *x, int numcat, int edge, double *improve, double *split, int *csplit, double my_risk, double *wt) { int i, j, k; double lwt, rwt; int rtot, ltot; int direction = LEFT, where = 0; double total_ss, best, temp, p; double lmean, rmean; /* used to decide direction */ for (i = 0; i < numclass; i++) { left[i] = 0; right[i] = 0; } lwt = 0; rwt = 0; rtot = 0; ltot = 0; for (i = 0; i < n; i++) { j = (int) *y[i] - 1; rwt += aprior[j] * wt[i]; /* altered weight = prior * case_weight */ right[j] += wt[i]; rtot++; } total_ss = 0; for (i = 0; i < numclass; i++) { temp = aprior[i] * right[i] / rwt; /* p(class=i, given node A) */ total_ss += rwt * (*impurity) (temp); /* p(A) * I(A) */ } best = total_ss; /* total weight of right * impurity of right + 0 *0 */ /* * at this point we split into 2 disjoint paths */ if (numcat > 0) goto categorical; for (i = 0; rtot > edge; i++) { j = (int) *y[i] - 1; rwt -= aprior[j] * wt[i]; lwt += aprior[j] * wt[i]; rtot--; ltot++; right[j] -= wt[i]; left[j] += wt[i]; if (ltot >= edge && x[i + 1] != x[i]) { temp = 0; lmean = 0; rmean = 0; for (j = 0; j < numclass; j++) { p = aprior[j] * left[j] / lwt; /* p(j | left) */ temp += lwt * (*impurity) (p); /* p(left) * I(left) */ lmean += p * j; p = aprior[j] * right[j] / rwt; /* p(j | right) */ temp += rwt * (*impurity) (p); /* p(right) * I(right) */ rmean += p * j; } if (temp < best) { best = temp; where = i; direction = lmean < rmean ? LEFT : RIGHT; } } } *improve = total_ss - best; if (*improve > 0) { /* found something */ csplit[0] = direction; *split = (x[where] + x[where + 1]) / 2; } return; categorical:; /* * First collapse the data into a numclass x numcat array * ccnt[i][j] = number of class i obs, category j of the predictor */ for (j = 0; j < numcat; j++) { awt[j] = 0; countn[j] = 0; for (i = 0; i < numclass; i++) ccnt[i][j] = 0; } for (i = 0; i < n; i++) { j = (int) *y[i] - 1; k = (int) x[i] - 1; awt[k] += aprior[j] * wt[i]; countn[k]++; ccnt[j][k] += wt[i]; } for (i = 0; i < numcat; i++) { if (awt[i] == 0) tsplit[i] = 0; else { rate[i] = ccnt[0][i] / awt[i]; /* a scratch array */ tsplit[i] = RIGHT; } } if (numclass == 2) graycode_init2(numcat, countn, rate); else graycode_init1(numcat, countn); while ((i = graycode()) < numcat) { /* item i changes groups */ if (tsplit[i] == LEFT) { tsplit[i] = RIGHT; rwt += awt[i]; lwt -= awt[i]; rtot += countn[i]; ltot -= countn[i]; for (j = 0; j < numclass; j++) { right[j] += ccnt[j][i]; left[j] -= ccnt[j][i]; } } else { tsplit[i] = LEFT; rwt -= awt[i]; lwt += awt[i]; rtot -= countn[i]; ltot += countn[i]; for (j = 0; j < numclass; j++) { right[j] -= ccnt[j][i]; left[j] += ccnt[j][i]; } } if (ltot >= edge && rtot >= edge) { temp = 0; lmean = 0; rmean = 0; for (j = 0; j < numclass; j++) { p = aprior[j] * left[j] / lwt; temp += lwt * (*impurity) (p); lmean += p * j; p = aprior[j] * right[j] / rwt; /* p(j | right) */ temp += rwt * (*impurity) (p); /* p(right) * I(right) */ rmean += p * j; } if (temp < best) { best = temp; if (lmean < rmean) for (j = 0; j < numcat; j++) csplit[j] = tsplit[j]; else for (j = 0; j < numcat; j++) csplit[j] = -tsplit[j]; } } } *improve = total_ss - best; }