R version 3.0.0 beta (2013-03-25 r62402) -- "Masked Marvel" Copyright (C) 2013 The R Foundation for Statistical Computing ISBN 3-900051-07-0 Platform: x86_64-unknown-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > ### Clara with really LARGE data set --- CPU testing > library(cluster) > > > ## generate 15*N objects, divided into 2 clusters. > N <- 10000 > ## For back-compatibility: > if(R.version$major != "1" || as.numeric(R.version$minor) >= 7) RNGversion("1.6") Warning message: In RNGkind("Marsaglia-Multicarry", "Buggy Kinderman-Ramage") : buggy version of Kinderman-Ramage generator used > set.seed(521) > x <- rbind(cbind(rnorm(7*N, 0,8), rnorm(7*N, 0,8)), + cbind(rnorm(8*N,50,8), rnorm(8*N,10,8))) > .proctime00 <- proc.time() > for(nn in 1:3) + print(clara2 <- clara(x[sample(nrow(x)),], 2, sampsize = 128, samples= 500)) Call: clara(x = x[sample(nrow(x)), ], k = 2, samples = 500, sampsize = 128) Medoids: [,1] [,2] [1,] 49.88899769 10.19515443 [2,] -0.02950822 0.07221797 Objective function: 10.03208 Clustering vector: int [1:150000] 1 1 1 1 1 2 2 2 1 2 2 1 1 1 1 2 2 2 ... Cluster sizes: 79995 70005 Best sample: [1] 4780 5896 6755 6965 7570 9790 10215 11319 11655 13685 [11] 14578 16700 18583 19389 20392 21392 22106 22147 24356 24457 [21] 25280 25315 25972 27549 30094 30858 32033 34276 38148 38714 [31] 41208 43442 49109 49466 50322 50881 53808 55166 55438 56486 [41] 56594 56749 57102 57413 59290 59544 60878 60970 63263 63826 [51] 66683 67644 69121 69299 69539 70093 70338 72995 76454 76928 [61] 77447 79800 81254 83059 84410 84813 84870 86419 87546 88887 [71] 89200 90679 91393 92837 94064 94801 96097 97827 99898 100583 [81] 100917 101118 102503 102755 105611 106218 106476 107023 107101 108001 [91] 108429 108468 109678 110081 111567 111571 113306 113318 113350 115771 [101] 117401 118852 120271 123487 125581 126037 127309 128083 128305 128737 [111] 131035 131262 132502 133022 133173 134450 134823 134947 139410 140360 [121] 141342 144734 145370 145762 146215 147860 148307 148357 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" Call: clara(x = x[sample(nrow(x)), ], k = 2, samples = 500, sampsize = 128) Medoids: [,1] [,2] [1,] 49.829401 10.06984380 [2,] 0.185889 -0.06736435 Objective function: 10.033 Clustering vector: int [1:150000] 1 1 1 1 2 2 2 2 2 1 1 1 2 2 1 1 1 2 ... Cluster sizes: 79991 70009 Best sample: [1] 1820 2385 4880 5956 6897 7332 11664 12781 13138 15045 [11] 19352 20167 20421 21085 22813 23216 23584 24017 24550 24994 [21] 25420 25979 26283 30263 30435 31691 31987 33211 33271 33694 [31] 35425 36068 38180 38924 40600 41781 42007 42742 43472 44699 [41] 46026 46731 48082 48404 50915 51217 55994 57736 61753 64589 [51] 65777 66175 66491 66797 69084 69283 69890 72139 72245 72421 [61] 74996 76978 77022 77646 77852 78940 79443 80192 81073 81718 [71] 82524 82936 83098 85458 86669 89052 89253 91755 91977 93540 [81] 94707 94934 96694 97387 97410 98122 101134 101484 102176 102407 [91] 103082 103835 104032 107200 107399 109388 109433 111953 112054 113361 [101] 113716 115213 118451 119621 120191 120637 121312 122276 122679 123093 [111] 123455 131351 132255 132869 134063 134968 135064 135778 138952 138987 [121] 139298 140058 141406 143036 145100 145704 147545 147947 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" Call: clara(x = x[sample(nrow(x)), ], k = 2, samples = 500, sampsize = 128) Medoids: [,1] [,2] [1,] 50.091093670 10.037950 [2,] 0.009723334 0.108578 Objective function: 10.03159 Clustering vector: int [1:150000] 1 2 1 2 2 1 1 2 1 2 2 1 2 2 2 1 2 1 ... Cluster sizes: 79988 70012 Best sample: [1] 4180 5558 6036 6167 6189 7595 8057 10607 10925 11561 [11] 11879 13308 13379 15008 16057 16862 17743 23994 28691 28869 [21] 29497 29908 30785 32877 32927 36024 36226 37018 40467 44383 [31] 44536 45250 45990 46225 48457 50647 51270 52408 54371 55667 [41] 56717 57397 60153 60689 61451 61858 62119 64664 65317 65429 [51] 65445 66594 66671 67610 67999 68704 69249 69652 72888 73190 [61] 76227 78013 79228 79709 79841 79894 84023 84893 85451 88505 [71] 89736 91164 91672 92073 92677 92684 93860 93906 94394 94517 [81] 94920 95540 97834 98397 98793 100072 100912 101253 101416 102215 [91] 102924 103045 104304 104664 107431 107566 113949 115824 116325 116906 [101] 117140 119360 119440 119786 120189 123256 123459 123771 124148 124531 [111] 125055 125249 125963 128756 128959 131612 133926 134027 134469 137185 [121] 137325 139353 142571 145054 146137 146142 147254 147611 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" > cat('Time elapsed: ', proc.time() - .proctime00,'\n') Time elapsed: 13.134 0.009 13.193 0 0 > ## nb-mm, with cluster 1.6-4 : 85.2 (elapsed 91.7) > > ###-- Larger example: 20*N objects, divided into 5 clusters. > N <- 20000 # 100'000 would give swapping like crazy for 256 MB RAM > x5 <- rbind(cbind(rnorm(4*N, 0,4), rnorm(4*N, 0,4)), + cbind(rnorm(4*N,10,8), rnorm(4*N,40,6)), + cbind(rnorm(4*N,30,4), rnorm(4*N, 0,4)), + cbind(rnorm(4*N,40,4), rnorm(4*N,20,2)), + cbind(rnorm(4*N,50,4), rnorm(4*N,50,4))) > ## plus 1 random dimension > x5 <- cbind(x5, rnorm(nrow(x5))) > > .proctime00 <- proc.time() > for(nn in 1:3) + print(clara(x5[sample(nrow(x5)),], 5, samples= 100)) Call: clara(x = x5[sample(nrow(x5)), ], k = 5, samples = 100) Medoids: [,1] [,2] [,3] [1,] 41.224583 19.87794800 0.44644652 [2,] 50.198633 50.52874736 0.41713900 [3,] -1.116743 -0.61545329 -0.01433129 [4,] 10.480461 40.99909160 -0.83618494 [5,] 30.373695 -0.07214946 0.05324887 Objective function: 5.753006 Clustering vector: int [1:400000] 1 2 3 2 3 2 4 4 3 5 2 5 2 5 1 5 4 1 ... Cluster sizes: 80575 80225 80038 79306 79856 Best sample: [1] 26722 33924 43964 45752 50483 67536 69874 80530 106348 108637 [11] 110810 127118 129499 142328 149683 164087 172663 202009 202204 224817 [21] 234113 237732 238459 242896 248731 260828 271332 274946 275422 277540 [31] 285279 294501 307300 316791 316944 327656 328492 332435 334339 338190 [41] 346882 357020 358844 360273 364637 376227 378626 379767 388135 392804 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" Call: clara(x = x5[sample(nrow(x5)), ], k = 5, samples = 100) Medoids: [,1] [,2] [,3] [1,] 39.858204 19.6762455 -0.1890898 [2,] 1.144668 -0.3181352 0.5330045 [3,] 9.238556 41.3575421 -0.3500917 [4,] 49.468401 50.8443023 -0.2939720 [5,] 29.003122 -0.1621678 -0.4323139 Objective function: 5.750034 Clustering vector: int [1:400000] 1 2 3 3 1 4 4 4 5 5 2 1 2 4 4 1 1 3 ... Cluster sizes: 80935 80057 79006 80302 79700 Best sample: [1] 1117 5939 18976 24903 33191 35499 36982 40809 42963 47053 [11] 53290 53711 93116 100989 106287 110096 110346 125855 140259 164307 [21] 164362 173053 176655 185016 202399 208606 219068 220673 234809 239106 [31] 251111 253827 254798 260285 261451 264631 268226 268500 276221 282520 [41] 293708 301594 302894 304664 313904 323322 337000 356702 370136 381745 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" Call: clara(x = x5[sample(nrow(x5)), ], k = 5, samples = 100) Medoids: [,1] [,2] [,3] [1,] -0.1491132 0.1494221 0.1276721 [2,] 10.4575175 40.0379897 1.2770739 [3,] 40.3665263 19.5020324 0.0199130 [4,] 49.5882360 49.5518009 0.3284041 [5,] 30.4089200 0.6629704 0.1832471 Objective function: 5.715827 Clustering vector: int [1:400000] 1 2 3 1 2 4 1 2 4 3 5 5 1 3 1 5 1 5 ... Cluster sizes: 80054 79249 80574 80291 79832 Best sample: [1] 11780 39887 42060 73578 80933 86823 95337 102125 104163 149054 [11] 156067 165363 168982 169709 174146 179981 184461 190577 192078 208790 [21] 216529 225751 228400 248041 248194 258906 259742 269440 278132 282746 [31] 288270 298011 307270 307477 309876 311017 313624 315424 319385 319642 [41] 320337 324054 342597 351172 357972 361365 365174 375214 381733 398786 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" > ## Last Line: > cat("Time elapsed: ", proc.time() - .proctime00,"\n") Time elapsed: 13.98 0.014 14.048 0 0 > ## nb-mm, with cluster 1.6-4 : 74.4 (elapsed 88.2) > ## ~~~~~ 1.7-* quite faster : 67.6 (elapsed 68.7) > > proc.time() user system elapsed 27.72 0.07 27.97