R Under development (unstable) (2023-10-19 r85356) -- "Unsuffered Consequences" Copyright (C) 2023 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > ### Clara with really LARGE data set --- CPU testing > if(!cluster:::doExtras()) q("no") > > library(cluster) > > ## generate 15*N objects, divided into 2 clusters. > N <- 10000 > suppressWarnings(RNGversion("3.5.0")) # << as long as we don't have R >= 3.6.0 > set.seed(521) > x <- rbind(cbind(rnorm(7*N, 0,8), rnorm(7*N, 0,8)), + cbind(rnorm(8*N,50,8), rnorm(8*N,10,8))) > .proctime00 <- proc.time() > > for(nn in 1:3) + print(clara2 <- clara(x[sample(nrow(x)),], 2, sampsize = 128, samples= 500)) Call: clara(x = x[sample(nrow(x)), ], k = 2, samples = 500, sampsize = 128) Medoids: [,1] [,2] [1,] 49.9788949 9.9439603 [2,] 0.1102125 -0.2841381 Objective function: 10.02584 Clustering vector: int [1:150000] 1 1 2 2 2 2 2 1 1 2 2 2 1 1 1 2 1 1 ... Cluster sizes: 79981 70019 Best sample: [1] 1477 1701 4107 4356 4759 4821 5038 9625 10218 11557 [11] 14191 14335 18496 18597 19282 21021 21428 21756 24235 24999 [21] 29625 30005 30028 31534 34102 34793 35798 37583 37880 39309 [31] 39464 40016 40606 40888 43390 43593 44463 45022 45495 46132 [41] 46333 46541 48075 49398 51433 52254 53476 54893 55111 56022 [51] 57404 58564 59643 61785 62494 63611 64067 64479 67136 69370 [61] 70595 70796 71588 73394 74023 74108 74415 75653 77717 79862 [71] 80162 82613 83030 84101 85264 85470 88326 89514 89949 90716 [81] 91182 91968 94286 98923 100015 101164 101970 102180 102784 103274 [91] 104222 106634 106824 107181 107760 108751 108900 111452 113052 113750 [101] 113798 114279 114412 114604 114817 115128 115737 116311 117165 117627 [111] 120797 121450 127247 127808 128644 128930 129490 131072 133363 133532 [121] 135986 137348 137616 138968 142001 142585 146915 148520 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" Call: clara(x = x[sample(nrow(x)), ], k = 2, samples = 500, sampsize = 128) Medoids: [,1] [,2] [1,] 49.9175264 10.1438794 [2,] 0.1102125 -0.2841381 Objective function: 10.02635 Clustering vector: int [1:150000] 1 1 2 2 2 2 2 2 2 1 2 1 2 1 2 1 1 1 ... Cluster sizes: 79983 70017 Best sample: [1] 2967 6883 7036 7750 8490 8725 10957 13147 13770 14908 [11] 16871 17121 17878 18167 19217 19897 22653 23189 23951 24358 [21] 24619 27164 27929 27945 29094 29171 30110 30499 31204 31749 [31] 32152 35388 35690 38727 40513 41728 42209 42341 42394 44475 [41] 46523 47393 47951 51005 52236 53664 54172 54573 55177 55184 [51] 56360 56406 56894 57017 57420 58040 60334 60897 61293 62572 [61] 63412 63753 63916 64715 65424 65545 67164 69931 70066 76449 [71] 78324 78825 79406 79640 81940 82286 82689 85756 85959 86271 [81] 86648 87031 87555 87749 88463 91256 91459 94112 96426 96527 [91] 96969 97216 99685 99825 101853 105071 107554 108637 109463 109754 [101] 110111 116680 118058 118536 118667 120095 120557 123107 123425 124061 [111] 124379 125808 125879 127508 128557 129362 130243 134379 136494 141191 [121] 141369 141997 142408 143285 145427 148524 148726 149518 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" Call: clara(x = x[sample(nrow(x)), ], k = 2, samples = 500, sampsize = 128) Medoids: [,1] [,2] [1,] 0.1392328 -0.1579963 [2,] 49.9788949 9.9439603 Objective function: 10.02523 Clustering vector: int [1:150000] 1 2 2 2 1 1 2 2 2 2 2 2 1 1 2 2 1 1 ... Cluster sizes: 70020 79980 Best sample: [1] 1770 2921 3843 4223 4246 4992 5420 5752 6386 7256 [11] 7815 8320 9012 12099 13527 13683 14235 15047 15107 17904 [21] 18789 18890 19714 20350 20552 20760 22049 22294 22435 23617 [31] 24578 26457 27473 27695 29112 29515 30240 32783 36713 37830 [41] 38286 39091 39705 40899 43589 47612 48242 48327 48633 49872 [51] 51936 53975 54081 54381 54783 56832 58656 59482 59688 62027 [61] 62545 63733 64168 64934 68505 71881 73142 73591 75376 76188 [71] 77003 77493 79649 80853 82970 83119 84243 85671 87271 87969 [81] 88498 88823 89036 89956 90530 91224 91384 92260 95016 95197 [91] 100308 101466 101535 102027 102862 103148 104929 105291 107751 108053 [101] 111306 111834 113187 116803 116900 117614 120823 121134 121894 125920 [111] 126726 128978 129257 133844 134437 135775 136279 137792 138410 138554 [121] 139772 142294 143500 144617 144974 145647 145887 146089 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" > cat('Time elapsed: ', proc.time() - .proctime00,'\n') Time elapsed: 4.313 0.009 4.328 0 0 > ## nb-mm, with cluster 1.6-4 : 85.2 (elapsed 91.7) > > ###-- Larger example: 20*N objects, divided into 5 clusters. > N <- 20000 # 100'000 would give swapping like crazy for 256 MB RAM > x5 <- rbind(cbind(rnorm(4*N, 0,4), rnorm(4*N, 0,4)), + cbind(rnorm(4*N,10,8), rnorm(4*N,40,6)), + cbind(rnorm(4*N,30,4), rnorm(4*N, 0,4)), + cbind(rnorm(4*N,40,4), rnorm(4*N,20,2)), + cbind(rnorm(4*N,50,4), rnorm(4*N,50,4))) > ## plus 1 random dimension > x5 <- cbind(x5, rnorm(nrow(x5))) > > .proctime00 <- proc.time() > for(nn in 1:3) + print(clara(x5[sample(nrow(x5)),], 5, samples= 100)) Call: clara(x = x5[sample(nrow(x5)), ], k = 5, samples = 100) Medoids: [,1] [,2] [,3] [1,] 51.1002305 50.1460959 0.1331874 [2,] 9.1252025 39.6689092 -0.4555514 [3,] 29.5625766 1.1976994 -0.9082990 [4,] 39.8055168 20.2326953 0.5889404 [5,] 0.2465901 0.8764563 -0.6630726 Objective function: 5.769338 Clustering vector: int [1:400000] 1 2 3 3 1 4 4 3 2 3 1 2 4 1 3 3 1 1 ... Cluster sizes: 80240 79099 79851 80781 80029 Best sample: [1] 13178 26722 33924 43964 45752 50483 67536 75464 80530 106348 [11] 108637 110810 127118 129499 142328 144721 149683 164087 224817 234113 [21] 237732 238459 242896 248731 260828 274946 275422 277540 285279 294501 [31] 307300 316791 316944 327656 328492 332435 334339 338190 346882 350599 [41] 353437 357020 358844 360273 364637 376227 378626 379767 388135 392804 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" Call: clara(x = x5[sample(nrow(x5)), ], k = 5, samples = 100) Medoids: [,1] [,2] [,3] [1,] 30.0804463 -0.9518318 -0.3667627 [2,] 40.4359809 19.4916731 0.5483652 [3,] -0.8077057 0.4248833 0.1078008 [4,] 10.5285200 41.9423158 0.7195955 [5,] 49.6862611 49.7783134 0.1508043 Objective function: 5.760039 Clustering vector: int [1:400000] 1 2 2 3 1 2 1 1 1 3 3 3 2 1 4 2 5 2 ... Cluster sizes: 79697 80784 80058 79206 80255 Best sample: [1] 43488 47107 47834 50916 52271 70203 84321 84797 86915 116675 [11] 126166 126319 130451 137031 137867 141810 143714 147565 152802 156257 [21] 166395 168219 169648 174012 188001 189142 192475 197510 202179 214356 [31] 236097 243299 253339 255127 259858 275190 276911 282416 289905 294117 [41] 314289 315723 318012 320185 331080 336493 338874 351703 359058 373462 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" Call: clara(x = x5[sample(nrow(x5)), ], k = 5, samples = 100) Medoids: [,1] [,2] [,3] [1,] 10.6835651 40.49264498 0.37786959 [2,] 0.7370939 -0.09727728 -0.52975660 [3,] 30.9806583 -0.03202827 -0.12876557 [4,] 39.1749167 20.45583496 -0.08582161 [5,] 49.4757529 49.73885441 0.25604496 Objective function: 5.725015 Clustering vector: int [1:400000] 1 2 3 2 4 5 2 2 5 4 4 3 5 3 2 2 1 2 ... Cluster sizes: 79138 80053 79818 80737 80254 Best sample: [1] 3217 11579 28962 51465 59754 61371 67371 80274 80390 86847 [11] 91193 95063 110010 113007 119678 120270 128156 136658 152417 166822 [21] 173810 182233 190870 196698 227680 232502 235169 237073 245539 245630 [31] 248487 253705 259870 263544 269526 277717 279853 281360 309083 310035 [41] 329456 331226 335834 340467 344275 346692 363563 383265 390925 399616 Available components: [1] "sample" "medoids" "i.med" "clustering" "objective" [6] "clusinfo" "diss" "call" "silinfo" "data" > ## Last Line: > cat("Time elapsed: ", proc.time() - .proctime00,"\n") Time elapsed: 4.214 0.022 4.242 0 0 > ## nb-mm, with cluster 1.6-4 : 74.4 (elapsed 88.2) > ## ~~~~~ 1.7-* quite faster : 67.6 (elapsed 68.7) > > proc.time() user system elapsed 8.860 0.141 9.197