#### Regression Tests that need "much" memory
#### (and / or  are slow even with enough GBytes of memory)

print(si <- sessionInfo(), locale=FALSE)
Sys.info()
## Run (currently _only_)  when inside tests/  by
'
time   make test-large
' # giving ~ 35 min [R-devel 2019-01]

## From CRAN package 'sfsmisc':
Sys.memGB <- function (kind = "MemTotal")
{
    mm <- drop(read.dcf("/proc/meminfo", fields = kind))
    if (any(is.na(mm))) {
        warning("Non-existing 'kind': ", names(mm)[is.na(mm)][1])
        0
    } else if (!all(grepl(" kB$", mm)))  {
        warning("Memory info ", dQuote(kind),
                " is not returned in 'kB' aka kiloBytes")
        0
    } else
        as.numeric(sub(" kB$", "", mm))/(1000 * 1024)
}

availableGB <-
    if(file.exists("/proc/meminfo")) { # e.g. on Linux
	Sys.memGB("MemAvailable")
    } else {
	0 # unless we add something better here
    }
cat("Available (processor aka CPU) memory: ", round(availableGB, 1),
    "GB (Giga Bytes)\n")

if(.Machine$sizeof.pointer < 8) {
    cat(".Machine :\n"); str(.Machine)
    cat("not a 64-bit system -- forget about these tests!\n")
    q("no")
}

### Testing  readLines()  *large* file with embedded nul aka `\0'
##
## takes close to one minute and ~ 10 GB RAM
if(availableGB > 11) local(withAutoprint({
    ## File construction originally by Bill Dunlap, Cc: R-help,
    ##    Subject: Re: [R] readLines without skipNul=TRUE causes crash
    ##    Date: Mon, 17 Jul 2017 08:36:55 -0700
    tf <- tempfile(); file <- file(tf, "wb")
    txtLine <- c(rep(as.raw(32:127), 2^5), charToRaw("\n")) # <- want many lines
    system.time({
        for(i in 1:(2^15-1)) writeBin(rep_len(txtLine,    2^16), file)
        for(i in 1:(2^15-1)) writeBin(rep_len(as.raw(0L), 2^16), file)
    })
    close(file)
    log2(file.size(tf)) ## 31.99996
    ## now, this gave a segmentation fault, PR#17311 :
"FIXME: on 32-bit Linux (F 24), still see
Program received signal SIGSEGV, Segmentation fault.
... in do_readLines (call=0x8.., op=0x8.., ....)
    at ../../../R/src/main/connections.c:3852
3852		    if(c != '\n') buf[nbuf++] = (char) c; else break;
"
  if(.Machine$sizeof.pointer > 4) withAutoprint({
    system.time( x <- readLines(tf) ) # depending on disk,.. takes 15-50 seconds
    ##                ---------
    str(ncx <- nchar(x, "bytes"))
    ## int [1:688108] 3072 3072 3072 3072 3072 3072 3072 3072 ...
    tail(ncx) # ... 3072 3072 3072 1003
    table(ncx) # mostly 3072, then some 4075 and the last one
    head(iL <- which(ncx == 4075))
    stopifnot(diff(iL) == 21)
  }) else cat("32-bit: still seg.faulting - FIXME\n")
}))
## + 2 warnings


### Testing PR#17992  c() / unlist() name creation for large vectors
## Part 1
if(availableGB > 21) system.time({
    res <- c(a=raw(2), raw(2^31-1))
}) ## 36--44 sec elapsed (ada-16, ~ 120 GB available) after fix
## In R <= 3.4.1, took  51 sec elapsed, and gave Error .. :
##  attempt to set index 18446744071562067968/2147483649 in SET_STRING_ELT
##
if(FALSE) { # object.size() itself is taking a lot of time!
    os <- object.size(res)
} else {
    os <- structure(19327353184, class = "object_size")
    print(os, units = "GB") # 18
}
if(exists("res")) rm(res)
gc(reset = TRUE) # for the next step

### Testing PR#17992  c() / unlist() name creation for large vectors
## Part 2 (https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17292#c4):
if(availableGB > 37) system.time({
    res <- c(a = list(rep(c(b=raw(1)), 2^31-2), raw(2)), recursive=TRUE)
})
## 437 sec elapsed (ada-16, ~ 120 GB available) after fix; then ada-20: 566 sec
## In R <= 3.4.1, took  475 sec  elapsed, and gave Error .. :
##    could not allocate memory (2048 Mb) in C function 'R_AllocStringBuffer'
## ((and that error msg is incorrect because of int overflow))
if(exists("res")) withAutoprint({
str(res) # is fast!
## Named raw [1:2147483648] 00 00 00 00 ...
## - attr(*, "names")= chr [1:2147483648] "a.b" "a.b" "a.b" "a.b" ...
gc() # back to ~ 18.4 GB
rm(res)
})
gc(reset = TRUE) # for the next step

## Large string's encodeString() -- PR#15885
if(availableGB > 4) system.time(local(withAutoprint({
    txt <- strrep("test me:", 53687091); object.size(txt) # 429'496'824 bytes
    nc <- nchar(txt) ## NB this is larger than maximal integer:
    nc*5L+8L # NA + Warning   'NAs produced by integer overflow'
    en <- encodeString(txt)
    ## encodeString() seg.faulted in R <= 3.4.1
    stopifnot(identical(txt,en)) # encoding did not change simple ASCII
})))
## 52 sec elapsed [nb-mm4, 8 GB]; then 66.7 [ada-20; much more GB]


## pretty(x, n) for n = <large> or  large diff(range(x) gave overflow in C code
if(availableGB > 6) system.time(withAutoprint({
    r <- pretty(c(-1,1)*1e300, n = 449423288, min.n = 1)
    head(r) ; length(r) # was only 21 in  R < 3.5.0
    stopifnot(all.equal(length(r), 400000001, tol = 0.1))
})) ## 4.8--5.5 sec.
rm(r)
gc()

n <- 4e4 # << for quick testing, comment next line
n <- 2.2e9

if(availableGB > 60) withAutoprint({
    n/.Machine$integer.max  # 1.024 ==> need  long vectors!
    ii <- seq_len(n)          #   user  system elapsed  [seq_len() fast: ALTREP "compact"]
    system.time(ii <- ii + 0) #  6.726  17.558  24.450 (slow!, seen faster)
    system.time(i2 <- ii[-n]) # 14.267  23.532  37.918 (slow!, seen slower: el.= 51)
    ##
    ## NB: keep n, i, i2 for "below"
})
## In R <= 3.4.1 :
## Program received signal SIGSEGV, Segmentation fault.
## 0x00000000005a0daf in realSubscript (call=0x3f01408, stretch=<optimized out>,
##     nx=2200000000, ns=1, s=0x426db18) at ../../../R/src/main/subscript.c:691
## 691			    LOGICAL(indx)[ix] = 0;

if(availableGB > 99) withAutoprint({
    system.time( x <- ii/n )            #   5.45 user; 11.5--14.36 elapsed
    system.time( y <- sin(pi*x) )       #  42 user; 48.9--..  elapsed
    system.time(sorted <- !is.unsorted(x)) # ~ 4 elapsed
    stopifnot(sorted)
    ## default n (= "nout") = 50:
    system.time(ap1 <- approx(x,y, ties = "ordered"))# 15 user; 25 elapsed
    stopifnot(exprs = {
	is.list(ap1)
	names(ap1) == c("x","y")
	length(ap1$x) == 50
	all.equal(ap1$y, sin(pi*ap1$x), tol= 1e-9)
    })
    rm(ap1) # keep x,y,n,i2
    gc()     # --> max used: 92322 Mb
})

## which() and ifelse() working for long vectors
if(availableGB > 165) withAutoprint({
    system.time(iis <- which(isMl <- ii < 9999)) # 5.8 user,  8.8 elapsed
    gc() # 59 GB max used
    system.time(r <- ifelse(isMl, ii, ii*1.125)) #        user  system elapsed
    stopifnot(exprs = {                 # in R 3.5.2 : 124.989 174.726 300.656
	## GB's ifelse() + using which(<long>) 3.6.0 :  71.815  81.823 154.124
	length(r) == n
        iis == seq_len(9998)
    })
    rm(isMl, iis, r)
})
gc() # 159 GB max used

if(availableGB > 211) withAutoprint({ ## continuing from above
    ## both large (x,y) *and* large output (x,y):
    system.time(xo <- x + 1/(2*n))     # ~ 9 elapsed
    system.time(ap <- approx(x,y, ties = "ordered", xout = xo))
                                       # 194 user, 214--500 elapsed
    gc(reset = TRUE) # showing max.used ~ 1..... Mb
    stopifnot(exprs = {
	is.list(ap)
	names(ap) == c("x","y")
	length(ap$x) == n
	is.na(ap$y[n]) # because ap$x[n] > 1, i.e., outside of [0,1]
	all.equal(ap$y[i2], sin(pi*xo[i2]), tol= if(n < 1e7) 1e-8 else 1e-15)
    })
    rm(ap); gc() # showing used 83930 Mb | max.used 210356.6 Mb
    ## only large x,y :
    system.time(apf <- approxfun(x,y, ties="ordered", rule = 2))# elapsed: ~26s
    xi <- seq(0, 1, by = 2^-12) ## linear interpol. is less accurate than spline:
    stopifnot(all.equal(apf(xi), sin(pi*xi), tol= if(n < 1e7) 1e-7 else 1e-11))
    rm(apf); gc() # (~ unchanged)
    system.time(ssf <- splinefun(x,y, ties = "ordered"))
                                        # elapsed 120 s; using ~ 158 GB
    system.time(ss  <- spline   (x,y, ties = "ordered", xout = xi))
                                        # elapsed 126--265 s; using ~ 207 GB
    gc()
    stopifnot(exprs = {
	is.list(ss)
	names(ss) == c("x","y")
	length(ss$y) == length(xi)
	all.equal(ss$y   , sin(pi*xi), tol= 1e-15)
	all.equal(ssf(xi), ss$y,       tol= 1e-15)
    })
    rm(x, y, xo, ss, ssf) # remove long vector objects
    gc(reset=TRUE)
})

## sum(<Integer|Logical>) -- should no longer overflow: ----------------------------------------
## 1) sum(<long logical>) == counting
if(availableGB > 24) withAutoprint({
    system.time(L <- rep.int((0:15) %% 7 == 2, 2^28))# -> length 2^32; ~ 22 sec
    print(object.size(L), unit="GB") # 16 GB
    system.time(sL <- sum(L)) # 8.4 sec
    stopifnot(exprs = {
        is.logical(L)
        length(L) == 2^32
        !is.integer(length(L))
        is.integer(sL)
        identical(sL, as.integer(2^29))
    })
}) ## sL would be NA with an "integer overflow" warning in R <= 3.4.x
gc(reset=TRUE)

## 2) many (and relatively long and large) integers
L <- as.integer(2^31 - 1)## = 2147483647L = .Machine$integer.max ("everywhere")
## a "small" example with this is in ./reg-tests-1d.R (see 'x24')
if(availableGB > 12) withAutoprint({
    system.time(x31 <- rep.int(L, 2^31+1)) # sum = 2^62 - 1 =.= 2^62 // ~ 5.5 sec
    print(object.size(x31), unit = "GB") # 8 G
    system.time(S <- sum(x31)) # ~ 2 sec
    system.time(S.4 <- sum(x31, x31, x31, x31)) # 8 sec
    stopifnot(is.integer(x31),
              identical(S,   2^62),
              identical(S.4, 2^64))
    system.time(x32 <- c(x31, x31)) # 13 user | 20.8 elapsed  (and 16 GB)
    rm(x31)# now,  sum vvv  will switch to use irsum() [double accumulator]
    system.time(S.2 <- sum(x32)) # 8 sec
    stopifnot(S.2 == 2^63)
    rm(x32)
})


## seq() remaining integer: (PR 17497, comment #9)
if(availableGB > 16) withAutoprint({
    i <- as.integer(2^30)
    system.time(i2.31 <- seq(-i, by=1L, length=2*i+1)) # 11.1 user | 19.2 elapsed
    object.size(i2.31) # 8'589'934'648 bytes [ was 17.17 GB in R <= 3.5.x ]
    stopifnot(is.integer(i2.31),  i2.31[1] == -i,  i2.31[length(i2.31)] == i)

    ## pmax(), pmin() with long vectors, PR 17533
    if(availableGB > 24) withAutoprint({
        system.time(i2.31 <- pmin(i2.31, 0L)) # 7.2 sec user | 11.2 elapsed
        str(i2.31)
        system.time(stopifnot(i2.31[(i+1):length(i2.31)] == 0)) # 16.7 user | 28.0 elapsed
    })
})


## match(<long character>, *)  PR#17552
if(availableGB > 44) withAutoprint({ ## seen 40 G ('RES')
    system.time(m <- match(rep("a", 2^31), "a")) # 34.7 sec user (55 elapsed)
    stopifnot(all(m == 1L))
    rm(m)
    system.time({x <- character(2^31); x[26:1] <- letters }) # 1.6 user | 9.4 elapsed
    system.time(m <- match(x, "a"))# 18.2 user | 51.6 elapsed
    head(m, 30)
    system.time(stopifnot(m[26] == 1L, is.na(m[-26])))
    rm(x, m)
})


## readBin() and writeBin() for long rawConnection s, PR#17665
## -------       --------            -------------
if(availableGB > 14) withAutoprint({ ## seen 11.6 G
    vec <- rep(0, 3e8) # object.size(vec) > 2^31
    raw_con <- rawConnection(serialize(vec, NULL)) # ~ 5 sec.
    ## Stepping through this connection gives an error after the 2^31st element:
    repeat {
        x <- readBin(raw_con, "raw", n = 1e+06)
        if(length(x) == 0)
            break
        cat(".")
    }; cat("\n")
    ## Error in readBin(raw_con, "raw", n = 1e+06) : too large a block specified
})

## writeBin() for long vectors
if(availableGB > 20) withAutoprint({ ## seen 20.9 G
    x <- raw(2^31)
    writeBin(x, con = nullfile())

    con <- rawConnection(raw(0L), "w")
    writeBin(x, con = con)
    stopifnot(identical(x, rawConnectionValue(con)))

    system.time(x <- pi*seq_len(2.1*2^30)) # 25 sec
    zzfil <- tempfile("test-large-bin")
    zz <- file(zzfil, "wb") ## file size will be 2.5 GB !!!
    system.time(z <- writeBin(x, zz)) # 32 sec
    stopifnot(is.null(z))
    close(zz); zz <- file(zzfil, "rb")
    system.time(r <- readBin(zz, double(), n = length(x) + 999)) # 32 sec
    system.time(stopifnot(identical(x, r))) # 24 sec
    close(zz); rm(r, zz)
})


## predict(loess(.), se=TRUE) for "large" sample size -- PR#17121
## No need for very much memory, but is slow and should do several ex.
mkDat <- function(n) {
    x <- 5*(1:n)/(n+1)
    data.frame(x = x, y = sin(pi*x^2) * exp(-x/2) + rnorm(n)/8)
}
set.seed(1); dat <- mkDat(n = 42000)
system.time( # 14.5 sec (on lynne ~ 2019)
    fit <- loess(y~x, data=dat)
)
r <- tools::assertError(
   predict(fit, newdata=data.frame(x=.5), se=TRUE)
 , verbose=TRUE) #
## typically would not seg.fault but give Calloc(..) error (with *wrong* size)
stopifnot(grepl("^workspace .* is too large .* 'se = TRUE'", r[[1]]$message))



gc() # NB the "max used"
proc.time() # total  [ ~ 40 minutes in full case, 2019-04-12]