### This file is part of the 'foreign' package for R. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # http://www.r-project.org/Licenses/ read.dta <- function(file, convert.dates = TRUE, convert.factors = TRUE, missing.type = FALSE, convert.underscore = FALSE, warn.missing.labels = TRUE) { if(length(grep("^(http|ftp|https)://", file))) { tmp <- tempfile() download.file(file, tmp, quiet = TRUE, mode = "wb") file <- tmp on.exit(unlink(file)) } rval <- .External(do_readStata, file) if(convert.underscore) names(rval) <- gsub("_", ".", names(rval)) types <- attr(rval, "types") stata.na <- data.frame(type = 251L:255L, min = c(101, 32741, 2147483621, 2^127, 2^1023), inc = c(1,1,1,2^115,2^1011) ) if(!missing.type) { if (abs(attr(rval, "version")) >= 8L) { for(v in which(types > 250L)) { this.type <- types[v] - 250L rval[[v]][rval[[v]] >= stata.na$min[this.type]] <- NA } } } else { if (abs(attr(rval, "version")) >= 8L) { missings <- vector("list", length(rval)) names(missings) <- names(rval) for(v in which(types > 250L)) { this.type <- types[v] - 250L nas <- is.na(rval[[v]]) | rval[[v]] >= stata.na$min[this.type] natype <- (rval[[v]][nas] - stata.na$min[this.type])/stata.na$inc[this.type] natype[is.na(natype)] <- 0L missings[[v]] <- rep(NA, NROW(rval)) missings[[v]][nas] <- natype rval[[v]][nas] <- NA } attr(rval,"missing") <- missings } else warning("'missing.type' only applicable to version >= 8 files") } convert_dt_c <- function(x) as.POSIXct((x+0.1)/1000, origin = "1960-01-01") # avoid rounding down convert_dt_C <- function(x) { ls <- .leap.seconds + seq_along(.leap.seconds) + 315619200 z <- (x+0.1)/1000 # avoid rounding down z <- z - rowSums(outer(z, ls, ">=")) as.POSIXct(z, origin = "1960-01-01") } if (convert.dates) { ff <- attr(rval, "formats") ## dates <- grep("%-*d", ff) ## Stata 12 introduced 'business dates' ## 'Formats beginning with %t or %-t are Stata's date and time formats.' ## but it seems some are earlier. ## The dta_115 description suggests this is too inclusive: ## 'Stata has an old *%d* format notation and some datasets ## still have them. Format *%d*... is equivalent to modern ## format *%td*... and *%-d*... is equivalent to *%-td*...' dates <- if (attr(rval, "version") >= 8L) grep('^%(-|)(d|td)', ff) else grep("%-*d", ff) ## avoid as.Date in case strptime is messed up base <- structure(-3653L, class = "Date") # Stata dates are integer vars for(v in dates) rval[[v]] <- structure(base + rval[[v]], class = "Date") for(v in grep("%tc", ff)) rval[[v]] <- convert_dt_c(rval[[v]]) for(v in grep("%tC", ff)) rval[[v]] <- convert_dt_C(rval[[v]]) } if (convert.factors %in% c(TRUE, NA)) { if (attr(rval, "version") == 5L) warning("cannot read factor labels from Stata 5 files") else { ll <- attr(rval, "val.labels") tt <- attr(rval, "label.table") factors <- which(ll != "") for(v in factors) { labels <- tt[[ll[v]]] if (warn.missing.labels && is.null(labels)) { warning(gettextf("value labels (%s) for %s are missing", sQuote(ll[v]), sQuote(names(rval)[v])), domain = NA) next } if(!is.na(convert.factors)) { ## some levels don't have labels, so skip if (!all(rval[[v]] %in% c(NA, NaN, tt[[ll[v]]]))) next } rval[[v]] <- factor(rval[[v]], levels=tt[[ll[v]]], labels=names(tt[[ll[v]]])) } } } att <- attributes(rval) ##rval <- as.data.frame(rval, stringsAsFactors=FALSE) class(rval) <- "data.frame" newatt <- attributes(rval) newatt <- c(newatt, att[!(names(att) %in% names(newatt))]) attributes(rval) <- newatt rval } write.dta <- function(dataframe, file, version = 7L, convert.dates = TRUE, tz = "GMT", convert.factors = c("labels","string","numeric","codes")) { if(!is.data.frame(dataframe)) stop("The object \"dataframe\" must have class data.frame") if (version < 6L) stop("Version must be 6-12") if (version == 9L) version <- 8L if (version == 11L) version <- 10L if (version == 12L) version <- 10L if (version > 12L) { warning("Version must be 6-12: using 7") version <- 7L } ## assume this is in chars: probably only works for ASCII ## But Stata formats are ASCII only namelength <- if (version == 6L) 8L else 31L oldn <- names(dataframe) nn <- abbreviate(oldn, namelength) if (any(nchar(nn) > namelength)) stop("cannot uniquely abbreviate variable names") if (any(nchar(oldn) > namelength)) warning("abbreviating variable names") names(dataframe) <- nn attr(dataframe,"orig.names") <- oldn if (convert.dates) { dates <- which(vapply(dataframe, function(x) inherits(x, "Date"), NA)) for(v in dates) dataframe[[v]] <- as.vector(julian(dataframe[[v]], as.Date("1960-1-1", tz="GMT"))) dates <- which(vapply(dataframe, function(x) inherits(x, "POSIXt"), NA)) for(v in dates) dataframe[[v]] <- as.vector(round(julian(dataframe[[v]], ISOdate(1960,1,1, tz=tz)))) ## It would be possible to write these as %tc format, ## milliseconds since 01jan1960 00:00:00.000 ## dataframe[[v]] <- 1000*as.vector(as.POSIXct(dataframe[[v]], tz=tz) + 315619200) } convert.factors <- match.arg(convert.factors) factors <- which(vapply(dataframe, is.factor, NA)) if(convert.factors == "string") { for(v in factors) dataframe[[v]] <- I(as.character(dataframe[[v]])) } else if (convert.factors == "numeric") { for(v in factors) dataframe[[v]] <- as.numeric(as.character(dataframe[[v]])) } else if (convert.factors == "codes") { for (v in factors) dataframe[[v]] <- as.numeric(dataframe[[v]]) } shortlevels <- function(f) { ll <- levels(f) if (is.null(ll)) return(NULL) ## avoid warning if non-ASCII strings are used (unwisely) if (all(nchar(ll, "bytes") <= 80L)) ll else abbreviate(ll, 80L) } leveltable <- lapply(dataframe, shortlevels) if (any(vapply(dataframe, function(x) { d <- dim(x) !is.null(d) && d[1L] < length(x) }, NA))) stop("cannot handle multicolumn columns") invisible(.External(do_writeStata, file, dataframe, version, leveltable)) }