#' Load and clean the `tpcc.txt` benchmark table
#'
#' Reads the raw CSV file, drops unusable rows, re-codes a few columns and
#' derives coarse-grained classification columns plus the CPU clock
#' frequency parsed from the free-text `cpu` description.
#'
#' @param file Path of the CSV file to read. Defaults to `"tpcc.txt"`
#'   (relative to the working directory), the path that used to be
#'   hard-coded.
#' @return Invisibly, a data frame restricted to rows whose `cpu` text
#'   contains a parseable clock frequency. Relative to the raw file:
#'   * rows with a non-empty `isWithdrawn`, with `frontEnds == "N"`, or with
#'     a non-numeric `cpus` are removed;
#'   * `frontEnds` and `cpus` are numeric, `specRevision` is a factor;
#'   * `isWithdrawn`, `withdrawn` and `availability` are dropped;
#'   * factors `ostype`, `cputype`, `tpmon` and numeric `freq` (in MHz) are
#'     appended, in that order.
tpccdata <- function(file = "tpcc.txt") {
  # Label every row with the name of the LAST rule whose regular expression
  # matches `text`; rows matching no rule are labelled "other". `n` is the
  # number of labels to produce, so a missing column simply yields "other".
  classify <- function(text, rules, n) {
    text <- as.character(text)
    label <- rep("other", n)
    for (i in seq_along(rules)) {
      label[grepl(rules[[i]], text)] <- names(rules)[i]
    }
    as.factor(label)
  }

  # read raw data:
  tpc <- read.csv(file)

  # exclude a few problematic observations:
  withdrawn_flag <- as.character(tpc$isWithdrawn)
  front_ends <- as.character(tpc$frontEnds)
  # NA after coercion is exactly the "not a number" signal tested below, so
  # the coercion warning carries no extra information here.
  cpu_count <- suppressWarnings(as.numeric(as.character(tpc$cpus)))
  # read.csv() turns blank fields into NA when the column is not read as
  # character, so NA is treated like the empty "not withdrawn" marker.
  keep <- (is.na(withdrawn_flag) | withdrawn_flag == "") &
    !is.na(front_ends) & front_ends != "N" &
    !is.na(cpu_count)
  tpc <- tpc[keep, , drop = FALSE]

  # re-code some variables
  tpc$frontEnds <- as.numeric(front_ends[keep])
  tpc$cpus <- cpu_count[keep]
  tpc$specRevision <- as.factor(tpc$specRevision)

  # eliminate superfluous variables:
  superfluous <- c("isWithdrawn", "withdrawn", "availability")
  tpc <- tpc[, setdiff(names(tpc), superfluous), drop = FALSE]

  # introduce some new coarse-grained variables:
  # (columns are fetched with [[ ]] so that `os`/`cpu` can never be confused
  # with the similarly named `ostype`/`cpus`/`cputype`)
  n_rows <- nrow(tpc)
  tpc$ostype <- classify(
    tpc[["os"]],
    c(Windows = "Microsoft", Unix = "AIX|Linux|Solaris|Tru64|UX"),
    n_rows
  )
  # Rule order matters: a later match overrides an earlier one, e.g. a
  # description containing both "Xeon" and "Pentium" ends up as "Pentium3".
  tpc$cputype <- classify(
    tpc[["cpu"]],
    c(
      Alpha = "Alphachip",
      SPARC64 = "SPARC64",
      UltraSPARC = "UltraSPARC",
      Opteron = "Opteron",
      "PA-RISC" = "PA-RISC",
      Power = "Power",
      RS64 = "RS64",
      Itanium2 = "Itanium ?2",
      Xeon = "Xeon",
      Pentium3 = "Pentium"
    ),
    n_rows
  )
  tpc$tpmon <- classify(
    tpc[["tpMonitor"]],
    c(
      Tuxedo = "Tuxedo",
      # "+" is escaped: unescaped it is a quantifier and "COM+" would match
      # every string that merely contains "COM".
      "COM+" = "COM\\+",
      Connector = "Connector",
      # tolerates the "Webshpere" misspelling as well
      Websphere = "Webs[hp][hp]ere"
    ),
    n_rows
  )

  # Clock frequency in MHz. Number and unit are captured by the same match,
  # so the unit always belongs to the number that was extracted. The leading
  # greedy ".*" selects the last "<number> MHz/GHz" in the description.
  freq_pattern <- "^.* ([0-9.]+) *([MG])[Hh]z.*$"
  cpu_text <- as.character(tpc[["cpu"]])
  has_freq <- grepl(freq_pattern, cpu_text)
  matched <- cpu_text[has_freq]
  to_mhz <- ifelse(sub(freq_pattern, "\\2", matched) == "G", 1000, 1)
  freq <- rep(NA_real_, n_rows)
  freq[has_freq] <- as.numeric(sub(freq_pattern, "\\1", matched)) * to_mhz
  tpc$freq <- freq

  # keep only observations with a known clock frequency
  invisible(tpc[!is.na(tpc$freq), , drop = FALSE])
}