Data table

Un paquet pour faciliter certaines opérations en R et en améliorer l’efficacité

library(data.table)
# Simulated data: one group per capital letter (26 groups) with 1e5 random
# normal draws each, built as a plain data.frame and then as a data.table.
mydf <- data.frame(
  a = rep(LETTERS, each = 1e5),
  b = rnorm(26 * 1e5)
)
mydt <- data.table(mydf)
setkey(mydt, a) # declare column `a` as the key of the data.table

Retourne toutes les lignes où la colonne a (la clé) est égale à F

# Keyed lookup: return every row whose key column `a` equals "F".
mydt["F"]
##         a          b
##      1: F  0.1590119
##      2: F  1.2614807
##      3: F -0.2129390
##      4: F -0.8146843
##      5: F  0.4315091
##     ---             
##  99996: F  0.1339031
##  99997: F  1.2745034
##  99998: F -1.5770690
##  99999: F -0.6781153
## 100000: F -1.7904666

Donne la valeur moyenne de la colonne b, pour chaque lettre de la colonne a

# Mean of column `b` computed separately for each value of column `a`.
mydt[, mean(b), by = a]
##     a            V1
##  1: A  0.0045139147
##  2: B  0.0025196408
##  3: C -0.0008929497
##  4: D  0.0007866199
##  5: E  0.0039362475
##  6: F -0.0021211539
##  7: G -0.0028668996
##  8: H  0.0041853159
##  9: I  0.0010672872
## 10: J  0.0057031742
## 11: K  0.0010285282
## 12: L  0.0006769392
## 13: M  0.0090716903
## 14: N  0.0007348428
## 15: O -0.0016730781
## 16: P  0.0060996369
## 17: Q  0.0041903877
## 18: R  0.0014506376
## 19: S -0.0021115605
## 20: T -0.0025958100
## 21: U -0.0001874106
## 22: V  0.0061811935
## 23: W  0.0027733812
## 24: X -0.0022288589
## 25: Y  0.0026512241
## 26: Z -0.0015411116
##     a            V1

Comparez

# Time the grouped mean with data.table; the result is kept in `t1`.
system.time(
  t1 <- mydt[, mean(b), by = a]
)
##    user  system elapsed 
##   0.029   0.000   0.028

Avec tapply()

# Same grouped mean with base R's tapply() on the plain data.frame; the
# result is kept in `t2`.
system.time(
  t2 <- tapply(X = mydf$b, INDEX = mydf$a, FUN = mean)
)
##    user  system elapsed 
##   0.162   0.016   0.178

Avec reshape2

library(reshape2)
# Name the id column explicitly instead of letting melt() guess it (guessing
# emits a "Using ... as id variables" message and depends on column types).
# The calls are namespace-qualified because data.table, attached earlier in
# this script, also provides functions named melt() and dcast().
meltdf <- reshape2::melt(mydf, id.vars = "a")
# Only the aggregation step is timed; the aggregation function is passed by
# name so the intent of the third argument is explicit.
system.time(
  t3 <- reshape2::dcast(meltdf, a ~ variable, fun.aggregate = mean)
)
##    user  system elapsed 
##   0.759   0.036   0.818

Avec plyr

library(plyr)
# Split `mydf` by column `a`, apply summarize() to each piece to get the mean
# of `b`, and time the whole call; the result is kept in `t4`.
system.time(
  t4 <- ddply(.data = mydf, .variables = .(a), .fun = summarize, mean(b))
)
##    user  system elapsed 
##   0.195   0.028   0.222

Avec dplyr

library(dplyr)
# Timed by hand with proc.time() because the work spans two statements.
ti1 <- proc.time()
groups <- mydf %>% group_by(a)
# NOTE: the summary column is called `total` but holds the per-group mean of `b`.
t4b <- groups %>% summarise(total = mean(b))
eltime <- proc.time() - ti1
eltime
##    user  system elapsed 
##   0.081   0.000   0.085

Avec sqldf

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
# Same aggregation expressed as an SQL query over `mydf`; the result is kept
# in `t5`.
system.time(
  t5 <- sqldf("SELECT a, avg(b) FROM mydf GROUP BY a")
)
## Loading required package: tcltk
##    user  system elapsed 
##   4.662   0.223   4.906

Avec une boucle “for”

# Grouped mean with an explicit for loop. The loop is kept on purpose: it is
# the approach being benchmarked here, timed by hand with proc.time().
ti1 <- proc.time()
# One result row per distinct value of `a`; the `mean` column is sized from
# the data rather than from a hard-coded group count.
t6 <- data.frame(letter = unique(mydf$a))
t6$mean <- numeric(nrow(t6))
for (i in t6$letter) {
  # Columns are addressed by name so the code does not depend on their order.
  t6[t6$letter == i, "mean"] <- mean(mydf[mydf$a == i, "b"])
}
eltime <- proc.time() - ti1
eltime
##    user  system elapsed 
##   6.118   0.069   6.195

Avec une boucle FOR parallélisée

library(foreach)
library(doMC)
## Loading required package: iterators
## Loading required package: parallel
# Grouped mean with a parallel foreach loop, timed by hand with proc.time().
registerDoMC(4) # four workers, for a quad-core processor
ti1 <- proc.time()
# One result row per distinct value of `a`; the `mean` column is sized from
# the data rather than from a hard-coded group count.
t7 <- data.frame(letter = unique(mydf$a))
t7$mean <- numeric(nrow(t7))
# Each iteration returns one group mean; `.combine = "c"` concatenates them
# into a single vector. Columns are addressed by name, not by position.
t7[, "mean"] <- foreach(i = t7$letter, .combine = "c") %dopar% {
  mean(mydf[mydf$a == i, "b"])
}
eltime <- proc.time() - ti1
eltime
##    user  system elapsed 
##   4.780   0.141   2.224

RgoogleMaps

library(RgoogleMaps)
# Geocode the landmark, fetch a static map centred on it, then draw a red
# marker at the geocoded position.
myhome <- getGeoCode("Olympic Stadium, Montreal")
mymap <- GetMap(center = myhome, zoom = 14)
PlotOnStaticMap(
  mymap,
  lat = myhome["lat"], lon = myhome["lon"],
  cex = 5, pch = 10, lwd = 3, col = "red"
)