Robin Gower Infonomics
2015-08-24 Manchester R
Strategies allow you to:
R can match a functional implementation dynamically to the class of it's argument but not it's result.
Plyr uses duck-typing so we have a family of functions to choose the splitting method.
Input / Output | Array | Data frame | List | Discarded |
---|---|---|---|---|
Array | aaply | adply | alply | a_ply |
Data frame | daply | ddply | dlply | d_ply |
List | laply | ldply | llply | l_ply |
n replicates | raply | rdply | rlply | r_ply |
function arguments | maply | mdply | mlply | m_ply |
a*ply(.data, .margins, .fun, ..., .progress = "none")
d*ply(.data, .variables, .fun, ..., .progress = "none")
l*ply(.data, .fun, ..., .progress = "none")
n*ply(.n, .expr, .progress = "none", .drop = TRUE)
maply(.data, .fun = NULL, ..., .expand = TRUE, .progress = "none")
c("var1", "var2")
.(var1)
or .(a b c)
.(round(a))
or .(product = a * b)
~ a + b
sum
, mean
, nrow
, length
bnames <- read.csv("baby-names/baby-names-1.csv")
head(bnames)
Name Count Year Gender
1 AMELIA 5327 2014 Girl
2 OLIVIA 4724 2014 Girl
3 ISLA 4012 2014 Girl
4 EMILY 3991 2014 Girl
5 POPPY 3273 2014 Girl
6 AVA 3171 2014 Girl
tail(bnames)
Name Count Year Gender
68061 ZONAIN 3 2010 Boy
68062 ZORAWAR 3 2010 Boy
68063 ZUBEYR 3 2010 Boy
68064 ZUHAIB 3 2010 Boy
68065 ZYAH 3 2010 Boy
68066 ZYAN 3 2010 Boy
bnames$Name <- trimws(bnames$Name)
bnames$Length <- nchar(as.character(bnames$Name))
bnames$FirstLetter <- substr(bnames$Name,1,1)
head(bnames)
Name Count Year Gender Length FirstLetter
1 AMELIA 5327 2014 Girl 6 A
2 OLIVIA 4724 2014 Girl 6 O
3 ISLA 4012 2014 Girl 4 I
4 EMILY 3991 2014 Girl 5 E
5 POPPY 3273 2014 Girl 5 P
6 AVA 3171 2014 Girl 3 A
tail(bnames)
Name Count Year Gender Length FirstLetter
68061 ZONAIN 3 2010 Boy 6 Z
68062 ZORAWAR 3 2010 Boy 7 Z
68063 ZUBEYR 3 2010 Boy 6 Z
68064 ZUHAIB 3 2010 Boy 6 Z
68065 ZYAH 3 2010 Boy 4 Z
68066 ZYAN 3 2010 Boy 4 Z
girl11 <- subset(bnames, Gender=="Girl" & Year==2011)
head(girl11)
Name Count Year Gender Length FirstLetter
40988 AMELIA 5054 2011 Girl 6 A
40989 OLIVIA 4938 2011 Girl 6 O
40990 LILY 4761 2011 Girl 4 L
40991 JESSICA 3984 2011 Girl 7 J
40992 EMILY 3974 2011 Girl 5 E
40993 SOPHIE 3923 2011 Girl 6 S
tail(girl11)
Name Count Year Gender Length FirstLetter
48557 ZUBAYDAH 3 2011 Girl 8 Z
48558 ZUBEYDE 3 2011 Girl 7 Z
48559 ZUKHRUF 3 2011 Girl 7 Z
48560 ZULAIKA 3 2011 Girl 7 Z
48561 ZULAL 3 2011 Girl 5 Z
48562 ZUZIA 3 2011 Girl 5 Z
girl11$Percent <- girl11$Count / sum(girl11$Count)
girl11$Rank <- rank(-girl11$Count, ties.method="min")
head(girl11[,c("Name","Count","Percent","Rank")])
Name Count Percent Rank
40988 AMELIA 5054 0.01572911 1
40989 OLIVIA 4938 0.01536810 2
40990 LILY 4761 0.01481724 3
40991 JESSICA 3984 0.01239905 4
40992 EMILY 3974 0.01236793 5
40993 SOPHIE 3923 0.01220920 6
tail(girl11[,c("Name","Count","Percent","Rank")])
Name Count Percent Rank
48557 ZUBAYDAH 3 9.336632e-06 5785
48558 ZUBEYDE 3 9.336632e-06 5785
48559 ZUKHRUF 3 9.336632e-06 5785
48560 ZULAIKA 3 9.336632e-06 5785
48561 ZULAL 3 9.336632e-06 5785
48562 ZUZIA 3 9.336632e-06 5785
library(plyr)
bnames <- ddply(bnames, c("Gender","Year"), transform, Rank=rank(-Count, ties.method="min"))
bnames <- ddply(bnames, c("Gender","Year"), transform, Percent=Count/sum(Count))
head(bnames)
Name Count Year Gender Length FirstLetter Rank Percent
1 OLIVER 8427 2010 Boy 6 O 1 0.02433446
2 JACK 7031 2010 Boy 4 J 2 0.02030326
3 HARRY 6862 2010 Boy 5 H 3 0.01981525
4 ALFIE 5478 2010 Boy 5 A 4 0.01581870
5 CHARLIE 5410 2010 Boy 7 C 5 0.01562234
6 THOMAS 5307 2010 Boy 6 T 6 0.01532491
tail(bnames)
Name Count Year Gender Length FirstLetter Rank Percent
68061 ZOHHA 3 2014 Girl 5 Z 5691 9.782566e-06
68062 ZONERA 3 2014 Girl 6 Z 5691 9.782566e-06
68063 ZULEKHA 3 2014 Girl 7 Z 5691 9.782566e-06
68064 ZUZANA 3 2014 Girl 6 Z 5691 9.782566e-06
68065 ZUZIA 3 2014 Girl 5 Z 5691 9.782566e-06
68066 ZYNAH 3 2014 Girl 5 Z 5691 9.782566e-06
ddply(bnames, c("Length"), summarise, Total=sum(Count))
Length Total
1 1 100
2 2 2301
3 3 147850
4 4 545811
5 5 956263
6 6 871168
7 7 375667
8 8 223549
9 9 111062
10 10 26841
11 11 16136
12 12 5955
13 13 2450
14 14 657
15 15 114
16 16 75
17 17 25
18 18 5
ddply(bnames, c("Length","Year"), summarise, Total=sum(Count))
Length Year Total
1 1 2010 27
2 1 2011 24
3 1 2012 23
4 1 2013 17
5 1 2014 9
6 2 2010 538
7 2 2011 450
8 2 2012 476
9 2 2013 436
10 2 2014 401
11 3 2010 29974
12 3 2011 30391
13 3 2012 30862
14 3 2013 28684
15 3 2014 27939
16 4 2010 112696
17 4 2011 110923
18 4 2012 109841
19 4 2013 105512
20 4 2014 106839
21 5 2010 192883
22 5 2011 197572
23 5 2012 196049
24 5 2013 185769
25 5 2014 183990
26 6 2010 180180
27 6 2011 175887
28 6 2012 176808
29 6 2013 169856
30 6 2014 168437
31 7 2010 77222
32 7 2011 76024
33 7 2012 76744
34 7 2013 73152
35 7 2014 72525
36 8 2010 43733
37 8 2011 44288
38 8 2012 45181
39 8 2013 45243
40 8 2014 45104
41 9 2010 21281
42 9 2011 21898
43 9 2012 23024
44 9 2013 22440
45 9 2014 22419
46 10 2010 4595
47 10 2011 5037
48 10 2012 5793
49 10 2013 5585
50 10 2014 5831
51 11 2010 2766
52 11 2011 3078
53 11 2012 3575
54 11 2013 3376
55 11 2014 3341
56 12 2010 936
57 12 2011 1086
58 12 2012 1390
59 12 2013 1287
60 12 2014 1256
61 13 2010 388
62 13 2011 437
63 13 2012 562
64 13 2013 531
65 13 2014 532
66 14 2010 85
67 14 2011 108
68 14 2012 146
69 14 2013 153
70 14 2014 165
71 15 2010 21
72 15 2011 15
73 15 2012 33
74 15 2013 26
75 15 2014 19
76 16 2010 15
77 16 2011 8
78 16 2012 8
79 16 2013 14
80 16 2014 30
81 17 2011 4
82 17 2012 7
83 17 2013 4
84 17 2014 10
85 18 2014 5
fl <- ddply(bnames, c("Gender","Year","FirstLetter"), summarise, Total=sum(Count))
library(ggplot2)
qplot(Year, Total, data = fl, geom = "line", colour = Gender, facets = ~ FirstLetter)
Class Sex Age Survived Freq
1 1st Male Child No 0
2 2nd Male Child No 0
3 3rd Male Child No 35
4 Crew Male Child No 0
5 1st Female Child No 0
6 2nd Female Child No 0
7 3rd Female Child No 17
8 Crew Female Child No 0
9 1st Male Adult No 118
10 2nd Male Adult No 154
11 3rd Male Adult No 387
12 Crew Male Adult No 670
13 1st Female Adult No 4
14 2nd Female Adult No 13
15 3rd Female Adult No 89
16 Crew Female Adult No 3
17 1st Male Child Yes 5
18 2nd Male Child Yes 11
19 3rd Male Child Yes 13
20 Crew Male Child Yes 0
21 1st Female Child Yes 1
22 2nd Female Child Yes 13
23 3rd Female Child Yes 14
24 Crew Female Child Yes 0
25 1st Male Adult Yes 57
26 2nd Male Adult Yes 14
27 3rd Male Adult Yes 75
28 Crew Male Adult Yes 192
29 1st Female Adult Yes 140
30 2nd Female Adult Yes 80
31 3rd Female Adult Yes 76
32 Crew Female Adult Yes 20
children <- subset(titanic, Age=="Child")
children
Class Sex Age Survived Freq
1 1st Male Child No 0
2 2nd Male Child No 0
3 3rd Male Child No 35
4 Crew Male Child No 0
5 1st Female Child No 0
6 2nd Female Child No 0
7 3rd Female Child No 17
8 Crew Female Child No 0
17 1st Male Child Yes 5
18 2nd Male Child Yes 11
19 3rd Male Child Yes 13
20 Crew Male Child Yes 0
21 1st Female Child Yes 1
22 2nd Female Child Yes 13
23 3rd Female Child Yes 14
24 Crew Female Child Yes 0
survival_rate <- function(df) {
survived = sum(df[df$Survived=="Yes","Freq"])
all_souls = sum(df[,"Freq"])
survived/all_souls
}
survival_rate(children)
[1] 0.5229358
ddply(titanic, .(Sex), "survival_rate")
Sex survival_rate
1 Male 0.2120162
2 Female 0.7319149
ddply(titanic, .(Class), "survival_rate")
Class survival_rate
1 1st 0.6246154
2 2nd 0.4140351
3 3rd 0.2521246
4 Crew 0.2395480
ddply(titanic, .(Age, Sex, Class), "survival_rate")
Age Sex Class survival_rate
1 Child Male 1st 1.00000000
2 Child Male 2nd 1.00000000
3 Child Male 3rd 0.27083333
4 Child Male Crew NaN
5 Child Female 1st 1.00000000
6 Child Female 2nd 1.00000000
7 Child Female 3rd 0.45161290
8 Child Female Crew NaN
9 Adult Male 1st 0.32571429
10 Adult Male 2nd 0.08333333
11 Adult Male 3rd 0.16233766
12 Adult Male Crew 0.22273782
13 Adult Female 1st 0.97222222
14 Adult Female 2nd 0.86021505
15 Adult Female 3rd 0.46060606
16 Adult Female Crew 0.86956522
The plyr
library provides helper functions for:
It assumes the pieces are independent so it can't do: