-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathChap4.pythagoras.R
131 lines (95 loc) · 3.84 KB
/
Chap4.pythagoras.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#################################################
# Chapter 4 - The Relation Between Runs and Wins
#
# Needs .csv files from Lahman's database
# (placed in the "lahman" subfolder), plus "gl2011.txt" and
# "game_log_header.csv" in the working directory (read in Section 4.6)
#
#################################################
# Section 4.2 The Teams Table in Lahman's Database
# Show the working directory: every file path in this script is relative to it.
getwd()
# Load the Teams table and peek at its last rows.
teams <- read.csv(file = "lahman/teams.csv")
tail(teams)
# Keep the seasons after 2000 and only the columns used in this chapter.
myteams <- teams[
  which(teams$yearID > 2000),
  c("teamID", "yearID", "lgID", "G", "W", "L", "R", "RA")
]
tail(myteams)
# Run differential (runs scored minus runs allowed) and winning percentage
# (wins over decided games) for each team-season.
myteams$RD <- myteams$R - myteams$RA
myteams$Wpct <- myteams$W / (myteams$W + myteams$L)
# Scatterplot of winning percentage against run differential.
with(myteams, plot(RD, Wpct,
                   xlab = "run differential",
                   ylab = "winning percentage"))
# Section 4.3 Linear Regression
# Fit winning percentage as a straight-line function of run differential and
# draw the fitted line on the scatterplot that is currently open.
linfit <- lm(Wpct ~ RD, data = myteams)
abline(a = coef(linfit)[1], b = coef(linfit)[2], lwd = 2)
# Keep the fitted values and residuals next to the rows they belong to.
myteams$linWpct <- predict(linfit)
myteams$linResiduals <- residuals(linfit)
# Residual plot with a dotted reference line at zero.
plot(myteams$RD, myteams$linResiduals,
     xlab = "run differential",
     ylab = "residual")
abline(h = 0, lty = 3)
# Highlight the LAA 2008 and CLE 2006 team-seasons. Their coordinates are
# looked up in myteams instead of being typed in as constants: the residuals
# depend on every season included in the fit, so fixed numbers stop matching
# the plotted points as soon as the underlying table gains or loses seasons.
highlighted <- subset(myteams, (teamID == "LAA" & yearID == 2008) |
                               (teamID == "CLE" & yearID == 2006))
# text() raises an error on zero-length labels, so skip the annotation when
# neither team-season is present in the data.
if (nrow(highlighted) > 0) {
  points(highlighted$RD, highlighted$linResiduals, pch = 19)
  text(highlighted$RD, highlighted$linResiduals,
       labels = sprintf("%s '%02d", as.character(highlighted$teamID),
                        as.integer(highlighted$yearID) %% 100L),
       pos = 4, cex = .8)
}
# The residuals of a least-squares fit with an intercept average to zero.
mean(myteams$linResiduals)
# Root mean square error of the linear model.
linRMSE <- sqrt(mean(myteams$linResiduals ^ 2))
linRMSE
# Share of team-seasons whose residual lies within one and within two RMSE.
mean(abs(myteams$linResiduals) < linRMSE)
mean(abs(myteams$linResiduals) < 2 * linRMSE)
# Section 4.4 The Pythagorean Formula for Winning Percentage
# Expected winning percentage from the Pythagorean formula with exponent 2:
# R^2 / (R^2 + RA^2).
myteams[["pytWpct"]] <- myteams$R ^ 2 / (myteams$R ^ 2 + myteams$RA ^ 2)
# Residual = observed minus expected winning percentage.
myteams[["pytResiduals"]] <- with(myteams, Wpct - pytWpct)
# Root mean square error of the Pythagorean prediction.
with(myteams, sqrt(mean(pytResiduals ^ 2)))
# Section 4.5 The Exponent in the Pythagorean Formula
# If W / L = (R / RA)^k, then log(W / L) = k * log(R / RA); a regression
# through the origin of one log-ratio on the other therefore estimates k.
myteams[["logWratio"]] <- with(myteams, log(W / L))
myteams[["logRratio"]] <- with(myteams, log(R / RA))
# "0 +" removes the intercept so the only coefficient is the exponent.
pytFit <- lm(formula = logWratio ~ 0 + logRratio, data = myteams)
pytFit
# Section 4.6 Good and Bad Predictions by the Pythagorean Formula
# Read the comma-separated 2011 game log; its column names come from a
# separate header file.
gl2011 <- read.table(file = "gl2011.txt", sep = ",")
glheaders <- read.csv(file = "game_log_header.csv")
names(gl2011) <- names(glheaders)
# Games in which BOS was either the home or the visiting team, reduced to the
# team and score columns.
BOS2011 <- gl2011[
  which(gl2011$HomeTeam == "BOS" | gl2011$VisitingTeam == "BOS"),
  c("VisitingTeam", "HomeTeam", "VisitorRunsScored", "HomeRunsScore")
]
head(BOS2011)
# Score difference from BOS's point of view: the home margin when BOS is at
# home, its negation when BOS is the visitor.
BOS2011$ScoreDiff <- with(BOS2011, {
  home_margin <- HomeRunsScore - VisitorRunsScored
  ifelse(HomeTeam == "BOS", home_margin, -home_margin)
})
# TRUE for games BOS won.
BOS2011$W <- BOS2011$ScoreDiff > 0
# Summary of the absolute margin, separately for losses and wins.
aggregate(x = abs(BOS2011$ScoreDiff), by = list(W = BOS2011$W), FUN = summary)
# Teams and final scores of every game in the 2011 log.
results <- gl2011[, c("VisitingTeam", "HomeTeam",
                      "VisitorRunsScored", "HomeRunsScore")]
# Winner of each game: the home team when it outscored the visitor, otherwise
# the visiting team (an equal score would also be credited to the visitor).
results$winner <- with(results, ifelse(HomeRunsScore > VisitorRunsScored,
                                       as.character(HomeTeam),
                                       as.character(VisitingTeam)))
# Absolute final margin.
results$diff <- with(results, abs(VisitorRunsScored - HomeRunsScore))
# Games decided by exactly one run, tallied by winning team.
onerungames <- results[which(results$diff == 1), ]
onerunwins <- setNames(as.data.frame(table(onerungames$winner)),
                       c("teamID", "onerunW"))
# 2011 team-seasons. LAA is recoded to ANA before joining -- presumably the
# code the game log uses for that team; verify against the log.
teams2011 <- myteams[which(myteams$yearID == 2011), ]
teams2011$teamID[teams2011$teamID == "LAA"] <- "ANA"
# teamID is the only column the two tables share.
teams2011 <- merge(teams2011, onerunwins, by = "teamID")
# One-run wins against the Pythagorean residual, then interactive labelling.
with(teams2011, plot(onerunW, pytResiduals,
                     xlab = "one run wins",
                     ylab = "Pythagorean residuals"))
with(teams2011, identify(onerunW, pytResiduals, labels = teamID))
#...identify data points by mouse-clicking on the plot
#...then press ESC to finish
# Pitcher-seasons with GF above 50 and ERA below 2.5 (GF is presumably games
# finished -- confirm against the Lahman documentation).
pit <- read.csv(file = "lahman/pitching.csv")
top_closers <- pit[
  which(pit$GF > 50 & pit$ERA < 2.5),
  c("playerID", "yearID", "teamID")
]
# Attach each of those pitchers to his team-season (the shared columns are
# teamID and yearID) and summarise the Pythagorean residuals of those teams.
teams_top_closers <- merge(myteams, top_closers, by = c("teamID", "yearID"))
summary(teams_top_closers$pytResiduals)
# Section 4.7 How Many Runs for a Win?
# Symbolic partial derivative, with respect to R, of the Pythagorean win total
# G * R^2 / (R^2 + RA^2).
D(quote(G * R ^ 2 / (R ^ 2 + RA ^ 2)), name = "R")
# Reciprocal of the derivative of RS^2 / (RS^2 + RA^2) with respect to RS,
# rounded to one decimal place -- presumably the "incremental runs per win"
# this section asks about.
#   RS  runs scored per game (vectorised), default 5
#   RA  runs allowed per game (vectorised), default 5
IR <- function(RS=5, RA=5){
  squares_total <- RS ^ 2 + RA ^ 2
  slope_numerator <- 2 * RS * RA ^ 2
  round(squares_total ^ 2 / slope_numerator, digits = 1)
}
# Every combination of runs scored and runs allowed per game from 3 to 6 in
# half-run steps.
IRtable <- expand.grid(RS = seq(from = 3, to = 6, by = .5),
                       RA = seq(from = 3, to = 6, by = .5))
# First and last six rows of the grid.
rbind(head(IRtable, n = 6L), tail(IRtable, n = 6L))
# IR value for each scenario, shown as an RS-by-RA table.
IRtable[["IRW"]] <- IR(RS = IRtable$RS, RA = IRtable$RA)
xtabs(IRW ~ RS + RA, data = IRtable)
################################################