Ethan Witkowski Spring 2019
## Warning: package 'readxl' was built under R version 3.6.2
#Import Dataset
NYCLUSchools <- read_excel("NYC Regents Test Data.xlsx")
#Remove duplicates
NYCLUSchools <- NYCLUSchools[!(NYCLUSchools$BEDS == 332100011400 | NYCLUSchools$BEDS == 321000011430), ]
#Intervention Variable (Before Treatment, After Treatment)
NYCLUSchools$BTAT <- ifelse(NYCLUSchools$Year<=2015,0,1)
#Variable recording the time elapased since beginning of study
StartYear <- 2007
NYCLUSchools$Timeelapsed <- NYCLUSchools$Year - StartYear
Year - Year in which exam was taken
Exam - Regents Exam Type - Global History/Geography, Living Environment (Biology)
Percent Passing - Percentage of students passing Regents Exam at each school for each year
BEDS - Unique School ID
FSM - Percentage of students who are eligible for free lunch at each school for each year
Total tested - Number of students who took the exam at each school for each year
Demographic - Population of students - All students less special education students
Timeelapased - time elapsed since beginning of study at each school for each year
BTAT - Indicator variable for before treatment, after treatment
#Histogram - Exam passing rate by school 2007-2017
hist(NYCLUSchools$`Percent Passing`,
main = "Exam Passing Rate Distribution by School",
xlab="Exam Passing Rate by School",
ylab="Frequency")
#NPP - Exam passing rate by school 2007-2017
qqnorm(NYCLUSchools$`Percent Passing`)
qqline(NYCLUSchools$`Percent Passing`)
#Scatterplot - Year and Percent Passing
plot(NYCLUSchools$Year, NYCLUSchools$`Percent Passing`,
ylim = c(0,100))
#Scatterplot - Year and Percent Passing (Linearity check)
plot(NYCLUSchools$Year, NYCLUSchools$`Percent Passing`,
ylim = c(-100,200))
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion
#Boxplot Percent Passing ~ School
par(mfrow = c(1,1))
boxplot(NYCLUSchools$`Percent Passing` ~ NYCLUSchools$School)
#Panel (School and exam) effects graphic
library(lattice)
xyplot(NYCLUSchools$`Percent Passing` ~ NYCLUSchools$Year | Exam, data = NYCLUSchools, groups = NYCLUSchools$BEDS,
type = "o", panel = panel.superpose)
##
## Welch Two Sample t-test
##
## data: NYCLUSchools$`Percent Passing` by NYCLUSchools$BTAT
## t = 0.54564, df = 203.67, p-value = 0.5859
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.362816 4.170959
## sample estimates:
## mean in group 0 mean in group 1
## 55.81385 54.90977
#Anova School ~ Percent Passing
fit13 <- lm(NYCLUSchools$`Percent Passing` ~ NYCLUSchools$School)
anova(fit13)
## Analysis of Variance Table
##
## Response: NYCLUSchools$`Percent Passing`
## Df Sum Sq Mean Sq F value Pr(>F)
## NYCLUSchools$School 54 147418 2729.97 16.175 < 2.2e-16 ***
## Residuals 728 122871 168.78
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Simple linear regression model
plot(NYCLUSchools$Year, NYCLUSchools$`Percent Passing`)
fit1 <- lm(NYCLUSchools$`Percent Passing` ~ NYCLUSchools$Year)
abline(fit1)
##
## Call:
## lm(formula = NYCLUSchools$`Percent Passing` ~ NYCLUSchools$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.704 -14.077 -0.931 12.923 44.296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1178.1055 422.4628 -2.789 0.00542 **
## NYCLUSchools$Year 0.6133 0.2100 2.920 0.00360 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.5 on 781 degrees of freedom
## (71 observations deleted due to missingness)
## Multiple R-squared: 0.0108, Adjusted R-squared: 0.009536
## F-statistic: 8.529 on 1 and 781 DF, p-value: 0.003596
#Loess model
fit2 <- loess(NYCLUSchools$`Percent Passing` ~ NYCLUSchools$Year, span=.5)
plot(NYCLUSchools$Year, NYCLUSchools$`Percent Passing`,
main = "Local Polynomial Regression",
xlab="Time",
ylab="Exam Passing Rate by School")
lines(NYCLUSchools$Year, predict(fit2,newdata = NYCLUSchools$Year), col="red")
#Interrupted Time Series Analysis
#ITSA model
fit3 <- lm(formula = NYCLUSchools$`Percent Passing` ~ NYCLUSchools$Timeelapsed + NYCLUSchools$BTAT + NYCLUSchools$Timeelapsed*NYCLUSchools$BTAT)
summary(fit3)
##
## Call:
## lm(formula = NYCLUSchools$`Percent Passing` ~ NYCLUSchools$Timeelapsed +
## NYCLUSchools$BTAT + NYCLUSchools$Timeelapsed * NYCLUSchools$BTAT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.861 -13.925 -0.861 13.419 46.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.8608 1.3081 38.880 < 2e-16
## NYCLUSchools$Timeelapsed 1.2822 0.2826 4.537 6.61e-06
## NYCLUSchools$BTAT 31.4811 30.3292 1.038 0.300
## NYCLUSchools$Timeelapsed:NYCLUSchools$BTAT -4.1709 3.1989 -1.304 0.193
##
## (Intercept) ***
## NYCLUSchools$Timeelapsed ***
## NYCLUSchools$BTAT
## NYCLUSchools$Timeelapsed:NYCLUSchools$BTAT
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.37 on 779 degrees of freedom
## (71 observations deleted due to missingness)
## Multiple R-squared: 0.02707, Adjusted R-squared: 0.02332
## F-statistic: 7.224 on 3 and 779 DF, p-value: 8.734e-05
plot(NYCLUSchools$Year, NYCLUSchools$`Percent Passing`,
main="Interrupted Time Series",
xlab="Time",
ylab="Exam Passing Rate by School")
clip(0,2018,-20,110)
abline(v=2015.67)
clip(0,2015.67,0,100)
abline(-2770.9453, 1.4056,col="blue", lwd=2)
clip(2015.67,2018,0,100)
abline(-2770.9453 + 8650.9847, 1.4056 - 4.2943, col="red", lwd=2)
clip(2015.67,2018,0,100)
abline(-2770.9453, 1.4056, col="blue", lwd=2, lty=2)
## Warning: package 'sandwich' was built under R version 3.6.3
## Warning: package 'lmtest' was built under R version 3.6.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## t test of coefficients:
##
## Estimate Std. Error t value
## (Intercept) 50.86078 1.30814 38.8804
## NYCLUSchools$Timeelapsed 1.28216 0.28261 4.5369
## NYCLUSchools$BTAT 31.48115 30.32919 1.0380
## NYCLUSchools$Timeelapsed:NYCLUSchools$BTAT -4.17089 3.19893 -1.3038
## Pr(>|t|)
## (Intercept) < 2.2e-16 ***
## NYCLUSchools$Timeelapsed 6.609e-06 ***
## NYCLUSchools$BTAT 0.2996
## NYCLUSchools$Timeelapsed:NYCLUSchools$BTAT 0.1927
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning: package 'car' was built under R version 3.6.2
## Loading required package: carData
##
## studentized Breusch-Pagan test
##
## data: fit3
## BP = 4.9051, df = 3, p-value = 0.1789