DID HW #2 - Writeup
Mar. 5th, 2014 10:21 pm

# Problem 1:
# Load the training and test data.
> train <- read.csv(url("http://terpconnect.umd.edu/~ying/did/hw2/house_train.csv"))
> test <- read.csv(url("http://terpconnect.umd.edu/~ying/did/hw2/house_test.csv"))

# Problem 2:
> library(glmnet)
> library(reshape)
> library(ggplot2)
# NOTE(review): mse() used below is not provided by glmnet/reshape/ggplot2 —
# presumably Metrics::mse() or a user-defined helper; confirm it is in scope.

# Build the state design matrix on train+test together so both splits share
# the same factor levels, then cut it back apart by row.
> alldata <- rbind(train, test)
> states <- model.matrix(~ state, data=alldata)
# I know all the dimensions already but I suppose one should really use nrow() to be proper.
# I do a lot of repeat codes to make this scan more easily, but yes I know I don't have to keep restating what the target vector is :)
> strain <- states[1:8973,]
> stest <- states[8974:10036,]
> target <- as.matrix(train$price2013)
> features <- strain
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)
> cv.lasso <- cv.glmnet(features, target, alpha=1)
> cv.ridge <- cv.glmnet(features, target, alpha=0)
# I choose my model by doing cross-validation, picking the minimum lambda, then
# taking whichever model gives me the least MSE on my training set. I realize
# this carries the risk of overfitting, but that's what the next class in the
# series is for, right?
> pre.lasso <- predict(reg.lasso, newx=strain, s = cv.lasso$lambda.min)
# BUG FIX(review): the original line predicted with reg.lasso here, so the
# "ridge" MSE reported below was really the lasso model evaluated at the ridge
# lambda. The printed ridge MSE should be regenerated after this correction.
> pre.ridge <- predict(reg.ridge, newx=strain, s = cv.ridge$lambda.min)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 33788837270
[1] 35632857052
# Therefore I'll take the lasso model, with the understanding it may overfit.
# 2a/b/c/d:
# The intercept and coefficients can be gotten by coef():
> coef(reg.lasso, s = cv.lasso$lambda.min)
47 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 227149.7841
(Intercept) .
# NOTE(review): two "(Intercept)" rows appear because the model.matrix
# intercept column was passed in as a feature; glmnet fits its own intercept
# and the redundant column's coefficient is zeroed out (shown as ".").
stateAL -85990.9670
stateAR -104402.0468
stateAZ -19926.0246
stateCA 285560.7377
stateCO 21489.0154
stateCT 61329.7801
stateDC 286949.4165
stateDE -19467.4054
stateFL -51970.3949
stateGA -84521.1624
stateHI 253084.7251
stateIA -73700.9678
stateIL -51776.4020
stateIN -91595.2483
stateKS -88381.5515
stateKY -80808.2027
stateLA -89117.6414
stateMA 112383.6192
stateMD 54729.5276
stateMI -83404.7964
stateMN -20680.2778
stateMO -73050.2768
stateMT -737.4277
stateNC -77347.7183
stateND -69363.0543
stateNE -87756.7266
stateNH -24453.7819
stateNJ 79363.9588
stateNM -37415.5226
stateNV -11814.3654
stateNY 71615.8727
stateOH -109662.8593
stateOK -117064.2779
stateOR -12157.7688
statePA -58426.0884
stateRI -3851.9908
stateSC -80878.6359
stateTN -96024.8720
stateTX -66391.1226
stateUT 10706.2624
stateVA 72810.2399
stateWA 39233.1109
stateWI -64147.7649
stateWV -128334.1829
stateWY -32036.7326
# The intercept is 227149.7841. It is the baseline value of a house anywhere.
# I got the max/min by eyeballing, but when there are a lot of items (as later), which() and rownames() come in handy. For now ...
# NOTE(review): max()/min() here scan every coefficient including the
# intercept row; that works in this run only because the intercept happens
# not to be the extreme value.
> max(coef(reg.lasso, s = cv.lasso$lambda.min)); min(coef(reg.lasso, s = cv.lasso$lambda.min))
[1] 286949.4
[1] -128334.2
# Most expensive is Washington DC (makes sense, hello Beltway Bandits)
# Least is West Virginia (makes less sense in this context, but I know from including poverty in my regression that poverty is a factor, and I know from real life knowledge that WV is very poor, so I would imagine that's the reason.)
# 2e:
# The average (according to the regression) is found by setting all other states to 0 and running the regression.
# First I figure out which state is which:
> strain[1,]
(Intercept) stateAL stateAR stateAZ stateCA stateCO stateCT stateDC stateDE
1 0 0 0 0 0 0 0 0
stateFL stateGA stateHI stateIA stateIL stateIN stateKS stateKY stateLA
0 0 0 0 0 0 0 0 0
stateMA stateMD stateMI stateMN stateMO stateMT stateNC stateND stateNE
0 0 0 0 0 0 0 0 0
stateNH stateNJ stateNM stateNV stateNY stateOH stateOK stateOR statePA
0 0 0 0 1 0 0 0 0
stateRI stateSC stateTN stateTX stateUT stateVA stateWA stateWI stateWV
0 0 0 0 0 0 0 0 0
stateWY
0 # DC is the eighth element.
# NOTE(review): the next line is missing its ">" prompt in the transcript;
# it was typed at the console like the rest.
dc <- matrix(0, 1, 46)
> dc[1] <- 1; dc[8] <- 1
> predict(reg.lasso, newx=dc, s = cv.lasso$lambda.min)
1
[1,] 514099.2
# The predicted average home price in DC is 514099.2.
# WV is the 45th factor
> wv <- matrix(0, 1, 46)
> wv[1] <- 1; wv[45] <- 1
> predict(reg.lasso, newx=wv, s = cv.lasso$lambda.min)
1
[1,] 98815.6
# The predicted average home price in WV is 98815.6. (I should buy a home there ...)
# Other way to do both is to manually calculate using y = mx + b.
DC: > 227149.7841 + 286949.4
[1] 514099.2
WV: > 227149.7841 -128334.2
[1] 98815.58
# Same results, which is good for the sanity.
# Problem 3:
# This time I regress using state & county. The alldata and train objects are from earlier and remain the same.
> sc <- model.matrix(~ state + county, data=alldata)
> sctrain <- sc[1:8973,]
> features <- sctrain
> target<- as.matrix(train$price2013)
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)
# The cross-validation sucks a lot more this time ... makes sense, four hundred or so counties. Next time I'll use a sparse matrix. Spoiler: it worked great.
> cv.lasso <- cv.glmnet(features, target, alpha=1); cv.ridge <- cv.glmnet(features, target, alpha=0)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
> pre.lasso <- predict(reg.lasso, newx=sctrain, s = cv.lasso$lambda.min)
> pre.ridge <- predict(reg.ridge, newx=sctrain, s = cv.ridge$lambda.min)
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 21425989697
[1] 21528741448
# Again, I will take lasso.
# Use which()/rownames() this time — too many coefficients to eyeball.
# NOTE(review): max(coefs)/min(coefs) include the intercept row; fine here
# only because the intercept is not the extreme value.
> coefs <- coef(reg.lasso, s = cv.lasso$lambda.min)
> which(coefs==max(coefs))
# NOTE(review): the printed result of the line above (presumably "[1] 487")
# is missing from the transcript; the 487 index below was taken from it.
> rownames(coefs)[487]
[1] "countypitkin"
> which(coefs==min(coefs))
[1] 118
> rownames(coefs)[118]
[1] "countycalaveras"
# The county with the highest regression coef is Pitkin, which is apparently in Aspen, CO, where rich snow bunnies hang out. This makes sense.
# Calaveras is in California, which frog or no frog doesn't seem to be doing that well. I can't get any news about it, but I would assume it's a poor community that had some sort of huge bust. Or, my model could be wrong, overfitted, who knows.
# Problem 4:
# This time I include everything seemingly relevant. Sparse matrix this time, which made things run so much faster!
# NOTE(review): naming a variable "all" shadows base::all() in variable
# lookups — function calls to all() still resolve, but a different name
# (e.g. "allmat") would be safer style.
> all <- sparse.model.matrix(~ state + county + poverty + price2007, data=alldata)
> dim(all)
[1] 10036 681
> alltrain <- all[1:8973,]
> alltest <- all[8974:10036,]
> dim(alltrain)
[1] 8973 681
> dim(alltest)
[1] 1063 681
> features <- alltrain
> target <- as.matrix(train$price2013)
> dim(target)
[1] 8973 1
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)
> cv.lasso <- cv.glmnet(features, target, alpha=1)
> cv.ridge <- cv.glmnet(features, target, alpha=0)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
> pre.lasso <- predict(reg.lasso, newx=alltrain, s = cv.lasso$lambda.min)
> pre.ridge <- predict(reg.ridge, newx=alltrain, s = cv.ridge$lambda.min)
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 1784361138
[1] 2628787024
# Lasso again.
# THIS TIME use the test data, and let it go.
> pre.lasso <- predict(reg.lasso, newx=alltest, s = cv.lasso$lambda.min)
> results <- cbind(test$id, pre.lasso)
> colnames(results) <- c("id", "price2013")
> write.csv(results, file="iy_predictions.csv", row.names=FALSE)
# Submitted the results to Kaggle, after manually removing the quotes from the header of the .csv file.
# (NOTE: passing quote=FALSE to write.csv would avoid that manual edit.)