DID HW #2 - Writeup
Mar. 5th, 2014 10:21 pm

# Problem 1:
# Load the training and test data.
> train <- read.csv(url("http://terpconnect.umd.edu/~ying/did/hw2/house_train.csv"))
> test <- read.csv(url("http://terpconnect.umd.edu/~ying/did/hw2/house_test.csv"))

# Problem 2:
> library(glmnet)
> library(reshape)
> library(ggplot2)
# NOTE(review): mse() used below is not provided by glmnet/reshape/ggplot2 —
# presumably Metrics::mse() or a user-defined helper; confirm it is in scope.

# Build the state design matrix on train+test together so both splits share
# the same factor levels, then cut it back apart by row.
> alldata <- rbind(train, test)
> states <- model.matrix(~ state, data=alldata)
# I know all the dimensions already but I suppose one should really use nrow() to be proper.
# I do a lot of repeat codes to make this scan more easily, but yes I know I don't have to keep restating what the target vector is :)
> strain <- states[1:8973,]
> stest <- states[8974:10036,]
> target <- as.matrix(train$price2013)
> features <- strain
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)
> cv.lasso <- cv.glmnet(features, target, alpha=1)
> cv.ridge <- cv.glmnet(features, target, alpha=0)
# I choose my model by doing cross-validation, picking the minimum lambda, then
# taking whichever model gives me the least MSE on my training set. I realize
# this carries the risk of overfitting, but that's what the next class in the
# series is for, right?
> pre.lasso <- predict(reg.lasso, newx=strain, s = cv.lasso$lambda.min)
# BUG FIX(review): the original line predicted with reg.lasso here, so the
# "ridge" MSE reported below was really the lasso model evaluated at the ridge
# lambda. The printed ridge MSE should be regenerated after this correction.
> pre.ridge <- predict(reg.ridge, newx=strain, s = cv.ridge$lambda.min)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 33788837270
[1] 35632857052
# Therefore I'll take the lasso model, with the understanding it may overfit.
# 2a/b/c/d:
# The intercept and coefficients can be gotten by coef():
> coef(reg.lasso, s = cv.lasso$lambda.min)
47 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 227149.7841
(Intercept) .
# NOTE(review): two "(Intercept)" rows appear because the model.matrix
# intercept column was passed in as a feature; glmnet fits its own intercept
# and the redundant column's coefficient is zeroed out (shown as ".").
stateAL -85990.9670
stateAR -104402.0468
stateAZ -19926.0246
stateCA 285560.7377
stateCO 21489.0154
stateCT 61329.7801
stateDC 286949.4165
stateDE -19467.4054
stateFL -51970.3949
stateGA -84521.1624
stateHI 253084.7251
stateIA -73700.9678
stateIL -51776.4020
stateIN -91595.2483
stateKS -88381.5515
stateKY -80808.2027
stateLA -89117.6414
stateMA 112383.6192
stateMD 54729.5276
stateMI -83404.7964
stateMN -20680.2778
stateMO -73050.2768
stateMT -737.4277
stateNC -77347.7183
stateND -69363.0543
stateNE -87756.7266
stateNH -24453.7819
stateNJ 79363.9588
stateNM -37415.5226
stateNV -11814.3654
stateNY 71615.8727
stateOH -109662.8593
stateOK -117064.2779
stateOR -12157.7688
statePA -58426.0884
stateRI -3851.9908
stateSC -80878.6359
stateTN -96024.8720
stateTX -66391.1226
stateUT 10706.2624
stateVA 72810.2399
stateWA 39233.1109
stateWI -64147.7649
stateWV -128334.1829
stateWY -32036.7326
# The intercept is 227149.7841. It is the baseline value of a house anywhere.
# I got the max/min by eyeballing, but when there are a lot of items (as later), which() and rownames() come in handy. For now ...
# NOTE(review): max()/min() here scan every coefficient including the
# intercept row; that works in this run only because the intercept happens
# not to be the extreme value.
> max(coef(reg.lasso, s = cv.lasso$lambda.min)); min(coef(reg.lasso, s = cv.lasso$lambda.min))
[1] 286949.4
[1] -128334.2
# Most expensive is Washington DC (makes sense, hello Beltway Bandits)
# Least is West Virginia (makes less sense in this context, but I know from including poverty in my regression that poverty is a factor, and I know from real life knowledge that WV is very poor, so I would imagine that's the reason.)
# 2e:
# The average (according to the regression) is found by setting all other states to 0 and running the regression.
# First I figure out which state is which:
> strain[1,]
(Intercept) stateAL stateAR stateAZ stateCA stateCO stateCT stateDC stateDE
1 0 0 0 0 0 0 0 0
stateFL stateGA stateHI stateIA stateIL stateIN stateKS stateKY stateLA
0 0 0 0 0 0 0 0 0
stateMA stateMD stateMI stateMN stateMO stateMT stateNC stateND stateNE
0 0 0 0 0 0 0 0 0
stateNH stateNJ stateNM stateNV stateNY stateOH stateOK stateOR statePA
0 0 0 0 1 0 0 0 0
stateRI stateSC stateTN stateTX stateUT stateVA stateWA stateWI stateWV
0 0 0 0 0 0 0 0 0
stateWY
0 # DC is the eighth element.
# NOTE(review): the next line is missing its ">" prompt in the transcript;
# it was typed at the console like the rest.
dc <- matrix(0, 1, 46)
> dc[1] <- 1; dc[8] <- 1
> predict(reg.lasso, newx=dc, s = cv.lasso$lambda.min)
1
[1,] 514099.2
# The predicted average home price in DC is 514099.2.
# WV is the 45th factor
> wv <- matrix(0, 1, 46)
> wv[1] <- 1; wv[45] <- 1
> predict(reg.lasso, newx=wv, s = cv.lasso$lambda.min)
1
[1,] 98815.6
# The predicted average home price in WV is 98815.6. (I should buy a home there ...)
# Other way to do both is to manually calculate using y = mx + b.
DC: > 227149.7841 + 286949.4
[1] 514099.2
WV: > 227149.7841 -128334.2
[1] 98815.58
# Same results, which is good for the sanity.
# Problem 3:
# This time I regress using state & county. The alldata and train objects are from earlier and remain the same.
> sc <- model.matrix(~ state + county, data=alldata)
> sctrain <- sc[1:8973,]
> features <- sctrain
> target<- as.matrix(train$price2013)
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)
# The cross-validation sucks a lot more this time ... makes sense, four hundred or so counties. Next time I'll use a sparse matrix. Spoiler: it worked great.
> cv.lasso <- cv.glmnet(features, target, alpha=1); cv.ridge <- cv.glmnet(features, target, alpha=0)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
> pre.lasso <- predict(reg.lasso, newx=sctrain, s = cv.lasso$lambda.min)
> pre.ridge <- predict(reg.ridge, newx=sctrain, s = cv.ridge$lambda.min)
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 21425989697
[1] 21528741448
# Again, I will take lasso.
# Use which()/rownames() this time — too many coefficients to eyeball.
# NOTE(review): max(coefs)/min(coefs) include the intercept row; fine here
# only because the intercept is not the extreme value.
> coefs <- coef(reg.lasso, s = cv.lasso$lambda.min)
> which(coefs==max(coefs))
# NOTE(review): the printed result of the line above (presumably "[1] 487")
# is missing from the transcript; the 487 index below was taken from it.
> rownames(coefs)[487]
[1] "countypitkin"
> which(coefs==min(coefs))
[1] 118
> rownames(coefs)[118]
[1] "countycalaveras"
# The county with the highest regression coef is Pitkin, which is apparently in Aspen, CO, where rich snow bunnies hang out. This makes sense.
# Calaveras is in California, which frog or no frog doesn't seem to be doing that well. I can't get any news about it, but I would assume it's a poor community that had some sort of huge bust. Or, my model could be wrong, overfitted, who knows.
# Problem 4:
# This time I include everything seemingly relevant. Sparse matrix this time, which made things run so much faster!
# NOTE(review): naming a variable "all" shadows base::all() in variable
# lookups — function calls to all() still resolve, but a different name
# (e.g. "allmat") would be safer style.
> all <- sparse.model.matrix(~ state + county + poverty + price2007, data=alldata)
> dim(all)
[1] 10036 681
> alltrain <- all[1:8973,]
> alltest <- all[8974:10036,]
> dim(alltrain)
[1] 8973 681
> dim(alltest)
[1] 1063 681
> features <- alltrain
> target <- as.matrix(train$price2013)
> dim(target)
[1] 8973 1
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)
> cv.lasso <- cv.glmnet(features, target, alpha=1)
> cv.ridge <- cv.glmnet(features, target, alpha=0)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
> pre.lasso <- predict(reg.lasso, newx=alltrain, s = cv.lasso$lambda.min)
> pre.ridge <- predict(reg.ridge, newx=alltrain, s = cv.ridge$lambda.min)
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 1784361138
[1] 2628787024
# Lasso again.
# THIS TIME use the test data, and let it go.
> pre.lasso <- predict(reg.lasso, newx=alltest, s = cv.lasso$lambda.min)
> results <- cbind(test$id, pre.lasso)
> colnames(results) <- c("id", "price2013")
> write.csv(results, file="iy_predictions.csv", row.names=FALSE)
# Submitted the results to Kaggle, after manually removing the quotes from the header of the .csv file.
# (NOTE: passing quote=FALSE to write.csv would avoid that manual edit.)