helloworld: (Default)
[personal profile] helloworld

# Problem 1:

# Read the course's train/test splits straight from the web.
# (read.csv() accepts a URL string directly, so the url() wrapper is harmless but unnecessary.)
> train <- read.csv(url("http://terpconnect.umd.edu/~ying/did/hw2/house_train.csv"))
> test <- read.csv(url("http://terpconnect.umd.edu/~ying/did/hw2/house_test.csv"))


# Problem 2:

# Load the modeling stack: glmnet for lasso/ridge. reshape and ggplot2 are loaded
# but not used in any command shown below -- presumably for plotting off-screen.
> library(glmnet)
> library(reshape)
> library(ggplot2)

# Stack train and test before dummy-coding so model.matrix() sees every state
# level once; this keeps the indicator columns aligned between the two splits.
> alldata <- rbind(train, test)

> states <- model.matrix(~ state, data=alldata)


# I know all the dimensions already but I suppose one should really use nrow() to be proper
# I do a lot of repeat codes to make this scan more easily, but yes I know I don't have to keep restating what the target vector is :)

# Split the dummy matrix back apart (rows 1-8973 = train, 8974-10036 = test),
# then fit lasso (alpha=1) and ridge (alpha=0) plus their cross-validated paths.
> strain <- states[1:8973,]
> stest <- states[8974:10036,]
> target<- as.matrix(train$price2013)

> features <- strain
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)
> cv.lasso <- cv.glmnet(features, target, alpha=1)
> cv.ridge <- cv.glmnet(features, target, alpha=0)


# I choose my model by doing cross-validation, picking the minimum lambda, then taking whichever model gives me the least MSE on my training set. I realize this carries the risk of overfitting, but that's what the next class in the series is for, right?

# NOTE(review): BUG -- the "ridge" prediction below calls predict() on reg.lasso,
# not reg.ridge, so mse.ridge is really the lasso model evaluated at ridge's
# lambda.min. The printed 35632857052 reflects that mistake; rerun with
# predict(reg.ridge, ...) to get the true ridge training MSE before comparing.
> pre.lasso <- predict(reg.lasso, newx=strain, s = cv.lasso$lambda.min)
> pre.ridge <- predict(reg.lasso, newx=strain, s = cv.ridge$lambda.min)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
# NOTE(review): mse() is not provided by base R or any library() call shown above
# -- presumably loaded off-screen from a package such as Metrics or hydroGOF; confirm.
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 33788837270
[1] 35632857052


# Therefore I'll take the lasso model, with the understanding it may overfit.

# 2a/b/c/d:
# The intercept and coefficients can be gotten by coef():

> coef(reg.lasso, s = cv.lasso$lambda.min)
47 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 227149.7841
(Intercept) .
stateAL -85990.9670
stateAR -104402.0468
stateAZ -19926.0246
stateCA 285560.7377
stateCO 21489.0154
stateCT 61329.7801
stateDC 286949.4165
stateDE -19467.4054
stateFL -51970.3949
stateGA -84521.1624
stateHI 253084.7251
stateIA -73700.9678
stateIL -51776.4020
stateIN -91595.2483
stateKS -88381.5515
stateKY -80808.2027
stateLA -89117.6414
stateMA 112383.6192
stateMD 54729.5276
stateMI -83404.7964
stateMN -20680.2778
stateMO -73050.2768
stateMT -737.4277
stateNC -77347.7183
stateND -69363.0543
stateNE -87756.7266
stateNH -24453.7819
stateNJ 79363.9588
stateNM -37415.5226
stateNV -11814.3654
stateNY 71615.8727
stateOH -109662.8593
stateOK -117064.2779
stateOR -12157.7688
statePA -58426.0884
stateRI -3851.9908
stateSC -80878.6359
stateTN -96024.8720
stateTX -66391.1226
stateUT 10706.2624
stateVA 72810.2399
stateWA 39233.1109
stateWI -64147.7649
stateWV -128334.1829
stateWY -32036.7326


# The intercept is 227149.7841. It is the baseline predicted price for the reference state (the factor level model.matrix drops -- apparently AK, since the listed coefficients start at AL); every state coefficient is an offset from that baseline, not from "a house anywhere".
# I got the max/min by eyeballing, but when there are a lot of items (as later), which() and rownames() come in handy. For now ...

# max()/min() over the whole coefficient vector also compare against the intercept;
# that works here only because DC's coefficient (286949) happens to exceed the
# intercept (227149) -- with a different fit the intercept itself could "win".
> max(coef(reg.lasso, s = cv.lasso$lambda.min)); min(coef(reg.lasso, s = cv.lasso$lambda.min))
[1] 286949.4
[1] -128334.2


# Most expensive is Washington DC (makes sense, hello Beltway Bandits)

# Least is West Virginia (makes less sense in this context, but I know from including poverty in my regression that poverty is a factor, and I know from real life knowledge that WV is very poor, so I would imagine that's the reason.)

# 2e:
# The average (according to the regression) for a given state is found by building a one-row input with the intercept column and that state's indicator set to 1 (all other state indicators 0) and calling predict() on it.
# First I figure out which state is which:

> strain[1,]
(Intercept) stateAL stateAR stateAZ stateCA stateCO stateCT stateDC stateDE
1 0 0 0 0 0 0 0 0
stateFL stateGA stateHI stateIA stateIL stateIN stateKS stateKY stateLA
0 0 0 0 0 0 0 0 0
stateMA stateMD stateMI stateMN stateMO stateMT stateNC stateND stateNE
0 0 0 0 0 0 0 0 0
stateNH stateNJ stateNM stateNV stateNY stateOH stateOK stateOR statePA
0 0 0 0 1 0 0 0 0
stateRI stateSC stateTN stateTX stateUT stateVA stateWA stateWI stateWV
0 0 0 0 0 0 0 0 0
stateWY
0


# DC is the eighth column of the design matrix (the intercept is the first).

> dc <- matrix(0, 1, 46)
> dc[1] <- 1; dc[8] <- 1

> predict(reg.lasso, newx=dc, s = cv.lasso$lambda.min)
1
[1,] 514099.2


# The predicted average home price in DC is 514099.2.

# WV is the 45th column of the design matrix

> wv <- matrix(0, 1, 46)
> wv[1] <- 1; wv[45] <- 1
> predict(reg.lasso, newx=wv, s = cv.lasso$lambda.min)
1
[1,] 98815.6


# The predicted average home price in WV is 98815.6. (I should buy a home there ...)

# Other way to do both is to manually calculate using y = mx + b.

DC: > 227149.7841 + 286949.4
[1] 514099.2
WV: > 227149.7841 -128334.2
[1] 98815.58


# Same results, which is good for the sanity.

# Problem 3:

# This time I regress using state & county. The alldata and train objects are from earlier and remain the same.

# Problem 3 design matrix: state + county dummies. This is a dense matrix --
# hence the slow cross-validation noted below; Problem 4's sparse version is the fix.
> sc <- model.matrix(~ state + county, data=alldata)
> sctrain <- sc[1:8973,]
> features <- sctrain
> target<- as.matrix(train$price2013)
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)


# The cross-validation sucks a lot more this time ... makes sense, four hundred or so counties. Next time I'll use a sparse matrix. Spoiler: it worked great.

> cv.lasso <- cv.glmnet(features, target, alpha=1); cv.ridge <- cv.glmnet(features, target, alpha=0)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
# Training-set MSE for each model at its CV-chosen lambda.min. (Note pre.ridge
# correctly uses reg.ridge here, unlike the Problem 2 run earlier in this file.)
> pre.lasso <- predict(reg.lasso, newx=sctrain, s = cv.lasso$lambda.min)
> pre.ridge <- predict(reg.ridge, newx=sctrain, s = cv.ridge$lambda.min)
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 21425989697
[1] 21528741448


# Again, I will take lasso.

> coefs <- coef(reg.lasso, s = cv.lasso$lambda.min)
# Locate the largest/smallest coefficients by index, then look up their row names.
> which(coefs==max(coefs))
# (transcript note: the printed index is missing above; per the next line it was 487)
> rownames(coefs)[487]
[1] "countypitkin"
> which(coefs==min(coefs))
[1] 118
> rownames(coefs)[118]
[1] "countycalaveras"


# The county with the highest regression coef is Pitkin, which is apparently the county containing Aspen, CO, where rich snow bunnies hang out. This makes sense.

# Calaveras is in California, which frog or no frog doesn't seem to be doing that well. I can't get any news about it, but I would assume it's a poor community that had some sort of huge bust. Or, my model could be wrong, overfitted, who knows.

# Problem 4:

# This time I include everything seemingly relevant. Sparse matrix this time, which made things run so much faster!

# Problem 4: sparse design matrix over state + county + poverty + price2007.
# sparse.model.matrix() comes from the Matrix package (no explicit library(Matrix)
# appears above -- presumably attached as a glmnet dependency; confirm).
> all <- sparse.model.matrix(~ state + county + poverty + price2007, data=alldata)
> dim(all)
[1] 10036 681
# Same 8973/1063 train/test split as before, sanity-checked with dim().
> alltrain <- all[1:8973,]
> alltest <- all[8974:10036,]
> dim(alltrain)
[1] 8973 681
> dim(alltest)
[1] 1063 681
> features <- alltrain
> target <- as.matrix(train$price2013)
> dim(target)
[1] 8973 1
> reg.lasso <- glmnet(features, target, alpha=1)
> reg.ridge <- glmnet(features, target, alpha=0)
> cv.lasso <- cv.glmnet(features, target, alpha=1)
> cv.ridge <- cv.glmnet(features, target, alpha=0)
> obs.lasso <- train$price2013
> obs.ridge <- train$price2013
# Training-set MSE comparison at each model's CV lambda.min (both models used
# correctly this time).
> pre.lasso <- predict(reg.lasso, newx=alltrain, s = cv.lasso$lambda.min)
> pre.ridge <- predict(reg.ridge, newx=alltrain, s = cv.ridge$lambda.min)
> mse.lasso <- mse(obs.lasso, pre.lasso)
> mse.ridge <- mse(obs.ridge, pre.ridge)
> mse.lasso; mse.ridge
[1] 1784361138
[1] 2628787024


# Lasso again.

# THIS TIME use the test data, and let it go.

# Predict 2013 prices for the held-out test rows and write the Kaggle submission
# (two columns: id, price2013; no row names).
> pre.lasso <- predict(reg.lasso, newx=alltest, s = cv.lasso$lambda.min)
> results <- cbind(test$id, pre.lasso)
> colnames(results) <- c("id", "price2013")
> write.csv(results, file="iy_predictions.csv", row.names=FALSE)


Submitted the results to Kaggle, after manually removing the quotes from the header of the .csv file. (Passing quote=FALSE to write.csv would have avoided that manual step.)
This account has disabled anonymous posting.
If you don't have an account you can create one now.
HTML doesn't work in the subject.
More info about formatting

Profile

helloworld: (Default)
apply(myLife, fuck)

October 2015

S M T W T F S
    123
4 5678910
11121314151617
18192021222324
25262728293031

Most Popular Tags

Style Credit

Expand Cut Tags

No cut tags