DID HW #3 1-3
Mar. 18th, 2014 02:01 am

Wow ... this one was rough.
Deals with classification using logistic regression, SVMs, and decision trees.
Anyway, code to get through #1-3 below cut. All in R. I also did a script in Python but it turned out to be useless.
#4 is just to run this on the testing set provided, so I'm not going to belabor that here.
And the results. The best was the 1:9 partition, as below.
2. The code to remove most punctuation from the text field was done in Python.
But as I said, since the answer generation algorithm didn't have this problem, it turned out to be a totally useless feature.
Deals with classification using logistic regression, SVMs, and decision trees.
Anyway, code to get through #1-3 below cut. All in R. I also did a script in Python but it turned out to be useless.
#4 is just to run this on the testing set provided, so I'm not going to belabor that here.
# e1071 provides svm(); rpart provides decision trees.
library(e1071)
library(rpart)
# Load the training and test sets. as.is=TRUE keeps string columns as
# character instead of auto-converting them to factors.
# NOTE(review): hardcoded absolute Windows paths — not portable.
qbtrain <- read.csv("D:/Documents/workspace/qb.train.csv", as.is=TRUE)
qbtest <- read.csv("D:/Documents/workspace/qb.test.csv", as.is=TRUE)
# Tried adding a feature factor for whether the page title is in the text
# (text cleanup was done in Python), but everything came out FALSE, so it
# is left commented out to show the work:
# qbtrain$newtext <- tolower(qbtrain$text)
# qbtrain$newpage <- tolower(qbtrain$page)
# qbtrain$intext <- factor(grepl(qbtrain$page[[1]], qbtrain$newtext))
# Other useless features: individual tournaments, category (all lit!)

# Add features: word count of the question text, plus indicator factors
# for membership in specific tournaments.
# vapply over strsplit is vectorized and type-safe; the original used
# apply(qbtrain, 1, ...), which coerces the whole data.frame to a
# character matrix row by row.
qbtrain$wordcount <- vapply(strsplit(qbtrain$text, " "), length, integer(1))
qbtrain$isacf <- factor(grepl("ACF", qbtrain$tournaments))
qbtrain$isbuzzer <- factor(grepl("Buzzerfest", qbtrain$tournaments))
qbtrain$isopen <- factor(grepl("Open", qbtrain$tournaments))
# Response as a factor ("True"/"False") so svm()/rpart() treat this as
# classification rather than regression.
qbtrain$corr <- factor(qbtrain$corr)
# Hold out a random 1/9 of the rows for evaluation (the original comment
# said "half", but the partition is 1:9 — chosen because the real test
# set is roughly 1/9 the size of the training set).
# NOTE(review): no set.seed(), so the split is not reproducible run-to-run.
trainingindex <- seq_len(nrow(qbtrain))
selection <- round(nrow(qbtrain) / 9, digits = 0)
testindex <- sample(trainingindex, size = selection)
trainset <- qbtrain[-testindex, ]
testset <- qbtrain[testindex, ]
# Feature frames for prediction, one per nested model (b = body_score,
# i = inlinks, w = wordcount, t = tournament indicators).
test.b    <- testset[, "body_score", drop = FALSE]
test.bi   <- testset[, c("body_score", "inlinks")]
test.biw  <- testset[, c("body_score", "inlinks", "wordcount")]
test.biwt <- testset[, c("body_score", "inlinks", "wordcount",
                         "isacf", "isbuzzer", "isopen")]
# Baseline: most-frequent-class accuracy — the share of "False" labels in
# the held-out set. (<- instead of =; mean() instead of sum()/length().)
baseline <- mean(testset$corr == "False")
# Ground-truth labels as logicals; as.logical accepts the "True"/"False"
# spellings via the factor's levels.
realanswers <- as.logical(testset$corr)
# Results table seeded with the baseline row; each scorer appends to it.
results <- data.frame(model = "MFC", classifier = "None", score = baseline)
# Calculates score for each logit model
# Nested logistic regressions of increasing complexity, fit on the
# training partition; family="binomial" makes glm a logit model.
l1 <- glm(corr ~ body_score, data=trainset, family="binomial")
l2 <- glm(corr ~ body_score + inlinks, data=trainset, family="binomial")
l3 <- glm(corr ~ body_score + inlinks + wordcount, data=trainset, family="binomial")
l4 <- glm(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen, data=trainset, family="binomial")
# Score a fitted logit model and append one row to the results table.
# predict() on a binomial glm defaults to the link (log-odds) scale, so a
# prediction > 0 is equivalent to p > 0.5, i.e. a "True" guess.
#
# df      results data.frame to append to
# name    label describing the feature set
# model   fitted glm (binomial) object
# data    feature data.frame for the held-out rows
# answers logical ground truth; defaults to the global realanswers so
#         existing call sites keep working
# Returns df with one scored row appended.
score.logit <- function(df, name, model, data, answers = realanswers) {
  guesses <- predict(model, newdata = data) > 0
  score <- mean(guesses == answers)
  rbind(df, data.frame(model = name, classifier = "Logit", score = score))
}
# Score each logit model against its matching feature frame.
results <- score.logit(results, "body_score", l1, test.b)
results <- score.logit(results, "body_score, inlinks", l2, test.bi)
results <- score.logit(results, "body_score, inlinks, wordcount", l3, test.biw)
results <- score.logit(results, "body_score, inlinks, wordcount, ACF/Buzzerfest/Open,", l4, test.biwt)
# Score a fitted SVM classifier and append one row to the results table.
# svm() was fit on a factor response, so predict() returns factor levels
# "True"/"False", which as.logical() maps to TRUE/FALSE.
#
# df/name/model/data as in score.logit; answers is the logical ground
# truth, defaulting to the global realanswers for compatibility with the
# existing call sites. Returns df with one scored row appended.
score.svm <- function(df, name, model, data, answers = realanswers) {
  guesses <- as.logical(predict(model, newdata = data))
  score <- mean(guesses == answers)
  rbind(df, data.frame(model = name, classifier = "SVM", score = score))
}
# SVMs (default radial kernel) over the same nested feature sets,
# then score each one against its matching feature frame.
s1 <- svm(corr ~ body_score, data=trainset)
s2 <- svm(corr ~ body_score + inlinks, data=trainset)
s3 <- svm(corr ~ body_score + inlinks + wordcount, data=trainset)
s4 <- svm(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen, data=trainset)
results <- score.svm(results, "body_score", s1, test.b)
results <- score.svm(results, "body_score, inlinks", s2, test.bi)
results <- score.svm(results, "body_score, inlinks, wordcount", s3, test.biw)
results <- score.svm(results, "body_score, inlinks, wordcount, tournaments", s4, test.biwt)
# Score a fitted rpart classification tree and append one row to the
# results table. type="class" returns predicted factor levels
# ("True"/"False"); as.character + as.logical turns them into logicals.
#
# df/name/model/data as in score.logit; answers is the logical ground
# truth, defaulting to the global realanswers for compatibility with the
# existing call sites. Returns df with one scored row appended.
score.tree <- function(df, name, model, data, answers = realanswers) {
  preds <- predict(model, newdata = data, type = "class")
  guesses <- as.logical(as.character(preds))
  score <- mean(guesses == answers)
  rbind(df, data.frame(model = name, classifier = "Decision Tree", score = score))
}
# Decision trees (method="class" forces classification) over the same
# nested feature sets; score each, then dump the results table to disk.
t1 <- rpart(corr ~ body_score, method="class", data=trainset)
t2 <- rpart(corr ~ body_score + inlinks, method="class", data=trainset)
t3 <- rpart(corr ~ body_score + inlinks + wordcount, method="class", data=trainset)
t4 <- rpart(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen, method="class", data=trainset)
results <- score.tree(results, "body_score", t1, test.b)
results <- score.tree(results, "body_score, inlinks", t2, test.bi)
results <- score.tree(results, "body_score, inlinks, wordcount", t3, test.biw)
results <- score.tree(results, "body_score, inlinks, wordcount, tournaments", t4, test.biwt)
write.table(results, file="HW3.txt", row.names=FALSE)
# After a couple of iterations: SVM on body_score + inlinks + wordcount
# is the winner — either tournaments don't help, or they need refining.
# Sticking with the 1:9 partition: it works best, and the real test set
# is about 1/9 the size of the training set.
# So here we go, this is for kaggle gold.
# BUG FIX: the recomputed word count must be attached to qbtest — the
# original computed a local wc but then read the nonexistent
# qbtest$wordcount column.
qbtest$wordcount <- vapply(strsplit(qbtest$text, " "), length, integer(1))
actualtesting <- data.frame(body_score=qbtest$body_score, inlinks=qbtest$inlinks, wordcount=qbtest$wordcount)
# To make the graph: histogram of word counts (bins of 5), to be faceted
# by the True/False label on the next line.
library(ggplot2)
sp <- ggplot(qbtrain, aes(x=wordcount)) + geom_histogram(binwidth=5)
sp + facet_wrap( ~ corr, ncol = 2) + ggtitle("Wordplot Distribution vs T/F")

And the results. The best was the 1:9 partition, as below.
"model" "classifier" "score"
"MFC" "None" 0.560133630289532
"body_score" "Logit" 0.670378619153675
"body_score, inlinks" "Logit" 0.679287305122494
"body_score, inlinks, wordcount" "Logit" 0.710467706013363
"body_score, inlinks, wordcount, ACF/Buzzerfest/Open," "Logit" 0.714922048997773
"body_score" "SVM" 0.7728285077951
"body_score, inlinks" "SVM" 0.776169265033408
"body_score, inlinks, wordcount" "SVM" 0.814031180400891
"body_score, inlinks, wordcount, tournaments" "SVM" 0.799554565701559
"body_score" "Decision Tree" 0.762806236080178
"body_score, inlinks" "Decision Tree" 0.779510022271715
"body_score, inlinks, wordcount" "Decision Tree" 0.779510022271715
"body_score, inlinks, wordcount, tournaments" "Decision Tree" 0.779510022271715

2. The code to remove most punctuation from the
text field was done in Python.def cleanup():
""" Given a file of text strings, writes a new file with all non-alphabet
characters and tabs removed. In this case I have specified the file.
"""
cleaned_file = []
input_file = 'text.txt'
reading = open(input_file, 'r')
for line in reading:
newline = ' '.join(line.split())
for char in newline:
if char in '"?./;:(),\!"':
newline = newline.replace(char,'')
else:
continue
cleaned_file.append(newline)
reading.close()
with open('cleaned.txt', mode='wt') as myfile:
myfile.write('\n'.join(cleaned_file))But as I said, since the answer generation algorithm didn't have this problem, it turned out to be a totally useless feature.