helloworld: (Default)
[personal profile] helloworld
Wow ... this one was rough.

Deals with classification using logistic regression, SVMs, and decision trees.

Anyway, code to get through #1-3 below cut. All in R. I also did a script in Python but it turned out to be useless.

#4 is just to run this on the testing set provided, so I'm not going to belabor that here.



# e1071 provides svm(); rpart provides classification trees
library(e1071)
library(rpart)

# Load train/test CSVs; as.is = TRUE keeps strings as character
# (no automatic factor conversion)
train_path <- "D:/Documents/workspace/qb.train.csv"
test_path <- "D:/Documents/workspace/qb.test.csv"
qbtrain <- read.csv(train_path, as.is = TRUE)
qbtest <- read.csv(test_path, as.is = TRUE)

# tried adding a feature factor for whether page title is in text
# qbtrain$newtext <- tolower(qbtrain$text)
# qbtrain$newpage <- tolower(qbtrain$page)
# but everything is false so commenting it out, just showing my work
# did the text cleanup in python
# qbtrain$intext <- grepl(qbtrain$page[[1]], qbtrain$newtext)
# qbtrain$intext <- factor(qbtrain$intext)
# other useless features: indiv. tournaments, category (all lit!)


# Add engineered features: per-question word count plus indicator factors
# for appearing in specific tournaments.
# strsplit() is vectorized over the text column and lengths() counts the
# tokens of each split -- this avoids apply() over a data.frame, which
# coerces every row to a character matrix before the function runs.
qbtrain$wordcount <- lengths(strsplit(qbtrain$text, " "))
qbtrain$isacf <- factor(grepl("ACF", qbtrain$tournaments))
qbtrain$isbuzzer <- factor(grepl("Buzzerfest", qbtrain$tournaments))
qbtrain$isopen <- factor(grepl("Open", qbtrain$tournaments))
# Response variable: corr arrives as "True"/"False" strings; the
# classifiers below (glm binomial, svm, rpart "class") want a factor
qbtrain$corr <- factor(qbtrain$corr)

# Hold out ~1/9 of the training data for internal evaluation (the real
# test set is about 1/9 the size of the training set -- see note below).
# NOTE(review): no set.seed() here, so the split (and all downstream
# scores) differs on every run.
n <- nrow(qbtrain)
holdout_size <- round(n / 9, digits = 0)
testindex <- sample(seq_len(n), size = holdout_size)
trainset <- qbtrain[-testindex, ]
testset <- qbtrain[testindex, ]

# Nested feature frames for the held-out rows, one per candidate model
test.b <- data.frame(body_score = testset$body_score)
test.bi <- data.frame(body_score = testset$body_score,
                      inlinks = testset$inlinks)
test.biw <- data.frame(body_score = testset$body_score,
                       inlinks = testset$inlinks,
                       wordcount = testset$wordcount)
test.biwt <- data.frame(body_score = testset$body_score,
                        inlinks = testset$inlinks,
                        wordcount = testset$wordcount,
                        isacf = testset$isacf,
                        isbuzzer = testset$isbuzzer,
                        isopen = testset$isopen)

# Baseline: most-frequent-class accuracy (always guessing "False")
baseline <- mean(testset$corr == "False")

# Ground-truth labels as logicals; go through as.character so the factor
# LEVELS ("True"/"False"), not the integer codes, are coerced
realanswers <- as.logical(as.character(testset$corr))
results <- data.frame(model = "MFC", classifier = "None", score = baseline)

# Fit nested logistic regressions on the training half; corr is a
# two-level factor, so family = "binomial" gives a logit model.
l1 <- glm(corr ~ body_score,
          family = "binomial", data = trainset)
l2 <- glm(corr ~ body_score + inlinks,
          family = "binomial", data = trainset)
l3 <- glm(corr ~ body_score + inlinks + wordcount,
          family = "binomial", data = trainset)
l4 <- glm(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen,
          family = "binomial", data = trainset)

# Score a fitted logit model on held-out data and append the accuracy to
# the running results table.
#
#   df      results data.frame (columns model / classifier / score)
#   name    label describing the feature set used
#   model   fitted glm (family = binomial)
#   data    data.frame of features for the held-out rows
#   answers logical vector of true labels; defaults to the global
#           `realanswers` so existing call sites are unchanged
#
# Returns df with one new row appended.
score.logit <- function(df, name, model, data, answers = realanswers) {
  # predict.glm returns the linear predictor (log-odds) by default,
  # so > 0 is equivalent to predicted probability > 0.5
  logit.pred <- predict.glm(model, newdata = data)
  guesses <- logit.pred > 0
  score <- mean(guesses == answers)
  rbind(df, data.frame(model = name, classifier = "Logit", score = score))
}

# Held-out accuracy for each nested logit feature set, appended to results
results <- score.logit(results, "body_score", l1, test.b)
results <- score.logit(results, "body_score, inlinks", l2, test.bi)
results <- score.logit(results, "body_score, inlinks, wordcount", l3, test.biw)
results <- score.logit(results, "body_score, inlinks, wordcount, ACF/Buzzerfest/Open,", l4, test.biwt)

# Score a fitted SVM on held-out data and append the accuracy to the
# running results table.
#
#   df      results data.frame (columns model / classifier / score)
#   name    label describing the feature set used
#   model   fitted e1071::svm object (factor response -> classification)
#   data    data.frame of features for the held-out rows
#   answers logical vector of true labels; defaults to the global
#           `realanswers` so existing call sites are unchanged
#
# Returns df with one new row appended.
score.svm <- function(df, name, model, data, answers = realanswers) {
  # Classification svm() predicts a factor; coerce via as.character so
  # the level labels (not the integer codes) become logicals --
  # consistent with score.tree
  svm.pred <- as.logical(as.character(predict(model, newdata = data)))
  score <- mean(svm.pred == answers)
  rbind(df, data.frame(model = name, classifier = "SVM", score = score))
}

# Fit SVMs on the same nested feature sets; with a factor response,
# e1071::svm defaults to C-classification (RBF kernel)
s1 <- svm(corr ~ body_score, data=trainset)
s2 <- svm(corr ~ body_score + inlinks, data=trainset)
s3 <- svm(corr ~ body_score + inlinks + wordcount, data=trainset)
s4 <- svm(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen, data=trainset)

# Held-out accuracy for each nested SVM feature set, appended to results
results <- score.svm(results, "body_score", s1, test.b)
results <- score.svm(results, "body_score, inlinks", s2, test.bi)
results <- score.svm(results, "body_score, inlinks, wordcount", s3, test.biw)
results <- score.svm(results, "body_score, inlinks, wordcount, tournaments", s4, test.biwt)

# Score a fitted classification tree on held-out data and append the
# accuracy to the running results table.
#
#   df      results data.frame (columns model / classifier / score)
#   name    label describing the feature set used
#   model   fitted rpart object (method = "class")
#   data    data.frame of features for the held-out rows
#   answers logical vector of true labels; defaults to the global
#           `realanswers` so existing call sites are unchanged
#
# Returns df with one new row appended.
score.tree <- function(df, name, model, data, answers = realanswers) {
  # type = "class" asks rpart for predicted class labels (a factor);
  # coerce via as.character so labels, not codes, become logicals
  tree.pred <- predict(model, newdata = data, type = "class")
  guesses <- as.logical(as.character(tree.pred))
  score <- mean(guesses == answers)
  rbind(df, data.frame(model = name, classifier = "Decision Tree", score = score))
}

# Fit classification trees (method = "class") on the same nested
# feature sets
t1 <- rpart(corr ~ body_score, method="class", data=trainset)
t2 <- rpart(corr ~ body_score + inlinks, method="class", data=trainset)
t3 <- rpart(corr ~ body_score + inlinks + wordcount, method="class", data=trainset)
t4 <- rpart(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen, method="class", data=trainset)

# Held-out accuracy for each nested tree feature set, appended to results
results <- score.tree(results, "body_score", t1, test.b)
results <- score.tree(results, "body_score, inlinks", t2, test.bi)
results <- score.tree(results, "body_score, inlinks, wordcount", t3, test.biw)
results <- score.tree(results, "body_score, inlinks, wordcount, tournaments", t4, test.biwt)

# Persist the full model-comparison table (space-delimited, quoted)
write.table(results, file="HW3.txt", row.names=FALSE)

# after looking at a couple of iterations, I decide that SVM using inlinks, body_score, and wordcount is aces
# either tournaments don't help, or it needs more refining?
# decide to go with the 1:9 partition as it seems to work the best, also, the real testing set is about 1/9 the size of training set
# so here we go, this is for kaggle gold
# Recompute wordcount for the real test questions AND attach it as a
# column: the original computed `wc` but never assigned it to qbtest, so
# data.frame() below read qbtest$wordcount as NULL and the feature was
# silently missing.
qbtest$wordcount <- lengths(strsplit(qbtest$text, " "))
actualtesting <- data.frame(body_score = qbtest$body_score,
                            inlinks = qbtest$inlinks,
                            wordcount = qbtest$wordcount)

# Histogram of word counts (5-word bins), faceted by whether the
# answer was correct
library(ggplot2)
sp <- ggplot(qbtrain, aes(x = wordcount)) + geom_histogram(binwidth = 5)
# Explicit print() instead of relying on top-level auto-printing
print(sp + facet_wrap(~ corr, ncol = 2) + ggtitle("Wordplot Distribution vs T/F"))


And the results. The best was the 1:9 partition, as below.

"model" "classifier" "score"
"MFC" "None" 0.560133630289532
"body_score" "Logit" 0.670378619153675
"body_score, inlinks" "Logit" 0.679287305122494
"body_score, inlinks, wordcount" "Logit" 0.710467706013363
"body_score, inlinks, wordcount, ACF/Buzzerfest/Open," "Logit" 0.714922048997773
"body_score" "SVM" 0.7728285077951
"body_score, inlinks" "SVM" 0.776169265033408
"body_score, inlinks, wordcount" "SVM" 0.814031180400891
"body_score, inlinks, wordcount, tournaments" "SVM" 0.799554565701559
"body_score" "Decision Tree" 0.762806236080178
"body_score, inlinks" "Decision Tree" 0.779510022271715
"body_score, inlinks, wordcount" "Decision Tree" 0.779510022271715
"body_score, inlinks, wordcount, tournaments" "Decision Tree" 0.779510022271715


2. The code to remove most punctuation from the text field was done in Python.

# Characters stripped from every line. The original spelled this as
# '"?./;:(),\!"' -- the stray backslash was an invalid escape sequence
# (kept literally, so backslashes were stripped too) and the closing
# quote was duplicated. The raw string below keeps the exact same
# character set without the escape warning.
PUNCT = r'"?./;:(),\!'


def cleanup():
    """Given a file of text strings, write a new file with punctuation
    and tabs removed.  In this case the file is hard-coded.

    Reads 'text.txt' from the working directory.  For each line, runs of
    whitespace (spaces, tabs, the trailing newline) are collapsed to
    single spaces and every character in PUNCT is deleted; the cleaned
    lines are written to 'cleaned.txt', one per input line.
    """
    strip_table = str.maketrans('', '', PUNCT)
    cleaned_lines = []
    # 'with' closes the input file even if an exception occurs
    # (the original left the handle open on error)
    with open('text.txt', 'r') as reading:
        for line in reading:
            # split()/join collapses all whitespace to single spaces
            newline = ' '.join(line.split())
            # translate deletes every PUNCT character in one pass,
            # replacing the original per-character replace() loop
            cleaned_lines.append(newline.translate(strip_table))

    with open('cleaned.txt', mode='wt') as myfile:
        myfile.write('\n'.join(cleaned_lines))


But as I said, since the answer generation algorithm didn't have this problem, it turned out to be a totally useless feature.
This account has disabled anonymous posting.
If you don't have an account you can create one now.
HTML doesn't work in the subject.
More info about formatting

Profile

helloworld: (Default)
apply(myLife, fuck)

October 2015

S M T W T F S
    123
4 5678910
11121314151617
18192021222324
25262728293031

Most Popular Tags

Style Credit

Expand Cut Tags

No cut tags