DID HW #3 1-3
Mar. 18th, 2014 02:01 am

Wow ... this one was rough.
Deals with classification using logistic regression, SVMs, and decision trees.
Anyway, code to get through #1-3 below cut. All in R. I also did a script in Python but it turned out to be useless.
#4 is just to run this on the testing set provided, so I'm not going to belabor that here.
And the results. The best was the 1:9 partition, as below.
2. The code to remove most punctuation from the text field was done in Python.
But as I said, since the answer generation algorithm didn't have this problem, it turned out to be a totally useless feature.
Deals with classification using logistic regression, SVMs, and decision trees.
Anyway, code to get through #1-3 below cut. All in R. I also did a script in Python but it turned out to be useless.
#4 is just to run this on the testing set provided, so I'm not going to belabor that here.
# e1071 provides svm(); rpart provides decision trees.
library(e1071)
library(rpart)
# Load the training and test sets. as.is=TRUE keeps string columns as
# character instead of auto-converting them to factors.
# NOTE(review): hardcoded absolute Windows paths — not portable.
qbtrain <- read.csv("D:/Documents/workspace/qb.train.csv", as.is=TRUE)
qbtest <- read.csv("D:/Documents/workspace/qb.test.csv", as.is=TRUE)
# Tried adding a feature factor for whether the page title is in the text
# (text cleanup was done in Python), but everything came out FALSE, so it
# is left commented out to show the work:
# qbtrain$newtext <- tolower(qbtrain$text)
# qbtrain$newpage <- tolower(qbtrain$page)
# qbtrain$intext <- factor(grepl(qbtrain$page[[1]], qbtrain$newtext))
# Other useless features: individual tournaments, category (all lit!)

# Add features: word count of the question text, plus indicator factors
# for membership in specific tournaments.
# vapply over strsplit is vectorized and type-safe; the original used
# apply(qbtrain, 1, ...), which coerces the whole data.frame to a
# character matrix row by row.
qbtrain$wordcount <- vapply(strsplit(qbtrain$text, " "), length, integer(1))
qbtrain$isacf <- factor(grepl("ACF", qbtrain$tournaments))
qbtrain$isbuzzer <- factor(grepl("Buzzerfest", qbtrain$tournaments))
qbtrain$isopen <- factor(grepl("Open", qbtrain$tournaments))
# Response as a factor ("True"/"False") so svm()/rpart() treat this as
# classification rather than regression.
qbtrain$corr <- factor(qbtrain$corr)
# Hold out a random 1/9 of the rows for evaluation (the original comment
# said "half", but the partition is 1:9 — chosen because the real test
# set is roughly 1/9 the size of the training set).
# NOTE(review): no set.seed(), so the split is not reproducible run-to-run.
trainingindex <- seq_len(nrow(qbtrain))
selection <- round(nrow(qbtrain) / 9, digits = 0)
testindex <- sample(trainingindex, size = selection)
trainset <- qbtrain[-testindex, ]
testset <- qbtrain[testindex, ]
# Feature frames for prediction, one per nested model (b = body_score,
# i = inlinks, w = wordcount, t = tournament indicators).
test.b    <- testset[, "body_score", drop = FALSE]
test.bi   <- testset[, c("body_score", "inlinks")]
test.biw  <- testset[, c("body_score", "inlinks", "wordcount")]
test.biwt <- testset[, c("body_score", "inlinks", "wordcount",
                         "isacf", "isbuzzer", "isopen")]
# Baseline: most-frequent-class accuracy — the share of "False" labels in
# the held-out set. (<- instead of =; mean() instead of sum()/length().)
baseline <- mean(testset$corr == "False")
# Ground-truth labels as logicals; as.logical accepts the "True"/"False"
# spellings via the factor's levels.
realanswers <- as.logical(testset$corr)
# Results table seeded with the baseline row; each scorer appends to it.
results <- data.frame(model = "MFC", classifier = "None", score = baseline)
# Calculates score for each logit model
# Nested logistic regressions of increasing complexity, fit on the
# training partition; family="binomial" makes glm a logit model.
l1 <- glm(corr ~ body_score, data=trainset, family="binomial")
l2 <- glm(corr ~ body_score + inlinks, data=trainset, family="binomial")
l3 <- glm(corr ~ body_score + inlinks + wordcount, data=trainset, family="binomial")
l4 <- glm(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen, data=trainset, family="binomial")
# Score a fitted logit model and append one row to the results table.
# predict() on a binomial glm defaults to the link (log-odds) scale, so a
# prediction > 0 is equivalent to p > 0.5, i.e. a "True" guess.
#
# df      results data.frame to append to
# name    label describing the feature set
# model   fitted glm (binomial) object
# data    feature data.frame for the held-out rows
# answers logical ground truth; defaults to the global realanswers so
#         existing call sites keep working
# Returns df with one scored row appended.
score.logit <- function(df, name, model, data, answers = realanswers) {
  guesses <- predict(model, newdata = data) > 0
  score <- mean(guesses == answers)
  rbind(df, data.frame(model = name, classifier = "Logit", score = score))
}
# Score each logit model against its matching feature frame.
results <- score.logit(results, "body_score", l1, test.b)
results <- score.logit(results, "body_score, inlinks", l2, test.bi)
results <- score.logit(results, "body_score, inlinks, wordcount", l3, test.biw)
results <- score.logit(results, "body_score, inlinks, wordcount, ACF/Buzzerfest/Open,", l4, test.biwt)
# Score a fitted SVM classifier and append one row to the results table.
# svm() was fit on a factor response, so predict() returns factor levels
# "True"/"False", which as.logical() maps to TRUE/FALSE.
#
# df/name/model/data as in score.logit; answers is the logical ground
# truth, defaulting to the global realanswers for compatibility with the
# existing call sites. Returns df with one scored row appended.
score.svm <- function(df, name, model, data, answers = realanswers) {
  guesses <- as.logical(predict(model, newdata = data))
  score <- mean(guesses == answers)
  rbind(df, data.frame(model = name, classifier = "SVM", score = score))
}
# SVMs (default radial kernel) over the same nested feature sets,
# then score each one against its matching feature frame.
s1 <- svm(corr ~ body_score, data=trainset)
s2 <- svm(corr ~ body_score + inlinks, data=trainset)
s3 <- svm(corr ~ body_score + inlinks + wordcount, data=trainset)
s4 <- svm(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen, data=trainset)
results <- score.svm(results, "body_score", s1, test.b)
results <- score.svm(results, "body_score, inlinks", s2, test.bi)
results <- score.svm(results, "body_score, inlinks, wordcount", s3, test.biw)
results <- score.svm(results, "body_score, inlinks, wordcount, tournaments", s4, test.biwt)
# Score a fitted rpart classification tree and append one row to the
# results table. type="class" returns predicted factor levels
# ("True"/"False"); as.character + as.logical turns them into logicals.
#
# df/name/model/data as in score.logit; answers is the logical ground
# truth, defaulting to the global realanswers for compatibility with the
# existing call sites. Returns df with one scored row appended.
score.tree <- function(df, name, model, data, answers = realanswers) {
  preds <- predict(model, newdata = data, type = "class")
  guesses <- as.logical(as.character(preds))
  score <- mean(guesses == answers)
  rbind(df, data.frame(model = name, classifier = "Decision Tree", score = score))
}
# Decision trees (method="class" forces classification) over the same
# nested feature sets; score each, then dump the results table to disk.
t1 <- rpart(corr ~ body_score, method="class", data=trainset)
t2 <- rpart(corr ~ body_score + inlinks, method="class", data=trainset)
t3 <- rpart(corr ~ body_score + inlinks + wordcount, method="class", data=trainset)
t4 <- rpart(corr ~ body_score + inlinks + wordcount + isacf + isbuzzer + isopen, method="class", data=trainset)
results <- score.tree(results, "body_score", t1, test.b)
results <- score.tree(results, "body_score, inlinks", t2, test.bi)
results <- score.tree(results, "body_score, inlinks, wordcount", t3, test.biw)
results <- score.tree(results, "body_score, inlinks, wordcount, tournaments", t4, test.biwt)
write.table(results, file="HW3.txt", row.names=FALSE)
# After a couple of iterations: SVM on body_score + inlinks + wordcount
# is the winner — either tournaments don't help, or they need refining.
# Sticking with the 1:9 partition: it works best, and the real test set
# is about 1/9 the size of the training set.
# So here we go, this is for kaggle gold.
# BUG FIX: the recomputed word count must be attached to qbtest — the
# original computed a local wc but then read the nonexistent
# qbtest$wordcount column.
qbtest$wordcount <- vapply(strsplit(qbtest$text, " "), length, integer(1))
actualtesting <- data.frame(body_score=qbtest$body_score, inlinks=qbtest$inlinks, wordcount=qbtest$wordcount)
# To make the graph: histogram of word counts (bins of 5), to be faceted
# by the True/False label on the next line.
library(ggplot2)
sp <- ggplot(qbtrain, aes(x=wordcount)) + geom_histogram(binwidth=5)
sp + facet_wrap( ~ corr, ncol = 2) + ggtitle("Wordplot Distribution vs T/F")

And the results. The best was the 1:9 partition, as below.
"model" "classifier" "score"
"MFC" "None" 0.560133630289532
"body_score" "Logit" 0.670378619153675
"body_score, inlinks" "Logit" 0.679287305122494
"body_score, inlinks, wordcount" "Logit" 0.710467706013363
"body_score, inlinks, wordcount, ACF/Buzzerfest/Open," "Logit" 0.714922048997773
"body_score" "SVM" 0.7728285077951
"body_score, inlinks" "SVM" 0.776169265033408
"body_score, inlinks, wordcount" "SVM" 0.814031180400891
"body_score, inlinks, wordcount, tournaments" "SVM" 0.799554565701559
"body_score" "Decision Tree" 0.762806236080178
"body_score, inlinks" "Decision Tree" 0.779510022271715
"body_score, inlinks, wordcount" "Decision Tree" 0.779510022271715
"body_score, inlinks, wordcount, tournaments" "Decision Tree" 0.779510022271715

2. The code to remove most punctuation from the
text field was done in Python.def cleanup():
""" Given a file of text strings, writes a new file with all non-alphabet
characters and tabs removed. In this case I have specified the file.
"""
cleaned_file = []
input_file = 'text.txt'
reading = open(input_file, 'r')
for line in reading:
newline = ' '.join(line.split())
for char in newline:
if char in '"?./;:(),\!"':
newline = newline.replace(char,'')
else:
continue
cleaned_file.append(newline)
reading.close()
with open('cleaned.txt', mode='wt') as myfile:
myfile.write('\n'.join(cleaned_file))But as I said, since the answer generation algorithm didn't have this problem, it turned out to be a totally useless feature.