"""Train and pickle classifier using pre-classified headlines in the database."""

import pickle
import nltk
import psycopg2
import string
from xml.dom import minidom


# Connect to the local Postgres instance holding the manually labelled headlines.
conn = psycopg2.connect("dbname=daniel user=daniel")
cur = conn.cursor()

# Fetch only rows that carry a manual sentiment label.
# NOTE(review): "negetive" is misspelled; presumably the stored labels use the
# same misspelling -- confirm against the data before changing it here.
cur.execute("select headline, sentiment from headlines where sentiment = %s or sentiment = %s",("positive","negetive"))

headlines = cur.fetchall()

#print headlines

# Tokenize each headline: lowercase every word and drop tokens shorter
# than 3 characters (cheap stop-word/noise filter).
hlines = []
for (words, sentiment) in headlines:
	words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
	hlines.append((words_filtered, sentiment))


def get_words_in_headlines(hlines):
	"""Return a flat list of every token across all (words, sentiment) pairs.

	Duplicates are kept on purpose so the caller can build a frequency
	distribution from the result.
	"""
	# The original kept an unused `last_word` accumulator; removed.
	return [word for (words, _sentiment) in hlines for word in words]

def get_word_features(wordlist):
	"""Return the vocabulary (unique words) of wordlist.

	NOTE(review): the return type depends on the installed nltk -- under
	Python 2 era nltk, FreqDist.keys() is a list ordered by decreasing
	frequency; under nltk 3 it is an unordered dict view. Code below calls
	.remove() on the result, which assumes the list form -- confirm.
	"""
	wordlist = nltk.FreqDist(wordlist)
	word_features = wordlist.keys()
	return word_features

def extract_features(document, vocabulary=None):
	"""Map a token list to a bag-of-words feature dict for the classifier.

	document -- list of tokens from one headline.
	vocabulary -- iterable of feature words; defaults to the module-level
	    word_features, keeping the original one-argument call compatible
	    (nltk's apply_features calls this with a single argument).

	Returns a dict {'contains(word)': bool} for every word in vocabulary.
	"""
	if vocabulary is None:
		vocabulary = word_features
	document_words = set(document)
	features = {}
	for word in vocabulary:
		features['contains(%s)' % word] = (word in document_words)
	return features


# Build the global vocabulary consumed by extract_features. Force a list so
# the exclusion filter below works regardless of whether get_word_features
# returned a list (old nltk) or a dict view (nltk 3, which has no .remove()).
word_features = list(get_word_features(get_words_in_headlines(hlines)))

#avoid having these words influence sentiment.
excludedWords = ["facebook", "uber", "amazon"
                , "samsung", "ivanka","gorsuch"
                ,"asian","north","french","snap's"
                ,"melania","executive","delaware"
                ,"ceo","apple's","apple","travel"
                ,"trump's","president","white","iphone","trump"]

# Filter instead of calling list.remove in a loop: O(1) membership via a set
# and no dependency on the mutability of word_features.
_excluded = set(excludedWords)
word_features = [w for w in word_features if w not in _excluded]

training_set = nltk.classify.apply_features(extract_features, hlines)

classifier = nltk.NaiveBayesClassifier.train(training_set)

print classifier.show_most_informative_features(50)

output = open('classifier.pkl', 'wb')
pickle.dump(classifier, output)
output.close()
print "classifier is pickled!"



# Smoke-test the classifier: print the predicted label and P(positive) for
# one sample headline from each distinct feed.
cur.execute("select distinct feed from headlines")
for thing in cur.fetchall():
	cur.execute("select headline from headlines where feed = %s", thing)
	for headline in cur.fetchall():
		dist = classifier.prob_classify(extract_features(headline[0].split()))
		print classifier.classify(extract_features(headline[0].split())), dist.prob("positive"), " - ", headline[0]
		break  # only the first headline per feed -- presumably a spot check

#for label in dist.samples():
#	print ("%s: %f" % (label, dist.prob(label)))

# Nothing was written in this run, but commit before closing anyway.
conn.commit()
cur.close()
conn.close()