import pickle
import string

import nltk
import psycopg2
from xml.dom import minidom

# Pull labelled headlines from Postgres.
# NOTE(review): the label literal "negetive" is misspelled but presumably
# matches what is actually stored in the `sentiment` column -- confirm
# against the DB before "fixing" the spelling here.
conn = psycopg2.connect("dbname=daniel user=daniel")
cur = conn.cursor()
cur.execute(
    "select headline, sentiment from headlines"
    " where sentiment = %s or sentiment = %s",
    ("positive", "negetive"),
)
headlines = cur.fetchall()

# Tokenize: lowercase each word and drop tokens shorter than 3 characters.
hlines = []
for (words, sentiment) in headlines:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    hlines.append((words_filtered, sentiment))


def get_words_in_headlines(hlines):
    """Flatten the (token_list, sentiment) pairs into one list of all tokens."""
    all_words = []
    for (words, sentiment) in hlines:
        all_words.extend(words)
    return all_words


def get_word_features(wordlist):
    """Return the vocabulary (unique tokens), most frequent first."""
    # list() so the result supports remove()/filtering on Python 3 too,
    # where FreqDist.keys() is a view rather than a list.
    return list(nltk.FreqDist(wordlist).keys())


def extract_features(document):
    """Map a token list to NLTK's {'contains(word)': bool} feature dict.

    Reads the module-level `word_features` vocabulary.
    """
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


word_features = get_word_features(get_words_in_headlines(hlines))

# Avoid having these (entity/topic rather than sentiment) words influence
# the classifier.
excludedWords = ["facebook", "uber", "amazon", "samsung", "ivanka", "gorsuch",
                 "asian", "north", "french", "snap's", "melania", "executive",
                 "delaware", "ceo", "apple's", "apple", "travel", "trump's",
                 "president", "white", "iphone", "trump"]
word_features = [w for w in word_features if w not in excludedWords]

training_set = nltk.classify.apply_features(extract_features, hlines)
classifier = nltk.NaiveBayesClassifier.train(training_set)
# show_most_informative_features prints its table itself and returns None,
# so it is called bare (wrapping it in print added a stray "None" line).
classifier.show_most_informative_features(50)

# Persist the trained model; `with` guarantees the file handle is closed
# even if pickling raises.
with open('classifier.pkl', 'wb') as output:
    pickle.dump(classifier, output)
print("classifier is pickled!")
# Spot-check the trained classifier: for each distinct feed, print the
# predicted label and P(positive) for that feed's first headline.
cur.execute("select distinct feed from headlines")
for feed_row in cur.fetchall():
    cur.execute("select headline from headlines where feed = %s", feed_row)
    for headline in cur.fetchall():
        features = extract_features(headline[0].split())
        # prob_classify yields the full label distribution; its argmax
        # (dist.max()) is exactly what classify() would return, so the
        # original's second, redundant classification pass is dropped.
        dist = classifier.prob_classify(features)
        print(dist.max(), dist.prob("positive"), " - ", headline[0])
        break  # only sample the first headline per feed

conn.commit()
cur.close()
conn.close()