Script to download training data from RSS feeds and store the headlines in a database.

#!/usr/bin/python

import psycopg2
import urllib2
import datetime
import time
from xml.dom import minidom


# Connect to the PostgreSQL database (shared by everything below).
conn = psycopg2.connect("dbname=daniel user=daniel")

# Timestamp used to tag this run's output files and database rows (YYYYMMDDHH).
dateToday = datetime.datetime.today().strftime('%Y%m%d%H')

# Load the list of feeds from a file; one "name;url" pair per line.
def loadFeedURLs(fileName):
    # Sample file contents:
    # GoogleNewsBusiness;https://news.google.com/news/section?cf=all&pz=1&ned=us&topic=b&output=rss
    # GoogleNewsStock;https://news.google.com/news/section?cf=all&pz=1&ned=us&topic=stock&output=rss
    record = {}
    datafile = open(fileName, "r")
    for line in datafile:
        # Strip the trailing newline so the URL is usable as-is.
        line = line.strip()
        if not line:
            continue
        feedname, address = line.split(';', 1)
        record[feedname] = address
    datafile.close()
    return record

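# For the sample file above, loadFeedURLs returns a dict of feed name -> URL,
# e.g. {'GoogleNewsBusiness': 'https://news.google.com/news/section?...'}.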

# Make this a class in case we want multiple instances running in threads
# (a sketch of threaded usage follows the class below).
class feedDownloader:

    def __init__(self, feeddata):
        self.data = []
        self.feedinfo = feeddata

    # Look up the RSS address for a named feed.
    def feeddata(self, feedname):
        self.feedaddress = self.feedinfo[feedname]
        return self.feedaddress

    # Download a feed, save the raw RSS, then save each linked article and
    # record its headline in the database.
    def links(self, feedname):
        print feedname
        address = self.feeddata(feedname)
        self.file_feed = self.downloadPage(address)
        # Save the raw feed file for this run.
        self.writeFile('feeds/' + feedname + '-' + dateToday + '.rss', self.file_feed)
        self.file_xml = minidom.parseString(self.file_feed)

        self.item_node = self.file_xml.getElementsByTagName("item")

        self.linkdata = ""

        for idx, item in enumerate(self.item_node):
            # Look the elements up by tag name; indexing childNodes directly
            # breaks when whitespace text nodes are present.
            title = item.getElementsByTagName("title")[0]
            link = item.getElementsByTagName("link")[0]

            ftitle = title.firstChild.data
            # Google News links are redirects; the real target follows "url=".
            flink = link.firstChild.data.rsplit("url=")[1]

            self.linkdata = self.linkdata + ftitle + "\n"
            self.linkdata = self.linkdata + flink + "\n"

            # Create a db cursor using the global connection.
            self.cur = conn.cursor()

            self.cur.execute("select count(*) from headlines where url = %s", (flink,))
            if self.cur.fetchone()[0] == 0:
                # Google News titles look like "Headline - Source".
                parts = ftitle.split(" - ")
                source = parts[1] if len(parts) > 1 else ''
                self.cur.execute("INSERT INTO headlines (feed, time, number, headline, url, source) VALUES (%s, %s, %s, %s, %s, %s)",
                                 (feedname, dateToday, idx, parts[0], flink, source))
                conn.commit()
            # Close the cursor whether or not a row was inserted.
            self.cur.close()
            try:
                article = self.downloadPage(flink)
                # This is a specific article.
                self.writeFile('feeds/articles/' + feedname + '-' + dateToday + '-' + str(idx) + '.txt', article)
            except Exception:
                # Some articles fail to download or save; skip them.
                pass

        return self.linkdata

    # NOTE: relies on a global 'imginfo' mapping (feed name -> image URL)
    # that is not defined in this script; unused by main() below.
    def image(self, feedname):
        image_address = imginfo[feedname]
        return image_address

    # Fetch a URL and return the raw response body.
    def downloadPage(self, url):
        file_request = urllib2.Request(url)
        file_opener = urllib2.build_opener()
        file_contents = file_opener.open(file_request).read()
        return file_contents

    # Write contents to a file, overwriting anything already there.
    def writeFile(self, name, contents):
        output_file = open(name, 'w')
        output_file.write(contents)
        output_file.close()
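
# The class comment above mentions running multiple downloaders in threads.
# A minimal sketch of that usage, assuming one thread per feed; main_threaded
# is illustrative and is not called below. psycopg2 connections may be shared
# across threads (cursors may not), and links() opens its own cursor per item,
# so sharing the global conn is workable here.
def main_threaded():
    import threading
    feeddata = loadFeedURLs("feedlist.dat")
    threads = []
    for name in feeddata:
        downloader = feedDownloader(feeddata)
        t = threading.Thread(target=downloader.links, args=(name,))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()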

def main():
    feeddata = loadFeedURLs("feedlist.dat")
    feed = feedDownloader(feeddata)

    for x in feeddata:
        print(x)
        body = feed.links(x)
        print(body)
        #time.sleep(5)  # optional pause between feeds

    conn.close()

if __name__ == "__main__":
    main()
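
# The INSERT in links() assumes a 'headlines' table. The real schema isn't
# part of this script; a sketch consistent with the columns used above, where
# the 'time' column holds the YYYYMMDDHH string from dateToday:
#
#   CREATE TABLE headlines (
#       feed     text,
#       time     text,
#       number   integer,
#       headline text,
#       url      text,
#       source   text
#   );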