"""Script to download training data from RSS and store headlines in database."""


import psycopg2
import re
import string
import sys
import httplib
import urllib2
import datetime
import time
from xml.dom import minidom

# Connect to the postgreSQL DB (global connection shared by feedDownloader).
conn = psycopg2.connect("dbname=daniel user=daniel")

feedname = ''
# Timestamp tag (e.g. '2024010113') used in DB rows and feed/article filenames.
# Original line was truncated to "'%Y%m%d%H')" -- reconstructed with time.strftime.
dateToday = time.strftime('%Y%m%d%H')

# Get list of feeds from a file, maybe multiple files...
def loadFeedURLs(fileName):
    """Parse a feed-list file into a {feedname: url} dict.

    Each line is expected to look like ``name;http://example.com/rss``.
    Fixes vs. original: uses str.split (string.split is removed in
    modern Python), closes the file via a context manager, strips the
    trailing newline that was previously left on every URL, and skips
    blank lines instead of crashing on them.
    """
    record = {}
    with open(fileName, "r") as datafile:
        for line in datafile:
            if not line.strip():
                continue  # tolerate blank/trailing lines
            data = line.split(';')
            record[data[0]] = data[1].strip()
    return record

#lets make this an object incase we want multiples to put in threads.
class feedDownloader:

    def __init__(self, feeddata): = []
        self.feedinfo = feeddata

    def feeddata (self, feedname):
        print self.feedinfo
        self.feedaddress = self.feedinfo[feedname]
        return self.feedaddress

    def links (self, feedname):
        print feedname
        address = self.feeddata(feedname)
        self.file_feed = self.downloadPage(address)
	# this is the feed files.
        self.writeFile('feeds/'+feedname+'-'+dateToday+'.rss', self.file_feed)
        self.file_xml = minidom.parseString(self.file_feed)

        self.item_node = self.file_xml.getElementsByTagName("item")

        self.linkdata = ""

        for idx,item in enumerate(self.item_node):
            title = item.childNodes[0]
            link = item.childNodes[1]

            ftitle =
            flink ="url=")[1]

            self.linkdata = self.linkdata + ftitle + "\n"
            self.linkdata = self.linkdata + flink + "\n"

            #create db cursor using global connection.
            self.cur = conn.cursor()
            self.cur.execute("select count(*) from headlines where url = %s",(flink,))
            if (self.cur.fetchone()[0] == 0):
                #print feed, time, idx, ftitle, flink
                self.cur.execute("INSERT INTO headlines (feed, time, number, headline, url, source) VALUES (%s, %s, %s, %s, %s, %s)", (feedname, dateToday, idx, ftitle.split(" - ")[0], flink, ftitle.split(" - ")[1]))
                article = self.downloadPage(flink)
		#this is a specific article
                self.writeFile('feeds/articles/'+feedname+'-'+dateToday+'-'+str(idx)+'.txt', article)

        return self.linkdata

    def image (self, feedname):
        image_address = imginfo[feedname]
        return image_address

    def downloadPage (self, url):
        file_request = urllib2.Request(url)
        file_opener = urllib2.build_opener()
        file_contents =
        return file_contents

    def writeFile (self, name, contents):
        output_file = open(name, 'w')

def main():
    """Download and process every feed listed in feedlist.dat.

    Unused locals (`i`, `body`) from the original removed -- links() is
    called purely for its side effects (DB inserts and files on disk).
    """
    feeddata = loadFeedURLs("feedlist.dat")
    feed = feedDownloader(feeddata)
    for name in feeddata:
        feed.links(name)


if __name__ == "__main__":
    # Original guard had no body (SyntaxError) -- run the script entry point.
    main()