#!/usr/bin/env python3
"""Fetch RSS feeds listed in ``feedlist.dat``, archive the raw feed and each
linked article under ``feeds/``, and insert every new headline into the
``headlines`` PostgreSQL table (deduplicated by URL).

Ported from Python 2 (urllib2/httplib/print-statements) to Python 3.
"""

import datetime
import time  # kept for the optional throttling sleep in main()
import urllib.request
import urllib.error
from xml.dom import minidom

import psycopg2

# Module-level connection shared by all feedDownloader instances; closed in main().
conn = psycopg2.connect("dbname=daniel user=daniel")

# Hour-resolution timestamp used both in archived file names and in the
# ``time`` column of ``headlines``.
dateToday = datetime.datetime.today().strftime('%Y%m%d%H')


def loadFeedURLs(fileName):
    """Parse a ``name;url`` file into a ``{feedname: url}`` dict.

    Sample file contents::

        GoogleNewsBusiness;https://news.google.com/news/section?cf=all&pz=1&ned=us&topic=b&output=rss
        GoogleNewsStock;https://news.google.com/news/section?cf=all&pz=1&ned=us&topic=stock&output=rss
    """
    record = {}
    with open(fileName, "r") as datafile:
        for line in datafile:
            line = line.strip()
            if not line:
                continue
            # Split on the FIRST ';' only, so URLs containing ';' survive.
            # The original split on every ';' and also kept the trailing
            # newline on the URL (bug fix).
            name, _, address = line.partition(';')
            record[name] = address
    return record


# Kept as a class in case we want multiple instances running in threads.
class feedDownloader:
    """Downloads RSS feeds, archives them, and records new headlines."""

    def __init__(self, feeddata):
        self.data = []
        self.feedinfo = feeddata  # {feedname: feed URL}

    def feeddata(self, feedname):
        """Return the URL registered for *feedname* (KeyError if unknown)."""
        print(self.feedinfo)
        self.feedaddress = self.feedinfo[feedname]
        return self.feedaddress

    def links(self, feedname):
        """Download the feed, archive it, store new headlines in the DB,
        and fetch each headline's article. Returns a title/URL text blob."""
        print(feedname)
        address = self.feeddata(feedname)

        # Archive the raw feed document.
        self.file_feed = self.downloadPage(address)
        self.writeFile('feeds/' + feedname + '-' + dateToday + '.rss',
                       self.file_feed)

        self.file_xml = minidom.parseString(self.file_feed)
        self.item_node = self.file_xml.getElementsByTagName("item")
        self.linkdata = ""
        for idx, item in enumerate(self.item_node):
            # Look up <title>/<link> by tag name instead of positional
            # childNodes[0]/[1], which breaks on whitespace text nodes.
            titles = item.getElementsByTagName("title")
            links = item.getElementsByTagName("link")
            if not titles or not links:
                continue  # malformed item; skip rather than crash
            ftitle = titles[0].firstChild.data
            raw_link = links[0].firstChild.data
            # Google News wraps the real URL as "...url=<target>"; fall back
            # to the raw link when the wrapper is absent (was IndexError).
            flink = raw_link.rsplit("url=", 1)[1] if "url=" in raw_link else raw_link

            self.linkdata += ftitle + "\n" + flink + "\n"

            # Titles look like "Headline - Source"; tolerate a missing source.
            parts = ftitle.split(" - ")
            headline = parts[0]
            source = parts[1] if len(parts) > 1 else ""

            # Insert only headlines whose URL we have not seen before.
            with conn.cursor() as cur:
                cur.execute("select count(*) from headlines where url = %s",
                            (flink,))
                if cur.fetchone()[0] == 0:
                    cur.execute(
                        "INSERT INTO headlines "
                        "(feed, time, number, headline, url, source) "
                        "VALUES (%s, %s, %s, %s, %s, %s)",
                        (feedname, dateToday, idx, headline, flink, source))
                    conn.commit()

            # Article download is best-effort: log and continue on failure
            # instead of the old bare ``except: pass``.
            try:
                article = self.downloadPage(flink)
                self.writeFile('feeds/articles/' + feedname + '-' + dateToday
                               + '-' + str(idx) + '.txt', article)
            except (urllib.error.URLError, OSError) as exc:
                print("article download failed for %s: %s" % (flink, exc))
        return self.linkdata

    def image(self, feedname):
        # NOTE(review): ``imginfo`` is not defined anywhere in this file —
        # calling this raises NameError. Presumably an image-URL map parallel
        # to ``self.feedinfo``; confirm intent before wiring it up.
        image_address = imginfo[feedname]
        return image_address

    def downloadPage(self, url):
        """Fetch *url* and return the response body as bytes."""
        file_request = urllib.request.Request(url)
        file_opener = urllib.request.build_opener()
        with file_opener.open(file_request) as response:
            return response.read()

    def writeFile(self, name, contents):
        """Write *contents* (bytes) to *name*, closing the file reliably."""
        with open(name, 'wb') as output_file:
            output_file.write(contents)


def main():
    feeddata = loadFeedURLs("feedlist.dat")
    feed = feedDownloader(feeddata)
    for name in feeddata:
        print(name)
        body = feed.links(name)
        print(body)
        # time.sleep(5)  # optional politeness throttle between feeds
    conn.close()


if __name__ == "__main__":
    main()