#!/usr/bin/python from config import * import re from urlparse import urlparse from urllib import unquote_plus,unquote from xml.dom import minidom import os, re, sys, sgmllib, time, urllib, urlparse from xml.sax.saxutils import escape from pingback import pingback import technorati import urllib, sys, socket, time, datetime, gzip, os, re try: import GeoIP geo = GeoIP.new(GeoIP.GEOIP_STANDARD) except: geo = None regex = re.compile( LOG_FORMAT ) pingable = re.compile( PINGABLE ) trace = True # parse a html page, looking for feeds class html(sgmllib.SGMLParser): def __init__(self, url): self.feedurl = None self.urlbase = url[:url.rfind('/',8)+1].lower() self.intitle = False self.title = "" sgmllib.SGMLParser.__init__(self) try: if url: self.feed(urllib.URLopener().open(url).read()) except: pass # def start_link(self, attrs): attrs = dict(map(lambda (k,v): (k.lower(),v), attrs)) if not 'rel' in attrs: return rels = attrs['rel'].split(' ') if 'alternate' not in rels: return if not 'type' in attrs or not attrs['type'].endswith('xml'): return if 'href' in attrs: if not self.feedurl: self.feedurl = attrs['href'] # def start_a(self, attrs): if self.feedurl: return attrs = dict(map(lambda (k,v): (k.lower(),v.lower()), attrs)) if 'href' in attrs: href = attrs['href'] if self.urlbase == href[:href.rfind('/',8)+1]: if href[href.rfind('/',8)+1:] in common_feed_names: self.feedurl = href # title def do_title(self, attrs): if self.title=="": self.intitle=1 def unknown_starttag(self, tag, attrs): self.intitle=0 def unknown_endtag(self,tag): self.intitle=0 def handle_charref(self, ref): if self.intitle: self.title = self.title + ("&#%s;" % ref) def handle_data(self,text): if self.intitle: self.title = self.title + text # return the text associated with a given DOM node def text(element, tag): nodes = element.getElementsByTagName(tag) if not nodes: return "" attrs=dict(nodes[0].attributes) if 'mode' in attrs and attrs['mode'].value=='xml': return innerxml(element, 
nodes[0].namespaceURI, tag) elif 'type' in attrs and attrs['type'].value=='application/xhtml+xml': return innerxml(element, nodes[0].namespaceURI, tag) elif tag in ['content','summary'] and ('type' not in attrs or attrs['type'].value in ['xhtml','plain']): return innerxml(element, nodes[0].namespaceURI, tag) else: return "".join([getattr(child,'data','') for child in nodes[0].childNodes]) # return the innerxml associated with a given DOM node def innerxml(element, ns, tag): nodes = element.getElementsByTagNameNS(ns, tag) if not nodes: return "" value=nodes[0].toxml() return value[value.find('>')+1:value.rfind('<')] def extract(entry, base=''): attrs=dict(entry.attributes) if 'xml:base' in attrs: if base: base=urlparse.urljoin(base,attrs['xml:base'].value) else: base = attrs['xml:base'] title=text(entry,'title') ref='' alternate = None if 'rdf:about' in dict(entry.attributes): alternate=dict(entry.attributes)['rdf:about'].value if not alternate: guid=entry.getElementsByTagName('guid')[:1] if guid and guid[0].getAttribute('isPermaLink') in ('','true'): alternate=text(entry,'guid') for link in entry.getElementsByTagName('link'): attrs=dict(link.attributes) # print [(key, attrs[key].value) for key in attrs.keys()] if (not 'rel' in attrs) or attrs['rel'].value=='alternate': if (not 'type' in attrs) or attrs['type'].value.find('html')>=0: if 'href' in attrs: alternate=alternate or attrs['href'].value ref = ref or text(entry,'link') alternate = alternate or ref if alternate and base: alternate=urlparse.urljoin(base,alternate) if alternate and alternate.startswith("http://blogdex.net/route.asp?"): alternate=alternate.replace("/route.asp?","/track.asp?") if base.startswith("http://del.icio.us/") and base.find("/inbox/")<0: if base<>'http://del.icio.us/rss' and alternate.find("intertwingly")>=0: ref=alternate from md5 import md5 alternate="http://del.icio.us/url/%s" % md5(alternate).hexdigest() summary=text(entry,'summary') or text(entry,'description') 
content=(text(entry,'content') or innerxml(entry,'http://www.w3.org/1999/xhtml','body') or text(entry,'content:encoded') or summary) if ref: content = content + " " + ref if base and base.startswith("http://archipelago.phrasewise.com"): match=re.compile('\[(.*)\].*'200': continue if referer == '-': continue if referer.startswith('.'): continue if referer.find('/search?') > 0: continue if referer.find('/search?') > 0: continue if referer.find('torrez.us') > 0: continue if referer.find('bloglines.com/myblogs_display') > 0: continue if referer.find('thauvin.net') >0: continue if referer.find('diveintomark.blogspot.com')>0: continue if referer.find('bolli.homeip.net')>0: continue if referer.find('fozbaca.org/blagg')>0: continue if referer.find('/aggsome.cgi/')>0: continue if referer.find('20six.co.uk')>0: continue if referer.find('/mediajunkie.com/')>0: continue if referer.find('/treesalive.com/')>0: continue if referer.find('automated.adsensemoney.net')>0: continue if referer.find('dcostanet.net/rss')>0: continue if not referer in referers: referers.append(referer) referers.reverse() feeds = [] technorati.setLicense( TECHNORATI_KEY ) # add in technorati found links since last scan if trace: print "+++ fetch technorati" try: items=technorati.getCosmos('http://torrez.us')['inbound'] except: items=[] for item in items: rssurl=item['weblog']['rssurl'] if rssurl.find('thauvin.net')>0: continue if not rssurl: rssurl=html(item['weblog']['url']+'/').feedurl if rssurl and rssurl.find('/mediajunkie.com/')>0: continue if trace and rssurl: print rssurl if rssurl and not rssurl in feeds: feeds.append(rssurl) try: import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py timeoutsocket.setDefaultSocketTimeout(10) except ImportError: pass print "following referrers..." 
import sys
import traceback

# Stage 1: visit every referring page and collect the feed it advertises.
for refer in referers:
    if refer.startswith('http://www.google.'):
        continue
    if refer.startswith('http://xmlns.com/foaf'):
        continue
    if refer.find('8z21-7pie.blogspot.com') > 0:
        continue
    if trace:
        print(refer)
    try:
        feedurl = html(refer).feedurl
        # Check for "no feed found" first, so that None never reaches
        # .startswith() and the skip no longer depends on a swallowed
        # AttributeError.
        if not feedurl:
            continue
        if feedurl.startswith('feed://'):
            feedurl = "http" + feedurl[4:]
        # resolve relative urls
        feedurl = urlparse.urljoin(refer, feedurl)
        if feedurl not in feeds:
            feeds.append(feedurl)
    except Exception:
        # Best effort: a referrer that cannot be processed is skipped.
        pass

print("fetching feeds...")

# Stage 2: read each feed and, for every entry whose content links to a
# pingable target that does not already link back to the entry, send a
# pingback from the entry to that target.
for url in feeds:
    if trace:
        print(url)
    try:
        response = urllib.urlopen(url)
        try:
            feed = minidom.parseString(response.read())
        finally:
            response.close()
        blog = text(feed, 'title') or html(text(feed, 'link')).title

        attrs = dict(feed.documentElement.attributes)
        if 'xml:base' in attrs:
            base = urlparse.urljoin(url, attrs['xml:base'].value)
        else:
            base = url

        # 'entry' elements are preferred; 'item' elements are the fallback.
        entries = feed.getElementsByTagName('entry')
        entries = entries or feed.getElementsByTagName('item')

        for entry in entries:
            # Reset per entry so the diagnostic output below never reports
            # values left over from a previous entry and never hits an
            # unbound name when extract() itself fails.
            title = alternate = content = None
            try:
                (title, alternate, summary, content) = extract(entry, base)
                if not alternate:
                    continue
                # >= 0: a permalink that *starts* with the site prefix is at
                # offset 0, which the previous "> 0" test could never match.
                if alternate.find("http://torrez.us") >= 0:
                    continue
                for target in pingable.findall(content):
                    print(target)
                    print(alternate)
                    # Does not depend on the target page, so test it before
                    # paying for the download.
                    if pingable.match(alternate):
                        continue
                    remote = urllib.urlopen(target.split('#')[0])
                    try:
                        page = parser()
                        page.feed(remote.read())
                    finally:
                        remote.close()
                    # The target already links to this entry: nothing to do.
                    if alternate in page.hrefs:
                        continue
                    pingback(alternate, target)
            except Exception:
                # One bad entry must not stop the remaining entries.
                if trace:
                    print("".join(traceback.format_exception(*sys.exc_info())))
                    print(url)
                    print(title)
                    print(alternate)
                    print(content)
                    print("")
    except Exception:
        # One bad feed must not stop the remaining feeds.
        if trace:
            print("".join(traceback.format_exception(*sys.exc_info())))
            print(url)
            print("")