#!/usr/bin/python
from config import *
import re
from urlparse import urlparse
from urllib import unquote_plus,unquote
from xml.dom import minidom
import os, re, sys, sgmllib, time, urllib, urlparse
from xml.sax.saxutils import escape
from pingback import pingback
import technorati
import urllib, sys, socket, time, datetime, gzip, os, re
# Optional GeoIP support: when the module cannot be imported or initialised,
# `geo` is left as None.
try:
    import GeoIP
    geo = GeoIP.new(GeoIP.GEOIP_STANDARD)
except Exception:
    # Still broad on purpose (GeoIP.new may fail for reasons other than a
    # missing module), but no longer a bare except, so Ctrl-C and SystemExit
    # propagate on Python 2.5+.
    geo = None

# LOG_FORMAT and PINGABLE are not defined in this file; presumably they are
# supplied by the star-import from config -- confirm there.
regex = re.compile(LOG_FORMAT)
pingable = re.compile(PINGABLE)

# When true, the script prints progress lines and tracebacks.
trace = True
# parse a html page, looking for feeds
class html(sgmllib.SGMLParser):
    """Fetch an HTML page and record its feed URL and its title.

    After construction:

    * ``feedurl`` is the href of the first ``<link>`` whose ``rel`` contains
      ``alternate`` and whose ``type`` ends in ``xml``; failing that, the href
      of the first ``<a>`` that points into the page's own directory and whose
      file name is listed in ``common_feed_names``; otherwise ``None``.
    * ``title`` is the text collected from the page's first ``<title>``.

    Fetching is best effort: any error while downloading or parsing leaves
    whatever was collected so far.
    """

    def __init__(self, url):
        """Parse the page at *url*; a false *url* yields an empty result."""
        self.feedurl = None
        # Directory part of the url, lowercased for the comparison in
        # start_a.  Starting the search at index 8 skips the slashes of the
        # scheme prefix.  Guarded so that url=None no longer raises.
        if url:
            self.urlbase = url[:url.rfind('/', 8) + 1].lower()
        else:
            self.urlbase = ""
        self.intitle = False
        self.title = ""
        sgmllib.SGMLParser.__init__(self)
        if not url:
            return
        try:
            page = urllib.URLopener().open(url)
            try:
                self.feed(page.read())
            finally:
                # always release the connection, even if parsing fails
                page.close()
        except Exception:
            # deliberate best effort: an unreachable or malformed page
            # simply produces no feedurl/title
            pass

    #
    def start_link(self, attrs):
        """Feed autodiscovery via <link rel="alternate" type="...xml">."""
        attrs = dict([(k.lower(), v) for k, v in attrs])
        if 'alternate' not in attrs.get('rel', '').split(' '):
            return
        if not attrs.get('type', '').endswith('xml'):
            return
        # first match wins
        if 'href' in attrs and not self.feedurl:
            self.feedurl = attrs['href']

    #
    def start_a(self, attrs):
        """Fallback: an anchor in the same directory with a known feed name."""
        if self.feedurl:
            return
        attrs = dict([(k.lower(), v) for k, v in attrs])
        if 'href' not in attrs:
            return
        href = attrs['href']
        # Compare case-insensitively, but keep the href exactly as written:
        # URL paths are case-sensitive, so storing the lowercased form could
        # point at a different (or missing) resource.
        folded = href.lower()
        cut = folded.rfind('/', 8) + 1
        if self.urlbase == folded[:cut] and folded[cut:] in common_feed_names:
            self.feedurl = href

    # title
    def do_title(self, attrs):
        """Start collecting title text, but only for the first <title>."""
        if self.title == "":
            self.intitle = True

    def unknown_starttag(self, tag, attrs):
        """Any other start tag ends title collection."""
        self.intitle = False

    def unknown_endtag(self, tag):
        """Any unhandled end tag (including </title>) ends title collection."""
        self.intitle = False

    def handle_charref(self, ref):
        """Keep numeric character references in the title as references."""
        # sgmllib passes only the reference body (e.g. "8217"); re-wrap it so
        # the title holds "&#8217;" rather than the stray text "8217;".
        if self.intitle:
            self.title = self.title + ("&#%s;" % ref)

    def handle_data(self, text):
        """Append character data to the title while inside <title>."""
        if self.intitle:
            self.title = self.title + text
# return the text associated with a given DOM node
def text(element, tag):
    """Return the content of the first *tag* descendant of *element*.

    Returns "" when no such descendant exists.  The inner XML markup is
    returned (via innerxml) when the node has mode="xml", has
    type="application/xhtml+xml", or is a content/summary node whose type is
    absent, "xhtml" or "plain".  Otherwise the ``data`` of the node's direct
    children is concatenated.
    """
    matches = element.getElementsByTagName(tag)
    if not matches:
        return ""
    node = matches[0]

    type_given = node.hasAttribute('type')
    type_value = node.getAttribute('type')

    wants_markup = node.getAttribute('mode') == 'xml'
    if not wants_markup:
        wants_markup = type_value == 'application/xhtml+xml'
    if not wants_markup and tag in ('content', 'summary'):
        wants_markup = (not type_given) or type_value in ('xhtml', 'plain')

    if wants_markup:
        return innerxml(element, node.namespaceURI, tag)

    # children without a ``data`` attribute (e.g. elements) contribute nothing
    pieces = []
    for child in node.childNodes:
        pieces.append(getattr(child, 'data', ''))
    return "".join(pieces)
# return the innerxml associated with a given DOM node
def innerxml(element, ns, tag):
    """Return the serialized markup inside the first *ns*:*tag* descendant.

    The node is serialized with toxml() and everything up to the end of the
    opening tag and from the start of the closing tag is stripped.  Returns
    "" when no matching node exists.
    """
    matches = element.getElementsByTagNameNS(ns, tag)
    if not matches:
        return ""
    markup = matches[0].toxml()
    body_start = markup.find('>') + 1   # just past the opening tag
    body_end = markup.rfind('<')        # start of the closing tag
    return markup[body_start:body_end]
def extract(entry, base=''):
attrs=dict(entry.attributes)
if 'xml:base' in attrs:
if base:
base=urlparse.urljoin(base,attrs['xml:base'].value)
else:
base = attrs['xml:base']
title=text(entry,'title')
ref=''
alternate = None
if 'rdf:about' in dict(entry.attributes):
alternate=dict(entry.attributes)['rdf:about'].value
if not alternate:
guid=entry.getElementsByTagName('guid')[:1]
if guid and guid[0].getAttribute('isPermaLink') in ('','true'):
alternate=text(entry,'guid')
for link in entry.getElementsByTagName('link'):
attrs=dict(link.attributes)
# print [(key, attrs[key].value) for key in attrs.keys()]
if (not 'rel' in attrs) or attrs['rel'].value=='alternate':
if (not 'type' in attrs) or attrs['type'].value.find('html')>=0:
if 'href' in attrs: alternate=alternate or attrs['href'].value
ref = ref or text(entry,'link')
alternate = alternate or ref
if alternate and base: alternate=urlparse.urljoin(base,alternate)
if alternate and alternate.startswith("http://blogdex.net/route.asp?"):
alternate=alternate.replace("/route.asp?","/track.asp?")
if base.startswith("http://del.icio.us/") and base.find("/inbox/")<0:
if base<>'http://del.icio.us/rss' and alternate.find("intertwingly")>=0:
ref=alternate
from md5 import md5
alternate="http://del.icio.us/url/%s" % md5(alternate).hexdigest()
summary=text(entry,'summary') or text(entry,'description')
content=(text(entry,'content') or
innerxml(entry,'http://www.w3.org/1999/xhtml','body') or
text(entry,'content:encoded') or summary)
if ref: content = content + " " + ref
if base and base.startswith("http://archipelago.phrasewise.com"):
match=re.compile('\[(.*)\].*'200': continue
if referer == '-': continue
if referer.startswith('.'): continue
if referer.find('/search?') > 0: continue
if referer.find('/search?') > 0: continue
if referer.find('torrez.us') > 0: continue
if referer.find('bloglines.com/myblogs_display') > 0: continue
if referer.find('thauvin.net') >0: continue
if referer.find('diveintomark.blogspot.com')>0: continue
if referer.find('bolli.homeip.net')>0: continue
if referer.find('fozbaca.org/blagg')>0: continue
if referer.find('/aggsome.cgi/')>0: continue
if referer.find('20six.co.uk')>0: continue
if referer.find('/mediajunkie.com/')>0: continue
if referer.find('/treesalive.com/')>0: continue
if referer.find('automated.adsensemoney.net')>0: continue
if referer.find('dcostanet.net/rss')>0: continue
if not referer in referers:
referers.append(referer)
referers.reverse()
# Feed URLs to scan; filled here from technorati and later from referrers.
feeds = []
technorati.setLicense(TECHNORATI_KEY)
# add in technorati found links since last scan
if trace: print("+++ fetch technorati")
try:
    items = technorati.getCosmos('http://torrez.us')['inbound']
except Exception:
    # best effort: a failed technorati lookup just contributes no feeds
    items = []
for item in items:
    rssurl = item['weblog']['rssurl']
    # Guard the blacklist test: rssurl may be missing, and the autodiscovery
    # fallback below must get a chance to run instead of the loop crashing
    # on None.find().
    if rssurl and rssurl.find('thauvin.net') > 0: continue
    # no advertised feed: try autodiscovery on the weblog's home page
    if not rssurl: rssurl = html(item['weblog']['url'] + '/').feedurl
    if rssurl and rssurl.find('/mediajunkie.com/') > 0: continue
    if trace and rssurl: print(rssurl)
    if rssurl and rssurl not in feeds: feeds.append(rssurl)
# Optional: cap socket waits at 10 seconds when timeoutsocket is installed.
try:
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    pass
print("following referrers...")
# Visit each referring page and add any feed it advertises to `feeds`.
for refer in referers:
    if refer.startswith('http://www.google.'): continue
    if refer.startswith('http://xmlns.com/foaf'): continue
    if refer.find('8z21-7pie.blogspot.com') > 0: continue
    if trace: print(refer)
    try:
        feedurl = html(refer).feedurl
        # Check for "no feed found" before touching the value; previously a
        # None feedurl raised AttributeError that the bare except swallowed.
        if not feedurl: continue
        # feed:// is an alias for http://
        if feedurl.startswith('feed://'): feedurl = "http" + feedurl[4:]
        # resolve relative urls
        feedurl = urlparse.urljoin(refer, feedurl)
        if feedurl not in feeds:
            feeds.append(feedurl)
    except Exception:
        # best effort per referrer, but make real failures visible when
        # tracing instead of hiding them
        if trace:
            import traceback
            print("".join(traceback.format_exception(*sys.exc_info())))
print("fetching feeds...")
# For every collected feed, look at each entry and send a pingback for every
# pingable link in its content, unless the target page already links back to
# the entry.
for url in feeds:
    if trace: print(url)
    try:
        response = urllib.urlopen(url)
        try:
            feed = minidom.parseString(response.read())
        finally:
            response.close()
        blog = text(feed, 'title') or html(text(feed, 'link')).title
        attrs = dict(feed.documentElement.attributes)
        if 'xml:base' in attrs:
            base = urlparse.urljoin(url, attrs['xml:base'].value)
        else:
            base = url
        # Atom uses <entry>, RSS uses <item>
        entries = feed.getElementsByTagName('entry')
        entries = entries or feed.getElementsByTagName('item')
        for entry in entries:
            # Pre-bind the names the error report prints, so a failure inside
            # extract() cannot turn into a NameError (or show values left
            # over from the previous entry) and abort the rest of the feed.
            title = alternate = content = None
            try:
                (title, alternate, summary, content) = extract(entry, base)
                if not alternate: continue
                # Skip entries that live on our own site.  A containment test
                # is needed: find() returns 0 for a URL that *starts* with
                # the prefix, which the old "> 0" comparison never matched.
                if "http://torrez.us" in alternate: continue
                for target in pingable.findall(content):
                    print(target)
                    print(alternate)
                    # NOTE(review): parser is not defined in the visible part
                    # of this file; it is expected to expose `hrefs` after
                    # feed() -- confirm.
                    remote = urllib.urlopen(target.split('#')[0])
                    try:
                        page = parser()
                        page.feed(remote.read())
                    finally:
                        remote.close()
                    # target already links to the entry: nothing to announce
                    if alternate in page.hrefs:
                        continue
                    if pingable.match(alternate): continue
                    pingback(alternate, target)
            except Exception:
                # one bad entry must not stop the remaining entries
                if trace:
                    import traceback
                    print("".join(traceback.format_exception(*sys.exc_info())))
                    print(url)
                    print(title)
                    print(alternate)
                    print(content)
                    print("")
    except Exception:
        # one bad feed must not stop the remaining feeds
        if trace:
            import traceback
            print("".join(traceback.format_exception(*sys.exc_info())))
            print(url)
            print("")