#!/usr/bin/env python
"""Collate web-o-trust files into a DNSBL.

Starting from one URL, fetch and parse web-o-trust files, follow their
include: fields breadth-first within the configured trust depth, and keep
the parsed results in a pickle cache (collate.data) between runs.
"""

import getopt
import os
import pickle
import re
import string  # no longer used below; kept so nothing else loses the name
import sys
import time
import urllib  # no longer used below; kept so nothing else loses the name

# right now, fetched and omit are in-memory databases.
# easy to switch them to dbm files though.
# NOTE: pickle can run arbitrary code while loading.  collate.data must only
# ever be a file written by this program, in a directory nobody else can
# write to.
if os.path.exists('collate.data'):
    # pickles are binary data, so the file has to be opened in binary mode.
    with open('collate.data', 'rb') as inf:
        fetched = pickle.load(inf)
        omit = pickle.load(inf)
else:
    fetched = {}
    omit = {}

# remember when we started.
now = time.time()


def _ip_problem(value, line):
    """Return a log message if the ip: value is unusable, else None.

    value is the text after "ip:" (an address with an optional /width);
    line is the whole input line, used only for the message text.
    """
    if ':' in value:
        return "We don't support IPv6 in:\n " + line
    ipfields = value.split('/')
    if len(ipfields) == 1:
        # no width given: a single host.
        ipfields.append('32')
    if not re.match(r'\d+$', ipfields[1]):
        return "ip subnet width is not a number in:\n " + line
    subnet = int(ipfields[1])
    if subnet > 32:
        # a wider width would make the mask shift below negative.
        return "ip subnet width is greater than 32 in:\n " + line
    octets = ipfields[0].split('.')
    if len(octets) > 4:
        # a fifth octet would need a negative shift.
        return "invalid IPv4 address:\n %s" % value
    ipaddr = 0
    octetweight = 24
    for octet in octets:
        # an octet above 255 would spill into the neighbouring octet.
        if not re.match(r'\d+$', octet) or int(octet) > 255:
            return "invalid IPv4 address:\n %s" % value
        ipaddr |= int(octet) << octetweight
        octetweight -= 8
    mask = -1 << (32 - subnet)
    if ipaddr & ~mask:
        return ("some bits are specified (%s) which fall outside the mask (%s) in:\n %s"
                % (hex(ipaddr), hex(mask), line))
    return None


# do a picky parse of the file.  Keep a log of any problems
# in the 'log' field.  Local policy decides whether to keep or
# discard files with problems.  Anything useful is kept.
def parse(file):
    """Parse an open web-o-trust file and return its fields as a dict.

    file is any object with readline() and close(); it is always closed.
    Lines may be text or bytes (bytes are decoded as UTF-8).

    The result always has 'ip' (list of accepted address strings),
    'include' (list of [url, depth-or-None]), 'omit' (list of strings),
    'now' (the start time of this run) and 'log' (list of problem
    messages).  'keepfor' is stored as an int when present; any other
    field is stored as its string value.
    """
    fields = {'ip': [], 'include': [], 'omit': []}
    log = []
    try:
        while 1:
            line = file.readline()
            if not line:
                break
            # network responses hand us bytes on Python 3; work in text.
            if isinstance(line, bytes) and not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            # now get rid of newline, return and/or trailing spaces.
            line = line.rstrip()
            # ignore comments and blank lines
            if not line or line[0] == '#':
                continue
            match = re.match('(.*?): *(.*)', line)
            if not match:
                log.append("not a #comment, nor field:value in line:\n " + line)
                continue
            field, value = match.groups()
            field = field.lower()
            value = value.rstrip()
            if field == 'omit':
                fields[field].append(value)
            elif field == 'ip':
                # check the correctness of the ip address ranges.
                problem = _ip_problem(value, line)
                if problem is None:
                    fields[field].append(value)
                else:
                    log.append(problem)
            elif field == 'include':
                value = value.split()
                # if none specified, remember that.
                if len(value) == 1:
                    value.append(None)
                elif len(value) == 2:
                    if not re.match(r'\d+$', value[1]):
                        log.append("include depth is not a number in:\n " + line)
                        continue
                    value[1] = int(value[1])
                else:
                    log.append("include has more than two fields:\n " + line)
                    continue
                fields[field].append(value)
            elif field == 'keepfor':
                if not re.match(r'\d+$', value):
                    log.append("keepfor is not a number in: " + line)
                    continue
                fields[field] = int(value)
            else:
                fields[field] = value
    finally:
        file.close()
    fields['now'] = now
    fields['log'] = log
    return fields


def _open_url(url):
    """Open url for reading bytes; a name without a scheme is a local path."""
    # URLopener treated scheme-less names as local files; keep doing that.
    if not re.match(r'[^/:]+:', url):
        return open(url, 'rb')
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen  # Python 2
    return urlopen(url)


# fetch a URL and return the candidates for further fetches.
# fetch it from the cache unless the keepfor has expired.
def fetch(url, depth):
    """Return the include list ([url, depth-or-None] pairs) for url.

    Returns [] for a URL present in omit.  A cached entry is reused while
    its fetch time + keepfor + 60 seconds is still later than the start of
    this run.  Otherwise the URL is fetched and parsed and the cache entry
    is replaced; if it cannot be opened, a note goes to stderr and an
    empty file is parsed in its place.  depth is accepted but not used.
    """
    if url in omit:
        return []
    if url in fetched:
        cached = fetched[url]
        keepfor = cached.get('keepfor', 0)
        # diagnostics must stay out of stdout unless asked for: stdout
        # carries the generated DNSBL.
        if debug:
            print("%s %s %s %s" % (url, cached['now'], keepfor, now))
        # don't fetch any more often than every 60 seconds.
        if cached['now'] + keepfor + 60 > now:
            return cached['include']
    try:
        file = _open_url(url)
    except (IOError, ValueError):
        sys.stderr.write("skipping " + url + "\n")
        file = open(os.devnull)
    if debug:
        print(url)
    fields = parse(file)
    if debug:
        for f in fields['log']:
            print(f)
    fetched[url] = fields
    return fields['include']


# implement a breadth-first traversal
def breadth(list, distance, examined):
    """Process one level of the traversal and return the next level.

    list holds (url, depth) pairs to visit; examined is a dict of URLs
    already visited and is updated in place.  distance is accepted but not
    used.  Returns a list of [url, depth] pairs for the following level.
    """
    pending = []
    for url, depth in list:
        if url in examined:
            continue
        examined[url] = 1
        for new in fetch(url, depth):
            # we already included their entries, but a remaining depth
            # of 1 means that we don't trust their includes.
            if depth == 1:
                continue
            limit = new[1]
            # if we limit our trust of them:
            if depth:
                # if they gave a number, compute the next level of trust.
                if limit:
                    limit = min(limit, depth - 1)
                # they didn't give a number or they gave zero; default to next level.
                else:
                    limit = depth - 1
            # if we don't limit them, let their native limits flow through.
            # Queue a fresh pair: `new` belongs to the cached parse result,
            # and changing it there would leak this run's limits into the
            # pickled cache.
            pending.append([new[0], limit])
    return pending


# options
verbose = 0         # verbose output, for patched rbldns
defaultdepth = 0    # if they don't specify any depth in an include:
debug = 0           # being loud-mouthed.

# given a set of urls for web-o-trust files, create a DNSBL.
# we have a bunch of policy decisions to make:
#   if we encounter an include: with a limit, do we increase our limit?
#   if we encounter an include: we've already seen, do we change its limit?
#
# In the following, we trust harry to a level of 3, marty 2, and jane 1.
# Harry trusts susie to a level of 1, and betty 2 (which is the maximum for
# someone that Harry trusts).
#
#  me --> harry (3) --> susie (1)
#   |       |
#   |       \-> betty (2)
#   |
#   +-> marty (2) -->
#   |       |
#   |       \-> betty
#   |
#   \-> jane (1) --> truncated
#
def main():
    """Command-line entry point: collate [-v] [-D] [-d depth] url.

    -v prints "ip:127.0.0.2:url" lines instead of bare addresses, -D turns
    on debug output and -d sets the trust depth given to the starting URL.
    The traversal starts at url; afterwards every address held in the
    cache is printed to stdout and the cache is written to collate.data.
    A usage error is reported on stderr and exits with status 2.
    """
    global verbose, debug, defaultdepth
    # set up our default policies
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'vDd:')
        for (opt, val) in optlist:
            if opt == '-v':
                verbose = 1
            elif opt == '-D':
                debug = 1
            elif opt == '-d':
                try:
                    defaultdepth = int(val)
                except ValueError:
                    raise getopt.error("depth is not a number: %s" % val)
            else:
                raise getopt.error("unrecognized option")
        if len(args) != 1:
            raise getopt.error("wrong number of arguments")
        url = args[0]
    except getopt.error as detail:
        # stdout carries the generated list, so complain on stderr and
        # make the failure visible in the exit status.
        sys.stderr.write("%s\n" % detail)
        sys.stderr.write("usage: collate [-v] [-D] [-d depth] url\n")
        sys.exit(2)

    pending = [(url, defaultdepth)]
    distance = 0
    examined = {}
    while pending:
        if debug:
            print("pending: %s" % (pending,))
        pending = breadth(pending, distance, examined)
        distance = distance + 1

    for url, contents in fetched.items():
        for ip in contents['ip']:
            if verbose:
                print("%s:127.0.0.2:%s" % (ip, url))
            else:
                print(ip)

    # pickles are binary data, so the file has to be opened in binary mode.
    with open('collate.data', 'wb') as outf:
        pickle.dump(fetched, outf)
        pickle.dump(omit, outf)


if __name__ == '__main__':
    main()