# SWIG Mirror Script, v. 0.0 # July 6, 1997 # Dave Beazley (beazley@cs.utah.edu) # # This Python script mirrors the SWIG WWW site. All links # from the master pages are copied to the local host relative # to the current working directory (ie. this script should be # run in the same directory where you want the web-pages to go). # # How to run : # ------------ # # 1. Edit the setting at the top of this file. Most # importantly, edit the line 'newbase' to point to # the full URL where your SWIG mirror is going to be # located. # # 2. The variables 'base' and 'ftpbase' point to the # SWIG master distribution you are going to mirror. # # 3. Edit the hostinfo string to point to your local # site. This text will automatically get placed # into the SWIG master pages. # # What does this script do? # ------------------------- # # 1. All active links from the SWIG main page are copied # to the current working directory. Subdirectories # will be created as needed. Inline images are also # grabbed from the pages. # # 2. All non-relative links involving the 'base' variable # below are replaced with the value of 'newbase' you # provide. # # 3. All links to files on the SWIG FTP server (ftpbase) # are copied into a subdirectory directory FTP. This # is *not* a full mirror of the SWIG FTP server, but # it is a mirror of all files that are linked directly # from the WWW pages. Links to these pages are also # updated automatically. # # 4. File permissions should be set accordingly during # the mirror process. # # Notes: # ------ # 1. This script is a hack # 2. Mirroring the entire site takes about 10-20 minutes # depending on how busy the University of Utah # servers are. The lowest traffic period is between # 04:00-06:00 US mountain standard time (this is about # 12:00 in Europe). # 3. The SWIG web-page is not updated daily. A weekly # mirror is more than sufficient. # 4. The script downloads everything. It might be # more intelligent to check to see if files have # changed, but I haven't work that out yet. # 5. Sometimes the script terminates with a weird # exception in .__del__. Don't worry about this. # # Please report problems to beazley@cs.utah.edu # newbase ="http://bifrost.lanl.gov/~dmb/SWIG/" base = "http://www.cs.utah.edu/~beazley/SWIG/" ftpbase = "ftp://ftp.cs.utah.edu/pub/beazley/SWIG/" hostinfo = """

This mirror is hosted by the Condensed Matter and Statistical Physics group at Los Alamos National Laboratory. """ # Don't touch anything below here import urllib import htmllib import formatter import posix import string import urlparse import regsub filter = base class MyParser(htmllib.HTMLParser): def __init__(self, formatter): htmllib.HTMLParser.__init__(self,formatter) self.images = [] def handle_image(self,src,alt,ismap=None, align=None, width=None, height=None): self.images.append(src) # Replaces occurrence of non-relative SWIG page URL with # new location def replace_links(str): b = '"'+base nb = '"'+newbase if string.find(str,b) >= 0: print "Replacing link ", base str = regsub.gsub(b,nb,str) # Replace any FTP links b = '"'+ftpbase nb = '"'+newbase+"FTP/" str = regsub.gsub(b,nb,str) return regsub.gsub("",hostinfo,str) def grab_url(url,dest): print "Getting ", url, " --> ", dest try: u = urllib.urlopen(url) str = u.read(10000000) u.close() h = MyParser(formatter.NullFormatter()) h.feed(str) str = replace_links(str) mkdirs(dest) f = open(dest,"wb") f.write(str) f.close() posix.system("chmod 644 "+dest) return (h.anchorlist, h.images) del h except: print "Unable to open ", url return None def grab_image(url,dest): print "Getting image ", url, " --> ", dest try: u = urllib.urlopen(url) str = u.read(10000000) u.close() mkdirs(dest) f = open(dest,"wb") f.write(str) f.close() posix.system("chmod 644 "+dest) except: print "Unable to get ", url # Code for managing subdirectories under me subdirs = [] # Split a file name into components and try to make subdirectories def mkdirs(file): dirs = string.split(file,"/") str = "." for d in dirs[:-1]: str = str + "/" + d if str in subdirs: pass else: print "Making ", str try: posix.system("mkdir "+str) posix.system("chmod 755 "+str) except: pass subdirs.append(str) urls = [] # Gather all URLs and make a local copy def gather(url): if url in urls: return urls.append(url) # Check to see if the URL is a SWIG FTP access if (url[0:len(ftpbase)] == ftpbase): # Yep. We're going to play some games # With it s = url[len(ftpbase):] if (string.find(s,".")) >= 0: dest = "FTP/" + url[len(ftpbase):] # Go grab it grab_url(url,dest) return else: return # Check to make sure the full URL complys with our filter if (url[0:len(filter)] != filter): return # Build the destination directory (from our base) dest = url[len(base):] # Try to grab the first URL d = grab_url(url,dest) # Now, we're going to go through and grab links and images try: links = d[0] imgs = d[1] except: links = [] imgs = [] for l in links: newu = urlparse.urljoin(url,l,0) if string.find(newu,'#') < 0: gather(newu) for i in imgs: newu = urlparse.urljoin(url,i,0) dest = newu[len(base):] if newu in urls: pass else: grab_image(newu,dest) urls.append(newu) gather(base+"index.html")