#!/usr/bin/env python
## Mat Kovach (matkovach@gmail.com)
## GPLv2
## 20080601 -- MEK
## First version.  Just like me it is ugly, lacks refinement
## and full of bugs.
## This may or may not be part of the RetroSQL (retrosql@yahoogroups.com)
##
## BEGIN CODE HERE

import os
import sys
import getopt
import socket
import zipfile
import time

try:
    # Python 2 module names.
    import urllib2
    from urlparse import urljoin
except ImportError:
    # Python 3: urllib.request provides urlopen() and re-exports URLError,
    # so it can stand in for the names this script uses from urllib2.
    import urllib.request as urllib2
    from urllib.parse import urljoin


def geturl(url):
    """Open *url* and return the file-like response object.

    Returns False (after printing a message) when the URL cannot be
    opened, so callers can test the result against False.
    """
    try:
        return urllib2.urlopen(url)
    except urllib2.URLError as err:
        print("Error, %s" % (err,))
    except socket.error as err:
        # Not every socket error carries (errno, strerror) -- a timeout has
        # a single message argument -- so read the attributes defensively
        # instead of unpacking the exception.
        code = getattr(err, "errno", None)
        reason = getattr(err, "strerror", None) or err
        print("Socket error (%s) for url %s (%s)" % (code, url, reason))
    except ValueError as err:
        # urlopen() raises ValueError for strings that are not URLs at all.
        print("Error, %s" % (err,))
    return False


def unzipfile(file):
    """Extract the members of zip archive *file* into its own directory.

    Directory entries (names ending in '/') are skipped; parent directories
    of regular members are created as needed.  Members whose names would
    resolve outside the archive's directory are skipped with a message.

    Returns True when extraction completed, False (after printing a
    message) when the archive could not be opened or a member could not be
    read or written.
    """
    dest_dir = os.path.abspath(os.path.dirname(file))
    # join(..., "") appends exactly one trailing separator, also for "/".
    dest_prefix = os.path.join(dest_dir, "")

    try:
        zf = zipfile.ZipFile(file)
    except (IOError, OSError, zipfile.BadZipfile) as err:
        print("Error: %s" % (err,))
        return False

    try:
        for name in zf.namelist():
            if name.endswith('/'):
                continue
            target = os.path.abspath(os.path.join(dest_dir, name))
            if not target.startswith(dest_prefix):
                # Absolute or "../" member names must not escape dest_dir.
                print("Skipping unsafe member name %s" % name)
                continue
            print("Extracting %s" % name)
            try:
                parent = os.path.dirname(target)
                if not os.path.isdir(parent):
                    os.makedirs(parent)
                with open(target, 'wb') as outfile:
                    outfile.write(zf.read(name))
            except (IOError, OSError, zipfile.BadZipfile) as err:
                print("Error: %s" % (err,))
                return False
    finally:
        zf.close()
    return True


def usage():
    """Print the command-line help text to standard output."""
    program = os.path.basename(sys.argv[0])
    # Must name the same default directory that main() falls back to.
    default_dir = os.path.join(os.getcwd(), "retrosheet")
    print("Usage: %s -h --help -d --debug -k --keep -o --output=<dir>" % (program,))
    print("\t-h, --help Print this message")
    print("\t-d, --debug Show debugging info")
    print("\t-o <dir>, --output=<dir> Download/unzip into directory <dir>")
    print("\tDefaults to: %s" % (default_dir,))
    print("\t-k, --keep Keep zipfiles, do not delete.")
def _parse_options(argv):
    """Parse command-line *argv* into an ``(output, debug, keep)`` tuple.

    Prints the usage text and exits with status 2 on an invalid option, and
    exits with status 0 after printing usage for -h/--help.  *output* is
    returned as an absolute path and defaults to ``retrosheet`` under the
    current working directory.
    """
    try:
        opts, _args = getopt.getopt(
            argv, "ho:dk", ["help", "debug", "output=", "keep"])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)

    output = None
    debug = False
    keep = False
    for option, arg in opts:
        if option in ("-d", "--debug"):
            debug = True
        elif option in ("-h", "--help"):
            usage()
            sys.exit()
        elif option in ("-o", "--output"):
            output = arg
        elif option in ("-k", "--keep"):
            keep = True
        else:
            # getopt already rejects unknown options; raise explicitly so
            # the check is not stripped when running under -O.
            raise AssertionError("unhandled option")

    if output is None:
        output = os.path.join(os.getcwd(), "retrosheet")
    return os.path.abspath(output), debug, keep


def _prepare_output_dir(output):
    """Create directory *output* if missing; exit if it is a regular file."""
    import errno  # not imported at file level

    if os.path.isfile(output):
        print("%s is a file, exiting ... " % (output,))
        sys.exit(1)
    if not os.path.exists(output):
        print("%s does not exists, creating output directory" % (output,))
        try:
            os.makedirs(output)
        except OSError as err:
            # Ignore directory exists error
            if err.errno != errno.EEXIST:
                raise


def _save_response(response, path, label):
    """Copy *response* to *path*, always closing it; return True on success."""
    import shutil  # not imported at file level

    try:
        try:
            with open(path, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
        finally:
            response.close()
    except (IOError, OSError) as err:
        print("Error writing %s: %s" % (label, err))
        return False
    return True


def main():
    """Download and unpack the retrosheet.org event-file archives.

    Options are read from ``sys.argv`` (see usage()).  For every year from
    1950 up to, but not including, the current year, the matching zip
    file(s) are downloaded into the output directory and extracted with
    unzipfile(); the zip is deleted afterwards unless -k/--keep was given.
    Years before 1997 are fetched as separate 'al' and 'nl' archives, later
    years as a single 'ml' archive.

    Returns 0 when the loop completes.  Exits with status 1 when the output
    path is a regular file or a download target is a directory.
    """
    output, debug, keep = _parse_options(sys.argv[1:])
    if debug:
        print("DEBUG: %s, %s, %s" % (keep, debug, output))
    _prepare_output_dir(output)

    base_url = "http://www.retrosheet.org"
    this_year = int(time.strftime('%Y'))
    for year in range(1950, this_year):
        y = str(year)
        if year < 1997:
            leagues = ['al', 'nl']
        else:
            leagues = ['ml']
        for league in leagues:
            filename = y + league + '.zip'
            url = urljoin(base_url, y + '/' + filename)
            print("Working on %s, %s" % (year, league))

            in_file = geturl(url)
            if in_file is False:
                print("Error, skipping %s" % (filename,))
                continue

            path = os.path.join(output, filename)
            if os.path.isdir(path):
                print("%s exists, but is a directory!" % (filename,))
                sys.exit(1)
            if os.path.exists(path):
                try:
                    os.remove(path)
                except OSError as err:
                    print("Error, %s" % (err,))
                else:
                    print("Removed previous %s" % (filename,))

            if not _save_response(in_file, path, filename):
                # Nothing usable was written; do not try to unzip it.
                continue

            try:
                unzipfile(path)
            except zipfile.BadZipfile as err:
                # A corrupt download must not abort the remaining years.
                print("Error: %s" % (err,))

            if not keep:
                try:
                    os.remove(path)
                except OSError as err:
                    print("Error, %s" % (err,))
                else:
                    print("Removed %s" % (os.path.basename(path),))
    return 0


if __name__ == "__main__":
    sys.exit(main())