# (c) 2007-2010 IBM Corporation and Others. All Rights Reserved.
# Python module for scanning and mirroring CLDR references.
#
# Steven R. Loomis. Oct 30th, 2007
#
#
# usage:  refmirror.py  /path/to/cldr/common   /path/to/nonexistent/refmirror-output-dir
#
# note:
#  - does condense duplicate URLs within a locale, to only download once
#      (should condense globally.)
#  - requires 'wget' installed.
# 
# todo:
#  - only handles <references> formats - so CLDR 1.5 main/ but NOT collation/
#  - doesn't escape UTF-8 URLs such as wikipedia ( writes out url in utf-8, does not %-encode )
#  - should probably pass "-n 2" or such to wget to shorten hang time
#

import codecs
import os
import subprocess
import sys
from xml.dom import minidom

# --- command-line handling and setup -------------------------------------
# The script takes exactly two positional arguments: the CLDR source root
# and the directory the mirror is written into.
progname = sys.argv[0]

if len(sys.argv) != 3:
    raise RuntimeError("Usage: %s  <cldrroot> <output dir>" % (progname,))

cldrdir, htmldir = sys.argv[1:3]

# os.mkdir raises if the output directory is already present, so a previous
# mirror is never silently overwritten.
print("# creating %s (shouldn't exist)" % (htmldir,))
os.mkdir(htmldir)

print("# walking %s" % (cldrdir,))

# Iterator over (dirpath, dirnames, filenames) triples, consumed by the walk
# loop that follows.
dirs = os.walk(cldrdir)

def _native(text):
    """Return *text* as the interpreter's native ``str`` type.

    On Python 2 the attribute values handed out by minidom are ``unicode``;
    they are encoded to UTF-8 so they can be printed and passed as process
    arguments without an implicit ASCII conversion.  Values that already are
    ``str`` are returned unchanged.
    """
    if isinstance(text, str):
        return text
    return text.encode("utf-8")


def _utf8(text):
    """Return *text* as UTF-8 encoded bytes (bytes are passed through)."""
    if isinstance(text, bytes):
        return text
    return text.encode("utf-8")


def _write_bytes(path, data):
    """Create/overwrite *path* with the byte string *data*."""
    # Binary mode: the payload is already encoded, so no text layer is wanted.
    with open(path, "wb") as handle:
        handle.write(data)


def _download(uri, typedir):
    """Mirror *uri* into the directory *typedir* using wget.

    wget's stdout and stderr are both captured in ``<typedir>.err``.  If the
    download cannot even be started (for example wget is not installed), the
    error is recorded in ``<typedir>.exc`` and reported on stdout; the walk
    then carries on with the next reference.
    """
    # An argument list (no shell) keeps quotes or metacharacters in the URI
    # or in the paths from being interpreted by a shell.
    argv = [_native(arg) for arg in
            ("wget", "-P", typedir, "-nd", "-np", "-k", "-p", uri)]
    print(" ".join(argv))
    try:
        with open("%s.err" % typedir, "wb") as log:
            # wget writes its log to stderr, so stderr is folded into the
            # same log file as stdout.
            subprocess.call(argv, stdout=log, stderr=subprocess.STDOUT)
    except Exception as exc:
        # Deliberately broad: mirroring is best-effort, one failed launch
        # must not abort the whole run.  The failure is still recorded.
        _write_bytes("%s.exc" % typedir, _utf8("exception: %s\n" % str(exc)))
        print("%s - exception %s" % (_native(typedir), str(exc)))


def _mirror_reference(ref, xmldir, filename, filepath, seen):
    """Mirror a single <reference> element.

    ref      -- the minidom <reference> element
    xmldir   -- output directory belonging to the XML file being processed
    filename -- bare name of that XML file (used in messages)
    filepath -- path of that XML file (used in messages)
    seen     -- dict mapping already-downloaded URIs to the reference type
                that downloaded them; updated in place
    """
    if not ref.hasAttribute('type'):
        print("## untyped reference in %s" % filepath)
        return

    reftype = ref.getAttribute('type')
    if ref.hasAttribute('alt'):
        reftype = "%s-%s" % (reftype, ref.getAttribute('alt'))
    typedir = "%s/%s" % (xmldir, reftype)

    if not ref.hasAttribute('uri'):
        return
    uri = ref.getAttribute('uri')
    if uri.startswith('urn:'):
        uri = uri[len('urn:'):]
    if uri.startswith(('isbn', 'ISBN')):
        # assume ISBN can fend for itself
        return
    if not uri.startswith('http'):
        print(_native("# Not a known scheme: %s on %s / %s"
                      % (uri, filename, reftype)))
        return

    # write the Info file: the <reference> element itself, UTF-8 with BOM
    _write_bytes("%s.xml" % typedir, codecs.BOM_UTF8 + _utf8(ref.toxml()))
    # make the dir the download (or the duplicate marker) goes into
    os.mkdir(typedir)

    if uri in seen:
        # Same URI already fetched for this XML file: only leave a pointer
        # to the reference type that holds the download.
        _write_bytes("%s/duplicate.txt" % typedir, _utf8("%s\n" % seen[uri]))
    else:
        seen[uri] = reftype
        _download(uri, typedir)


def _mirror_file(dirpath, filename, outdir):
    """Mirror every reference listed in one CLDR XML file.

    Files without a <references> element are skipped without creating any
    output.  Otherwise a directory named after the file is created under
    *outdir* and each <reference> of the first <references> element is
    processed.
    """
    filepath = "%s/%s" % (dirpath, filename)
    xmldir = "%s/%s" % (outdir, filename)

    dom = minidom.parse(filepath)
    # documentElement is the root element regardless of whether a DOCTYPE or
    # comments precede it in the document.
    ref_lists = dom.documentElement.getElementsByTagName('references')
    if not ref_lists:
        return
    print("## got refnode %s" % filepath)
    os.mkdir(xmldir)

    # URIs already downloaded for this file -> reference type that did so
    seen = {}
    for ref in ref_lists[0].getElementsByTagName('reference'):
        _mirror_reference(ref, xmldir, filename, filepath, seen)


# Walk the CLDR tree, recreating its directory layout under htmldir and
# mirroring the references of every .xml file found.
for dirpath, _subdirs, filenames in dirs:
    if dirpath.endswith("/CVS"):
        continue
    # path of this directory relative to the CLDR root
    leaf = dirpath[len(cldrdir):]
    if leaf.startswith('/'):
        leaf = leaf[1:]
    print("dir: %s" % str(leaf))
    out = "%s/%s" % (htmldir, leaf)
    if leaf:
        # the root itself (empty leaf) was already created as htmldir
        os.mkdir(out)
    for filename in filenames:
        if filename.endswith('.xml'):
            _mirror_file(dirpath, filename, out)