|
| 1 | +#! /usr/bin/env python |
| 2 | +import sys |
| 3 | +import string |
| 4 | +import perfect_hash |
| 5 | + |
| 6 | +# This is a user of perfect_hash.py |
| 7 | +# that takes as input the UnicodeData.txt file available from: |
| 8 | +# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt |
| 9 | + |
| 10 | +# It generates a hash table from Unicode Character Name -> |
| 11 | +# unicode code space value. |
| 12 | + |
# Seeds handed to perfect_hash.generate_hash(); they determine which hash
# function is tried first.
# With these seeds a multiplier of 1.7875 worked for UnicodeData.txt as of
# 2000/06/24.
f1Seed = 1694245428
f2Seed = -1917331657

# Maximum allowed multiplier: if this isn't None then, instead of continually
# increasing C, the search resets C back to initC and keeps looking for
# a solution.
# NOTE(review): the variable is named "minC" but the comment above describes
# a maximum -- confirm the intended meaning against
# perfect_hash.generate_hash().
minC = 1.7875
# Initial multiplier for trying to find a perfect hash function.
initC = 1.7875

# Names passed to the perfect_hash code generators and used for the
# generated output files (see generateOutputFiles()).
moduleName = "ucnhash"
dataArrayName = "aucn"
dataArrayType = "_Py_UnicodeCharacterName"
headerFileName = "ucnhash.h"
cFileName = "ucnhash.c"
structName = "_Py_UCNHashAPI"

# Filled in by main():
#   keys     -- list of (uppercased character name, running index) tuples
#   hashData -- dict mapping a character name to its integer code value
keys = []
hashData = {}
| 34 | + |
def generateOutputFiles(perfHash, hashData):
    """Write the generated C header and C source files.

    The header (headerFileName) receives perfHash.generate_header(structName)
    followed by the typedef of the name/value record.  The C file (cFileName)
    receives an #include of the header, the code returned by
    perfHash.generate_code(), whatever perfHash.generate_graph() writes, and
    finally a static table with one { name, value } entry per item of the
    module-level ``keys`` list, in list order.

    perfHash -- object providing generate_header(), generate_code() and
                generate_graph(), as called below.
    hashData -- mapping from character name to integer value; it must
                contain every name stored in ``keys`` (KeyError otherwise).

    Progress information is written to sys.stderr.
    """
    header = perfHash.generate_header(structName) + """
typedef struct
{
    const char *pszUCN;
    unsigned int uiValue;
} _Py_UnicodeCharacterName;

"""

    code = perfHash.generate_code(moduleName,
                                  dataArrayName,
                                  dataArrayType,
                                  structName)

    # Close each file explicitly (even if a write fails) instead of relying
    # on garbage collection to flush and release the handles.  try/finally
    # is used rather than "with" so the script stays valid on the old
    # Python 2 interpreters it was written for.
    out = open(headerFileName, "w")
    try:
        out.write(header)
    finally:
        out.close()

    out = open(cFileName, "w")
    try:
        out.write("#include <%s>\n" % headerFileName)
        out.write(code)
        perfHash.generate_graph(out)
        out.write("""

static const _Py_UnicodeCharacterName aucn[] =
{
""")
        # Each entry of keys is a tuple whose first item is the name; the
        # table must be emitted in the same order as keys.
        for entry in keys:
            name = entry[0]
            out.write(' { "' + name + '", ' + hex(hashData[name]) + " },\n")
        out.write("};\n\n")
    finally:
        out.close()

    sys.stderr.write('\nGenerated output files: \n')
    sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName))
| 67 | + |
def main():
    """Read UnicodeData.txt (path in sys.argv[1]) and generate the C files.

    Every input line is split on ';'.  The first field is the code value in
    hexadecimal and the second field is the character name.  Names starting
    with '<' (controls and range start/end markers) are skipped.  Every
    other name is uppercased and recorded in the module-level ``keys`` list
    as a (name, index) tuple and in ``hashData`` as name -> integer value.

    A line with fewer than two fields, or with an empty name, is reported
    on sys.stderr with its real line number and the script exits with
    status 1.  A non-hexadecimal first field raises ValueError.

    After parsing, a perfect hash is built with perfect_hash.generate_hash()
    and handed to generateOutputFiles().
    """
    infile = open(sys.argv[1], 'r')
    try:
        lineNumber = 0   # physical line in the input, for error messages
        index = 0        # running index of accepted names
        for line in infile:
            lineNumber = lineNumber + 1
            fields = line.split(';')
            if len(fields) < 2 or not fields[1].strip():
                sys.stderr.write('Ill-formated line!\n')
                sys.stderr.write('line #: %d\n' % lineNumber)
                # Non-zero status so callers can detect the failure.
                sys.exit(1)
            data, key = fields[:2]
            key = key.strip()
            # Any name starting with '<' is a control, or start/end
            # character, so skip it...
            if key[0] == "<":
                continue
            # Force the name to uppercase and use that same spelling for
            # both containers, so the lookup hashData[name] performed for
            # every entry of keys cannot miss.
            key = key.upper()
            keys.append((key, index))
            index = index + 1
            hashData[key] = int(data, 16)
    finally:
        # Runs on the sys.exit() path as well.
        infile.close()

    sys.stderr.write('%i key/hash pairs read\n' % len(keys))
    # NOTE(review): the meaning of the second argument (1) is not visible
    # here -- confirm against perfect_hash.generate_hash().
    perfHash = perfect_hash.generate_hash(keys, 1,
                                          minC, initC,
                                          f1Seed, f2Seed,
                                          # increment, tries
                                          0.0025, 50)
    generateOutputFiles(perfHash, hashData)
| 101 | + |
if __name__ == '__main__':
    # Script entry point: require the input filename as the only argument.
    if len(sys.argv) == 1:
        # Write the usage text straight to stderr instead of rebinding
        # sys.stdout, and exit with a non-zero status so that shells and
        # build scripts see the missing argument as a failure.
        sys.stderr.write('Usage: %s <input filename>\n' % sys.argv[0])
        sys.stderr.write(' The input file needs to be UnicodeData.txt\n')
        sys.exit(2)
    main()
| 109 | + |
0 commit comments