Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2101348

Browse files
committed
Fiddled with /F's cool new splitbins function: documented it, generalized it
a bit, sped it up a lot (primarily by removing the unused assumption that None was a legitimate bin entry -- the function doesn't really need to assume that there's anything special about 0), added an optional "trace" argument, and, in __debug__ mode, added exhaustive verification that the decomposition is both correct and doesn't overstep any array bounds (which wasn't obvious to me from staring at the generated C code -- now I feel safe!). Did not commit a new unicodedata_db.h, as the one produced by this version is identical to the one already checked in.
1 parent 68ded6e commit 2101348

1 file changed

Lines changed: 54 additions & 26 deletions

File tree

Tools/unicode/makeunicodedata.py

Lines changed: 54 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -165,38 +165,66 @@ def getsize(data):
165165
else:
166166
return 4
167167

168-
def splitbins(bins):
169-
# split a sparse integer table into two tables, such as:
170-
# value = t2[(t1[char>>shift]<<shift)+(char&mask)]
171-
# and value == 0 means no data
172-
bytes = sys.maxint
173-
for shift in range(16):
174-
bin1 = []
175-
bin2 = []
168+
def splitbins(t, trace=0):
169+
"""t, trace=0 -> (t1, t2, shift). Split a table to save space.
170+
171+
t is a sequence of ints. This function can be useful to save space if
172+
many of the ints are the same. t1 and t2 are lists of ints, and shift
173+
is an int, chosen to minimize the combined size of t1 and t2 (in C
174+
code), and where for each i in range(len(t)),
175+
t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
176+
where mask is a bitmask isolating the last "shift" bits.
177+
178+
If optional arg trace is true (default false), progress info is
179+
printed to sys.stderr.
180+
"""
181+
182+
import sys
183+
if trace:
184+
def dump(t1, t2, shift, bytes):
185+
print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
186+
len(t1), len(t2), shift, bytes)
187+
print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
188+
"bytes"
189+
n = len(t)-1 # last valid index
190+
maxshift = 0 # the most we can shift n and still have something left
191+
if n > 0:
192+
while n >> 1:
193+
n >>= 1
194+
maxshift += 1
195+
del n
196+
bytes = sys.maxint # smallest total size so far
197+
t = tuple(t) # so slices can be dict keys
198+
for shift in range(maxshift + 1):
199+
t1 = []
200+
t2 = []
176201
size = 2**shift
177202
bincache = {}
178-
for i in range(0, len(bins), size):
179-
bin = bins[i:i+size]
180-
index = bincache.get(tuple(bin))
203+
for i in range(0, len(t), size):
204+
bin = t[i:i+size]
205+
index = bincache.get(bin)
181206
if index is None:
182-
index = len(bin2)
183-
bincache[tuple(bin)] = index
184-
for v in bin:
185-
if v is None:
186-
bin2.append(0)
187-
else:
188-
bin2.append(v)
189-
bin1.append(index>>shift)
207+
index = len(t2)
208+
bincache[bin] = index
209+
t2.extend(bin)
210+
t1.append(index >> shift)
190211
# determine memory size
191-
b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
212+
b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
213+
if trace:
214+
dump(t1, t2, shift, b)
192215
if b < bytes:
193-
best = shift, bin1, bin2
216+
best = t1, t2, shift
194217
bytes = b
195-
shift, bin1, bin2 = best
196-
## print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
197-
## len(bin1), len(bin2), shift, bytes
198-
## )
199-
return bin1, bin2, shift
218+
t1, t2, shift = best
219+
if trace:
220+
print >>sys.stderr, "Best:",
221+
dump(t1, t2, shift, bytes)
222+
if __debug__:
223+
# exhaustively verify that the decomposition is correct
224+
mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
225+
for i in xrange(len(t)):
226+
assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
227+
return best
200228

201229
if __name__ == "__main__":
202230
maketable()

0 commit comments

Comments
 (0)