|
45 | 45 | this efficiently by caching the direct FFTs. |
46 | 46 |
|
47 | 47 | = record array helper functions = |
| 48 | + * rec2txt : pretty print a record array |
48 | 49 | * rec2txt : pretty print a record array |
49 | 50 | * rec2csv : store record array in CSV file |
50 | 51 | * csv2rec : import record array from CSV file with type inspection |
@@ -2113,6 +2114,139 @@ def key_desc(name): |
2113 | 2114 | return newrec.view(npy.recarray) |
2114 | 2115 |
|
2115 | 2116 |
|
def rec_groupby(r, groupby, stats):
    """
    Summarize the record array r group by group.

    r is a numpy record array

    groupby is a sequence of record array attribute names that
    together form the grouping key.  eg ('date', 'productcode')

    stats is a sequence of (attr, func, outname) tuples.  For every
    group, x = func(thisr[attr]) is evaluated, where thisr holds the rows
    of r belonging to that group, and x is stored in the output record
    array under the attribute outname.
    Eg, stats = ( ('sales', len, 'numsales'), ('sales', npy.mean, 'avgsale') )

    The returned record array has one row per distinct grouping key,
    sorted by key.  Its dtype names are the attribute names in the
    'groupby' argument (holding the group values) followed by each
    outname in the stats argument (holding the associated stat summary
    output).
    """
    # build a dictionary from groupby keys -> list of indices into r with
    # those keys
    rowd = dict()
    for i, row in enumerate(r):
        key = tuple([row[attr] for attr in groupby])
        rowd.setdefault(key, []).append(i)

    rows = []
    # sort the output by groupby keys; sorted() works whether keys() is a
    # list or a dict view, unlike calling .sort() on the result of keys()
    for key in sorted(rowd):
        # the slice of r holding only the rows for this groupby key
        thisr = r[rowd[key]]
        row = list(key)
        # call each stat function for this groupby slice
        row.extend([func(thisr[attr]) for attr, func, outname in stats])
        rows.append(row)

    # build the output record array with groupby and outname attributes;
    # collecting the outnames with a comprehension (rather than unpacking
    # zip(*stats)) also copes with an empty stats sequence
    names = list(groupby)
    names.extend([outname for attr, func, outname in stats])
    return npy.rec.fromrecords(rows, names=names)
| 2160 | + |
| 2161 | + |
| 2162 | + |
def rec_summarize(r, summaryfuncs):
    """
    r is a numpy record array

    summaryfuncs is a list of (attr, func, outname) tuples.  Each func
    is applied to the array r[attr] and its output, converted with
    npy.asarray, is stored under the new attribute name outname.

    The returned record array carries every field of r, in the original
    order, followed by one extra field for each element in summaryfuncs.
    """
    # start from the existing fields of r ...
    fieldnames = list(r.dtype.names)
    columns = [r[fieldname] for fieldname in fieldnames]

    # ... then compute each (outname, summary array) pair in one pass
    extras = [(outname, npy.asarray(func(r[attr])))
              for attr, func, outname in summaryfuncs]

    fieldnames.extend([outname for outname, column in extras])
    columns.extend([column for outname, column in extras])

    return npy.rec.fromarrays(columns, names=fieldnames)
| 2181 | + |
def rec_join(key, r1, r2):
    """
    join record arrays r1 and r2 on key; key is a tuple of field
    names (a single field name given as a string is also accepted).
    Rows of r1 and r2 that have equal values on all the fields in the
    key tuple are merged into one row of a new record array.

    Only keys found in both r1 and r2 produce an output row, and the
    output rows keep the same relative order as in r1.  If a key occurs
    more than once in an array, the last row carrying it is the one used.

    The output fields are the key fields, then the remaining fields of
    r1, then the fields of r2 that r1 does not have.  For a string key
    field the wider of the r1 and r2 string types is used.

    Raises ValueError if a key field is missing from r1 or r2.
    """
    if isinstance(key, str):
        # iterating a bare string would treat every character as a field
        key = (key,)

    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %s'%name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %s'%name)

    def makekey(row):
        return tuple([row[name] for name in key])

    # map each key tuple to the index of the (last) row that carries it
    r1d = dict([(makekey(row), i) for i, row in enumerate(r1)])
    r2d = dict([(makekey(row), i) for i, row in enumerate(r2)])

    # (r1 index, r2 index) for every key present in both arrays; sorting
    # the pairs makes the output rows follow the relative order of r1
    pairs = sorted([(i1, r2d[k]) for k, i1 in r1d.items() if k in r2d])

    # dtype=int keeps the index arrays usable when no keys match (a bare
    # npy.array([]) is float and cannot be used as an index)
    r1ind = npy.array([i1 for i1, i2 in pairs], dtype=int)
    r2ind = npy.array([i2 for i1, i2 in pairs], dtype=int)

    r1 = r1[r1ind]
    r2 = r2[r2ind]

    # only the fields of r2 that r1 lacks are carried over
    r1names = set(r1.dtype.names)
    r2names = [name for name in r2.dtype.names if name not in r1names]

    def key_desc(name):
        'if name is a string key, use the larger size of r1 or r2 before merging'
        dt1 = r1.dtype[name]
        dt2 = r2.dtype[name]
        # kind 'S'/'U' are the byte-string and unicode-string dtypes
        if dt1.kind in 'SU' and dt2.kind == dt1.kind and dt2.itemsize > dt1.itemsize:
            return (name, dt2.descr[0][1])
        return (name, dt1.descr[0][1])

    keydesc = [key_desc(name) for name in key]

    newdtype = npy.dtype(keydesc +
                         [(name, r1.dtype[name]) for name in r1.dtype.names if name not in key] +
                         [(name, r2.dtype[name]) for name in r2names])

    newrec = npy.empty(len(r1), dtype=newdtype)
    for field in r1.dtype.names:
        newrec[field] = r1[field]

    for field in r2names:
        newrec[field] = r2[field]

    return newrec.view(npy.recarray)
| 2249 | + |
2116 | 2250 | def csv2rec(fname, comments='#', skiprows=0, checkrows=0, delimiter=',', |
2117 | 2251 | converterd=None, names=None, missing=None): |
2118 | 2252 | """ |
@@ -2499,7 +2633,6 @@ def format(item, just_pad_prec_spacer): |
2499 | 2633 | return text |
2500 | 2634 |
|
2501 | 2635 |
|
2502 | | - |
2503 | 2636 | def rec2csv(r, fname, delimiter=',', formatd=None): |
2504 | 2637 | """ |
2505 | 2638 | Save the data from numpy record array r into a comma/space/tab |
|
0 commit comments