diff --git a/doc/source/release.rst b/doc/source/release.rst
index e6de0186acb24..dd72a1445b816 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -105,6 +105,8 @@ API Changes
 - ``NameResolutionError`` was removed because it isn't necessary anymore.
 - ``concat`` will now concatenate mixed Series and DataFrames using the Series name
   or numbering columns as needed (:issue:`2385`)
+- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
+  longer change the type of the resulting index (:issue:`6440`).
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
@@ -125,6 +127,7 @@ Improvements to existing features
 - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
 - Testing statements updated to use specialized asserts (:issue:`6175`)
 - ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
+- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`)
 - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
   using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
 - perf improvements in DataFrame construction with certain offsets, by removing faulty caching
@@ -191,6 +194,10 @@ Bug Fixes
 - Bug in ``read_html`` tests where redirected invalid URLs would make one test
   fail (:issue:`6445`).
 - Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`)
+- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`)
+- Bug in ``DataFrame.to_stata`` that led to data loss in certain cases (:issue:`6335`)
+- Bug in ``DataFrame.to_stata`` which exported using the wrong data types and missing values (:issue:`6335`)
+
 
 pandas 0.13.1
 -------------
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index ada29dc674420..462351860c15b 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -78,6 +78,21 @@ These are out-of-bounds selections
 - ``NameResolutionError`` was removed because it isn't necessary anymore.
 - ``concat`` will now concatenate mixed Series and DataFrames using the Series name
   or numbering columns as needed (:issue:`2385`). See :ref:`the docs `
+- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
+  longer change the type of the resulting index (:issue:`6440`)
+
+  .. ipython:: python
+
+     i = pd.Index([1, 2, 3, 'a', 'b', 'c'])
+     i[[0,1,2]]
+
+  Previously, the above operation would return ``Int64Index``. If you'd like
+  to do this manually, use :meth:`Index.astype`
+
+  .. ipython:: python
+
+     i[[0,1,2]].astype(np.int_)
+
 MultiIndexing Using Slicers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -233,6 +248,9 @@ Enhancements
   using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
 - Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex``
   to convert to the Julian Date used primarily in astronomy. (:issue:`4041`)
+- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types
+  and will upcast when needed. When it isn't possible to upcast losslessly, a warning
+  is raised (:issue:`6327`)
 
 Performance
 ~~~~~~~~~~~
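Reviewer sketch, not part of the patch: a quick illustration of the upcasting
behaviour described in the enhancement note above (the file name ``big.dta``
is arbitrary):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'big': np.array([2 ** 53 + 1, 0], dtype=np.int64)})
    # int64 has no Stata equivalent: the writer upcasts the column to
    # float64, and because 2**53 + 1 cannot be represented losslessly in a
    # float64, a PossiblePrecisionLoss warning is emitted.
    df.to_stata('big.dta')
    pd.read_stata('big.dta')['big'].dtype  # dtype('float64')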
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index 7f406611c82f7..14c9ec2f3355d 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -68,12 +68,14 @@ cdef:
     int TIEBREAK_MAX = 2
     int TIEBREAK_FIRST = 3
     int TIEBREAK_FIRST_DESCENDING = 4
+    int TIEBREAK_DENSE = 5
 
 tiebreakers = {
     'average' : TIEBREAK_AVERAGE,
     'min' : TIEBREAK_MIN,
     'max' : TIEBREAK_MAX,
-    'first' : TIEBREAK_FIRST
+    'first' : TIEBREAK_FIRST,
+    'dense' : TIEBREAK_DENSE,
 }
 
@@ -137,7 +139,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
     """
     cdef:
-        Py_ssize_t i, j, n, dups = 0
+        Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
         ndarray[float64_t] sorted_data, ranks, values
         ndarray[int64_t] argsorted
         float64_t val, nan_value
@@ -200,6 +202,10 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
             elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                 for j in range(i - dups + 1, i + 1):
                     ranks[argsorted[j]] = 2 * i - j - dups + 2
+            elif tiebreak == TIEBREAK_DENSE:
+                total_tie_count += 1
+                for j in range(i - dups + 1, i + 1):
+                    ranks[argsorted[j]] = total_tie_count
             sum_ranks = dups = 0
     if pct:
         return ranks / count
@@ -214,7 +220,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
     """
     cdef:
-        Py_ssize_t i, j, n, dups = 0
+        Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
         ndarray[int64_t] sorted_data, values
         ndarray[float64_t] ranks
         ndarray[int64_t] argsorted
@@ -265,6 +271,10 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
             elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                 for j in range(i - dups + 1, i + 1):
                     ranks[argsorted[j]] = 2 * i - j - dups + 2
+            elif tiebreak == TIEBREAK_DENSE:
+                total_tie_count += 1
+                for j in range(i - dups + 1, i + 1):
+                    ranks[argsorted[j]] = total_tie_count
             sum_ranks = dups = 0
     if pct:
         return ranks / count
@@ -279,7 +289,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
     """
     cdef:
-        Py_ssize_t i, j, z, k, n, dups = 0
+        Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
         ndarray[float64_t, ndim=2] ranks, values
         ndarray[int64_t, ndim=2] argsorted
         float64_t val, nan_value
@@ -324,6 +334,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
 
     for i in range(n):
         dups = sum_ranks = 0
+        total_tie_count = 0
         for j in range(k):
             sum_ranks += j + 1
             dups += 1
@@ -347,6 +358,10 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
                 elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
+                elif tiebreak == TIEBREAK_DENSE:
+                    total_tie_count += 1
+                    for z in range(j - dups + 1, j + 1):
+                        ranks[i, argsorted[i, z]] = total_tie_count
                 sum_ranks = dups = 0
 
     if axis == 0:
@@ -362,7 +377,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
     """
     cdef:
-        Py_ssize_t i, j, z, k, n, dups = 0
+        Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
         ndarray[float64_t, ndim=2] ranks
         ndarray[int64_t, ndim=2] argsorted
         ndarray[int64_t, ndim=2, cast=True] values
@@ -395,6 +410,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
 
     for i in range(n):
         dups = sum_ranks = 0
+        total_tie_count = 0
        for j in range(k):
             sum_ranks += j + 1
             dups += 1
@@ -415,6 +431,10 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
                 elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
+                elif tiebreak == TIEBREAK_DENSE:
+                    total_tie_count += 1
+                    for z in range(j - dups + 1, j + 1):
+                        ranks[i, argsorted[i, z]] = total_tie_count
                 sum_ranks = dups = 0
 
     if axis == 0:
@@ -430,7 +450,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
     """
     cdef:
-        Py_ssize_t i, j, n, dups = 0
+        Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
         ndarray[float64_t] ranks
         ndarray sorted_data, values
         ndarray[int64_t] argsorted
@@ -502,6 +522,10 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
                     ranks[argsorted[j]] = i + 1
             elif tiebreak == TIEBREAK_FIRST:
                 raise ValueError('first not supported for non-numeric data')
+            elif tiebreak == TIEBREAK_DENSE:
+                total_tie_count += 1
+                for j in range(i - dups + 1, i + 1):
+                    ranks[argsorted[j]] = total_tie_count
             sum_ranks = dups = 0
     if pct:
         return ranks / count
@@ -545,6 +569,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
 
     cdef:
         Py_ssize_t i, j, z, k, n, infs, dups = 0
+        Py_ssize_t total_tie_count = 0
         ndarray[float64_t, ndim=2] ranks
         ndarray[object, ndim=2] values
         ndarray[int64_t, ndim=2] argsorted
@@ -600,6 +625,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
 
     for i in range(n):
         dups = sum_ranks = infs = 0
+        total_tie_count = 0
         for j in range(k):
             val = values[i, j]
             if val is nan_value and keep_na:
@@ -621,6 +647,10 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
                 elif tiebreak == TIEBREAK_FIRST:
                     raise ValueError('first not supported for '
                                      'non-numeric data')
+                elif tiebreak == TIEBREAK_DENSE:
+                    total_tie_count += 1
+                    for z in range(j - dups + 1, j + 1):
+                        ranks[i, argsorted[i, z]] = total_tie_count
                 sum_ranks = dups = 0
 
     if axis == 0:
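Reviewer crib, not part of the patch: every ``TIEBREAK_DENSE`` branch above
computes the same thing. In pure Python the semantics are:

    def dense_rank(values):
        # rank = 1 + number of distinct values strictly smaller, so tied
        # values share a rank and consecutive groups differ by exactly 1
        mapping = dict((v, i + 1) for i, v in enumerate(sorted(set(values))))
        return [mapping[v] for v in values]

    dense_rank([1, 1, 5, 5, 3])  # -> [1, 1, 3, 3, 2]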
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 228fa1fd08a5f..6c1037f018e02 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4182,11 +4182,12 @@ def rank(self, axis=0, numeric_only=None, method='average',
             Ranks over columns (0) or rows (1)
         numeric_only : boolean, default None
             Include only float, int, boolean data
-        method : {'average', 'min', 'max', 'first'}
+        method : {'average', 'min', 'max', 'first', 'dense'}
             * average: average rank of group
             * min: lowest rank in group
             * max: highest rank in group
             * first: ranks assigned in order they appear in the array
+            * dense: like 'min', but rank always increases by 1 between groups
         na_option : {'keep', 'top', 'bottom'}
             * keep: leave NA values where they are
             * top: smallest rank if ascending
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 4a4086c4eeb0c..c16e2eff06904 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -631,34 +631,35 @@ def __hash__(self):
         raise TypeError("unhashable type: %r" % type(self).__name__)
 
     def __getitem__(self, key):
-        """Override numpy.ndarray's __getitem__ method to work as desired"""
-        arr_idx = self.view(np.ndarray)
+        """
+        Override numpy.ndarray's __getitem__ method to work as desired.
+
+        This function adds lists and Series as valid boolean indexers
+        (ndarray only supports ndarrays with dtype=bool).
+
+        If the resulting ndim != 1, a plain ndarray is returned instead of
+        the corresponding `Index` subclass.
+
+        """
+        # There's no custom logic to be implemented in __getslice__, so it's
+        # not overloaded intentionally.
+        __getitem__ = super(Index, self).__getitem__
         if np.isscalar(key):
-            return arr_idx[key]
-        else:
-            if com._is_bool_indexer(key):
-                key = np.asarray(key)
+            return __getitem__(key)
 
-            try:
-                result = arr_idx[key]
-                if result.ndim > 1:
-                    return result
-            except (IndexError):
-                if not len(key):
-                    result = []
-                else:
-                    raise
+        if isinstance(key, slice):
+            # This case is separated from the conditional above to avoid
+            # pessimization of basic indexing.
+            return __getitem__(key)
 
-            return Index(result, name=self.name)
+        if com._is_bool_indexer(key):
+            return __getitem__(np.asarray(key))
 
-    def _getitem_slice(self, key):
-        """ getitem for a bool/sliceable, fallback to standard getitem """
-        try:
-            arr_idx = self.view(np.ndarray)
-            result = arr_idx[key]
-            return self.__class__(result, name=self.name, fastpath=True)
-        except:
-            return self.__getitem__(key)
+        result = __getitem__(key)
+        if result.ndim > 1:
+            return result.view(np.ndarray)
+        else:
+            return result
 
     def append(self, other):
         """
@@ -2800,8 +2801,6 @@ def __getitem__(self, key):
 
         return result
 
-    _getitem_slice = __getitem__
-
     def take(self, indexer, axis=None):
         """
         Analogous to ndarray.take
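Reviewer sketch, not part of the patch: the behaviour the rewritten
``Index.__getitem__`` provides, as described in its new docstring:

    import numpy as np
    import pandas as pd

    idx = pd.Index(['a', 'b', 'c'], name='letters')
    idx[1:]                              # slice: Index stays an Index, name kept
    idx[[True, False, True]]             # a plain list now works as a boolean indexer
    idx[pd.Series([True, False, True])]  # ... and so does a Series of booleans
    idx[np.array([[0, 1], [1, 2]])]      # ndim > 1 -> plain ndarray, not an Index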
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 10017f89e5204..74a8ce0118d88 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -3737,7 +3737,7 @@ def get_slice(self, slobj, raise_on_error=False):
         if raise_on_error:
             _check_slice_bounds(slobj, self.index)
         return self.__class__(self._block._slice(slobj),
-                              self.index._getitem_slice(slobj), fastpath=True)
+                              self.index[slobj], fastpath=True)
 
     def set_axis(self, axis, value, maybe_rename=True, check_axis=True):
         cur_axis, value = self._set_axis(axis, value, check_axis)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 5d6115b0e4ef9..9e6c0bd9305ab 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1720,11 +1720,12 @@ def rank(self, method='average', na_option='keep', ascending=True,
 
         Parameters
         ----------
-        method : {'average', 'min', 'max', 'first'}
+        method : {'average', 'min', 'max', 'first', 'dense'}
             * average: average rank of group
             * min: lowest rank in group
             * max: highest rank in group
             * first: ranks assigned in order they appear in the array
+            * dense: like 'min', but rank always increases by 1 between groups
         na_option : {'keep'}
             keep: leave NA values where they are
         ascending : boolean, default True
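Reviewer sketch, not part of the patch: ``'dense'`` versus ``'min'`` on the
tied data used in the new tests:

    import pandas as pd

    s = pd.Series([1, 1, 5, 5, 3])
    s.rank(method='min')    # [1, 1, 4, 4, 3] -- gaps after tied groups
    s.rank(method='dense')  # [1, 1, 3, 3, 2] -- next group is always +1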
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 55bcbd76c2248..2ecdb22a5cc7b 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -23,7 +23,7 @@
 from pandas.compat import long, lrange, lmap, lzip
 from pandas import isnull
 from pandas.io.common import get_filepath_or_buffer
-
+from pandas.tslib import NaT
 
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None):
@@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
     return reader.data(convert_dates, convert_categoricals, index)
 
 
-_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"]
+_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
 
 
 def _stata_elapsed_date_to_datetime(date, fmt):
@@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
     # numpy types and numpy datetime isn't mature enough / we can't rely on
     # pandas version > 0.7.1
     #TODO: IIRC relative delta doesn't play well with np.datetime?
+    #TODO: When pandas supports more than datetime64[ns], this should be improved to use the correct range, e.g. datetime64[Y] for yearly
     if np.isnan(date):
         return np.datetime64('nat')
 
@@ -109,7 +110,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
         from warnings import warn
         warn("Encountered %tC format. Leaving in Stata Internal Format.")
         return date
-    elif fmt in ["%td", "td"]:
+    elif fmt in ["%td", "td", "%d", "d"]:
         return stata_epoch + datetime.timedelta(int(date))
     elif fmt in ["%tw", "tw"]:  # does not count leap days - 7 days is a week
         year = datetime.datetime(stata_epoch.year + date // 52, 1, 1)
@@ -150,6 +151,11 @@ def _datetime_to_stata_elapsed(date, fmt):
     if not isinstance(date, datetime.datetime):
         raise ValueError("date should be datetime.datetime format")
     stata_epoch = datetime.datetime(1960, 1, 1)
+    # Handle NaTs
+    if date is NaT:
+        # Missing value for dates ('.'), assumed always double
+        # TODO: Should be moved to a const somewhere, and consolidated
+        return struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
     if fmt in ["%tc", "tc"]:
         delta = date - stata_epoch
         return (delta.days * 86400000 + delta.seconds * 1000 +
@@ -170,6 +176,45 @@ def _datetime_to_stata_elapsed(date, fmt):
+precision_loss_doc = """
+Column converted from %s to %s, and some data are outside of the lossless
+conversion range. This may result in a loss of precision in the saved data.
+"""
+
+
+class PossiblePrecisionLoss(Warning):
+    pass
+
+
+def _cast_to_stata_types(data):
+    """Checks the dtypes of the columns of a pandas DataFrame for
+    compatibility with the data types and ranges supported by Stata, and
+    converts if necessary.
+    """
+    ws = ''
+    for col in data:
+        dtype = data[col].dtype
+        if dtype == np.int8:
+            if data[col].max() > 100 or data[col].min() < -127:
+                data[col] = data[col].astype(np.int16)
+        elif dtype == np.int16:
+            if data[col].max() > 32740 or data[col].min() < -32767:
+                data[col] = data[col].astype(np.int32)
+        elif dtype == np.int64:
+            if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
+                data[col] = data[col].astype(np.int32)
+            else:
+                data[col] = data[col].astype(np.float64)
+                if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
+                    ws = precision_loss_doc % ('int64', 'float64')
+
+    if ws:
+        import warnings
+
+        warnings.warn(ws, PossiblePrecisionLoss)
+
+    return data
+
+
 class StataMissingValue(StringMixin):
     """
     An observation's missing value.
@@ -193,14 +255,23 @@ class StataMissingValue(StringMixin):
     -----
     More information: <http://www.stata.com/help.cgi?missing>
     """
-
+    # TODO: Needs test
     def __init__(self, offset, value):
         self._value = value
-        if type(value) is int or type(value) is long:
-            self._str = value - offset is 1 and \
-                '.' or ('.' + chr(value - offset + 96))
+        value_type = type(value)
+        if value_type in (int, long):
+            loc = value - offset
+        elif value_type in (float, np.float32, np.float64):
+            if value <= np.finfo(np.float32).max:  # float32
+                conv_str, byte_loc, scale = '<f', 1, 8
+            else:
+                conv_str, byte_loc, scale = '<d', 5, 1
         if d > nmax:
+            if self._missing_values:
+                return StataMissingValue(nmax, d)
@@ -855,11 +942,12 @@ def _dtype_to_stata_type(dtype):
     See TYPE_MAP and comments for an explanation. This is also explained in
     the dta spec.
     1 - 244 are strings of this length
-    251 - chr(251) - for int8 and int16, byte
-    252 - chr(252) - for int32, int
-    253 - chr(253) - for int64, long
-    254 - chr(254) - for float32, float
-    255 - chr(255) - double, double
+                         Pandas    Stata
+    251 - chr(251) - for int8      byte
+    252 - chr(252) - for int16     int
+    253 - chr(253) - for int32     long
+    254 - chr(254) - for float32   float
+    255 - chr(255) - for double    double
 
     If there are dates to convert, then dtype will already have the correct
     type inserted.
@@ -878,8 +966,10 @@ def _dtype_to_stata_type(dtype):
     elif dtype == np.int64:
         return chr(253)
     elif dtype == np.int32:
+        return chr(253)
+    elif dtype == np.int16:
         return chr(252)
-    elif dtype == np.int8 or dtype == np.int16:
+    elif dtype == np.int8:
         return chr(251)
     else:  # pragma : no cover
         raise ValueError("Data type %s not currently understood. "
                          "Please report an error to the developers." % dtype)
@@ -970,7 +1060,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
         self._file = _open_file_binary_write(
             fname, self._encoding or self._default_encoding
         )
-        self.type_converters = {253: np.long, 252: int}
+        self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
 
     def _write(self, to_write):
         """
@@ -990,11 +1080,14 @@ def __init__(self, data):
                 self.data = data
 
             def __iter__(self):
-                for i, row in data.iterrows():
-                    yield row
+                for row in data.itertuples():
+                    # First element is index, so remove
+                    yield row[1:]
 
         if self._write_index:
             data = data.reset_index()
+        # Check columns for compatibility with Stata
+        data = _cast_to_stata_types(data)
         self.datarows = DataFrameRowIter(data)
         self.nobs, self.nvar = data.shape
         self.data = data
@@ -1181,7 +1274,7 @@ def _write_data_dates(self):
                 self._write(var)
             else:
                 if isnull(var):  # this only matters for floats
-                    var = MISSING_VALUES[typ]
+                    var = MISSING_VALUES[TYPE_MAP[typ]]
                 self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))
 
     def _null_terminate(self, s, as_string=False):
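Reviewer sketch, not part of the patch: the ``%td``/``%d`` branch above reads
dates as day counts from the Stata epoch, 1960-01-01:

    import datetime

    stata_epoch = datetime.datetime(1960, 1, 1)
    # 3653 days after the epoch (ten years, three of them leap years):
    stata_epoch + datetime.timedelta(days=3653)  # datetime(1970, 1, 1, 0, 0)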
" @@ -970,7 +1060,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._file = _open_file_binary_write( fname, self._encoding or self._default_encoding ) - self.type_converters = {253: np.long, 252: int} + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): """ @@ -990,11 +1080,14 @@ def __init__(self, data): self.data = data def __iter__(self): - for i, row in data.iterrows(): - yield row + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] if self._write_index: data = data.reset_index() + # Check columns for compatbaility with stata + data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape self.data = data @@ -1181,7 +1274,7 @@ def _write_data_dates(self): self._write(var) else: if isnull(var): # this only matters for floats - var = MISSING_VALUES[typ] + var = MISSING_VALUES[TYPE_MAP[typ]] self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) def _null_terminate(self, s, as_string=False): diff --git a/pandas/io/tests/data/stata1.dta b/pandas/io/tests/data/stata1_114.dta similarity index 100% rename from pandas/io/tests/data/stata1.dta rename to pandas/io/tests/data/stata1_114.dta diff --git a/pandas/io/tests/data/stata1_v13.dta b/pandas/io/tests/data/stata1_117.dta similarity index 100% rename from pandas/io/tests/data/stata1_v13.dta rename to pandas/io/tests/data/stata1_117.dta diff --git a/pandas/io/tests/data/stata2_113.dta b/pandas/io/tests/data/stata2_113.dta new file mode 100644 index 0000000000000..09c90dca943d1 Binary files /dev/null and b/pandas/io/tests/data/stata2_113.dta differ diff --git a/pandas/io/tests/data/stata2.dta b/pandas/io/tests/data/stata2_114.dta similarity index 100% rename from pandas/io/tests/data/stata2.dta rename to pandas/io/tests/data/stata2_114.dta diff --git a/pandas/io/tests/data/stata2_115.dta b/pandas/io/tests/data/stata2_115.dta new file mode 100644 index 0000000000000..ad7dda3fdc4b3 Binary files /dev/null and b/pandas/io/tests/data/stata2_115.dta differ diff --git a/pandas/io/tests/data/stata2_v13.dta b/pandas/io/tests/data/stata2_117.dta similarity index 100% rename from pandas/io/tests/data/stata2_v13.dta rename to pandas/io/tests/data/stata2_117.dta diff --git a/pandas/io/tests/data/stata3_113.dta b/pandas/io/tests/data/stata3_113.dta new file mode 100644 index 0000000000000..f78150a2e4326 Binary files /dev/null and b/pandas/io/tests/data/stata3_113.dta differ diff --git a/pandas/io/tests/data/stata3.dta b/pandas/io/tests/data/stata3_114.dta similarity index 100% rename from pandas/io/tests/data/stata3.dta rename to pandas/io/tests/data/stata3_114.dta diff --git a/pandas/io/tests/data/stata3_115.dta b/pandas/io/tests/data/stata3_115.dta new file mode 100644 index 0000000000000..1c4ad0dae8092 Binary files /dev/null and b/pandas/io/tests/data/stata3_115.dta differ diff --git a/pandas/io/tests/data/stata3_v13.dta b/pandas/io/tests/data/stata3_117.dta similarity index 100% rename from pandas/io/tests/data/stata3_v13.dta rename to pandas/io/tests/data/stata3_117.dta diff --git a/pandas/io/tests/data/stata4_113.dta b/pandas/io/tests/data/stata4_113.dta new file mode 100644 index 0000000000000..9d7d5abb1b921 Binary files /dev/null and b/pandas/io/tests/data/stata4_113.dta differ diff --git a/pandas/io/tests/data/stata4.dta b/pandas/io/tests/data/stata4_114.dta similarity index 100% rename from pandas/io/tests/data/stata4.dta rename to pandas/io/tests/data/stata4_114.dta diff --git 
diff --git a/pandas/io/tests/data/stata4_115.dta b/pandas/io/tests/data/stata4_115.dta
new file mode 100644
index 0000000000000..2c68cfb393b9e
Binary files /dev/null and b/pandas/io/tests/data/stata4_115.dta differ
diff --git a/pandas/io/tests/data/stata4_v13.dta b/pandas/io/tests/data/stata4_117.dta
similarity index 100%
rename from pandas/io/tests/data/stata4_v13.dta
rename to pandas/io/tests/data/stata4_117.dta
diff --git a/pandas/io/tests/data/stata5.csv b/pandas/io/tests/data/stata5.csv
new file mode 100644
index 0000000000000..218380c99593c
--- /dev/null
+++ b/pandas/io/tests/data/stata5.csv
@@ -0,0 +1,13 @@
+byte_,int_,long_,float_,double_,date_td,string_,string_1
+0,0,0,0,0,,"a","a"
+1,1,1,1,1,,"ab","b"
+-1,-1,-1,-1,-1,,"abc","c"
+100,32740,-2147483647,-1.70100000027769e+38,-2.0000000000000e+307,1970-01-01,"abcdefghijklmnop","d"
+-127,-32767,2147483620,1.70100000027769e+38,8.0000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e"
+,0,,,,2014-01-01,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f"
+0,,,,,2114-01-01,"1234567890","1"
+,,0,,,2014-12-31,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2"
+.a,.a,.a,.a,.a,2012-02-29,"!","A"
+.z,.z,.z,.z,.z,,"&","Z"
+,,,0,,,"1.23","!"
+,,,,0,,"10jan1970","."
diff --git a/pandas/io/tests/data/stata5_113.dta b/pandas/io/tests/data/stata5_113.dta
new file mode 100644
index 0000000000000..3615928d55838
Binary files /dev/null and b/pandas/io/tests/data/stata5_113.dta differ
diff --git a/pandas/io/tests/data/stata5_114.dta b/pandas/io/tests/data/stata5_114.dta
new file mode 100644
index 0000000000000..bebc6a72c5e34
Binary files /dev/null and b/pandas/io/tests/data/stata5_114.dta differ
diff --git a/pandas/io/tests/data/stata5_115.dta b/pandas/io/tests/data/stata5_115.dta
new file mode 100644
index 0000000000000..c54bd62c24dd2
Binary files /dev/null and b/pandas/io/tests/data/stata5_115.dta differ
diff --git a/pandas/io/tests/data/stata6.csv b/pandas/io/tests/data/stata6.csv
new file mode 100644
index 0000000000000..27a1dc64f530b
--- /dev/null
+++ b/pandas/io/tests/data/stata6.csv
@@ -0,0 +1,6 @@
+byte_,int_,long_,float_,double_,date_td,string_,string_1
+0,0,0,0,0,1960-01-01,"a","a"
+1,1,1,1,1,3014-12-31,"ab","b"
+-1,-1,-1,-1,-1,2014-12-31,"abc","c"
+100,32740,-2147483647,-1.7010000002777e+38,-2.000000000000e+307,1970-01-01,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","d"
+-127,-32767,2147483620,1.7010000002777e+38,8.000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e"
diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/io/tests/data/stata6_113.dta
new file mode 100644
index 0000000000000..2e4795b167f26
Binary files /dev/null and b/pandas/io/tests/data/stata6_113.dta differ
diff --git a/pandas/io/tests/data/stata6_114.dta b/pandas/io/tests/data/stata6_114.dta
new file mode 100644
index 0000000000000..aa507e474dd43
Binary files /dev/null and b/pandas/io/tests/data/stata6_114.dta differ
diff --git a/pandas/io/tests/data/stata6_115.dta b/pandas/io/tests/data/stata6_115.dta
new file mode 100644
index 0000000000000..c513463868113
Binary files /dev/null and b/pandas/io/tests/data/stata6_115.dta differ
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 1640bee7a9929..f3cc43e6e3fb0 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -27,22 +27,46 @@ def setUp(self):
         # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107) can be downloaded from:
         # http://stata-press.com/data/glmext.html
         self.dirpath = tm.get_data_path()
-        self.dta1 = os.path.join(self.dirpath, 'stata1.dta')
-        self.dta2 = os.path.join(self.dirpath, 'stata2.dta')
-        self.dta3 = os.path.join(self.dirpath, 'stata3.dta')
+        self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta')
+        self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta')
+
+        self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta')
+        self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta')
+        self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta')
+        self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta')
+
+        self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta')
+        self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta')
+        self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta')
+        self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta')
         self.csv3 = os.path.join(self.dirpath, 'stata3.csv')
-        self.dta4 = os.path.join(self.dirpath, 'stata4.dta')
+
+        self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta')
+        self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta')
+        self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta')
+        self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
+
         self.dta7 = os.path.join(self.dirpath, 'cancer.dta')
         self.csv7 = os.path.join(self.dirpath, 'cancer.csv')
+
         self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta')
+
         self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
+
         self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
         self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
+
         self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
-        self.dta1_13 = os.path.join(self.dirpath, 'stata1_v13.dta')
-        self.dta2_13 = os.path.join(self.dirpath, 'stata2_v13.dta')
-        self.dta3_13 = os.path.join(self.dirpath, 'stata3_v13.dta')
-        self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta')
+
+        self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
+        self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
+        self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
+        self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
+
+        self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
+        self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
+        self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
+        self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
 
     def read_dta(self, file):
         return read_stata(file, convert_dates=True)
@@ -51,10 +75,10 @@ def read_csv(self, file):
         return read_csv(file, parse_dates=True)
 
     def test_read_dta1(self):
-        reader = StataReader(self.dta1)
-        parsed = reader.data()
-        reader_13 = StataReader(self.dta1_13)
-        parsed_13 = reader_13.data()
+        reader_114 = StataReader(self.dta1_114)
+        parsed_114 = reader_114.data()
+        reader_117 = StataReader(self.dta1_117)
+        parsed_117 = reader_117.data()
         # Pandas uses np.nan as missing value.
         # Thus, all columns will be of type float, regardless of their name.
         expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
@@ -65,8 +89,8 @@ def test_read_dta1(self):
         # the casting doesn't fail so need to match stata here
         expected['float_miss'] = expected['float_miss'].astype(np.float32)
 
-        tm.assert_frame_equal(parsed, expected)
-        tm.assert_frame_equal(parsed_13, expected)
+        tm.assert_frame_equal(parsed_114, expected)
+        tm.assert_frame_equal(parsed_117, expected)
 
     def test_read_dta2(self):
         if LooseVersion(sys.version) < '2.7':
@@ -109,34 +133,48 @@ def test_read_dta2(self):
                 'monthly_date', 'quarterly_date', 'half_yearly_date',
                 'yearly_date']
         )
+        expected['yearly_date'] = expected['yearly_date'].astype('O')
 
         with warnings.catch_warnings(record=True) as w:
-            parsed = self.read_dta(self.dta2)
-            parsed_13 = self.read_dta(self.dta2_13)
+            parsed_114 = self.read_dta(self.dta2_114)
+            parsed_115 = self.read_dta(self.dta2_115)
+            parsed_117 = self.read_dta(self.dta2_117)
+            # 113 is buggy due to limited date format support in Stata
+            # parsed_113 = self.read_dta(self.dta2_113)
+
             np.testing.assert_equal(
                 len(w), 1)  # should get a warning for that format.
 
         # buggy test because of the NaT comparison on certain platforms
-        #
-        #tm.assert_frame_equal(parsed, expected)
-        #tm.assert_frame_equal(parsed_13, expected)
+        # Format 113 test fails since it does not support tc and tC formats
+        # tm.assert_frame_equal(parsed_113, expected)
+        tm.assert_frame_equal(parsed_114, expected)
+        tm.assert_frame_equal(parsed_115, expected)
+        tm.assert_frame_equal(parsed_117, expected)
 
     def test_read_dta3(self):
-        parsed = self.read_dta(self.dta3)
-        parsed_13 = self.read_dta(self.dta3_13)
+        parsed_113 = self.read_dta(self.dta3_113)
+        parsed_114 = self.read_dta(self.dta3_114)
+        parsed_115 = self.read_dta(self.dta3_115)
+        parsed_117 = self.read_dta(self.dta3_117)
 
         # match stata here
         expected = self.read_csv(self.csv3)
         expected = expected.astype(np.float32)
-        expected['year'] = expected['year'].astype(np.int32)
-        expected['quarter'] = expected['quarter'].astype(np.int16)
+        expected['year'] = expected['year'].astype(np.int16)
+        expected['quarter'] = expected['quarter'].astype(np.int8)
 
-        tm.assert_frame_equal(parsed, expected)
-        tm.assert_frame_equal(parsed_13, expected)
+        tm.assert_frame_equal(parsed_113, expected)
+        tm.assert_frame_equal(parsed_114, expected)
+        tm.assert_frame_equal(parsed_115, expected)
+        tm.assert_frame_equal(parsed_117, expected)
 
     def test_read_dta4(self):
-        parsed = self.read_dta(self.dta4)
-        parsed_13 = self.read_dta(self.dta4_13)
+        parsed_113 = self.read_dta(self.dta4_113)
+        parsed_114 = self.read_dta(self.dta4_114)
+        parsed_115 = self.read_dta(self.dta4_115)
+        parsed_117 = self.read_dta(self.dta4_117)
+
         expected = DataFrame.from_records(
             [
                 ["one", "ten", "one", "one", "one"],
@@ -153,11 +191,13 @@ def test_read_dta4(self):
             columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled',
                      'labeled_with_missings', 'float_labelled'])
 
-        tm.assert_frame_equal(parsed, expected)
-        tm.assert_frame_equal(parsed_13, expected)
+        tm.assert_frame_equal(parsed_113, expected)
+        tm.assert_frame_equal(parsed_114, expected)
+        tm.assert_frame_equal(parsed_115, expected)
+        tm.assert_frame_equal(parsed_117, expected)
 
     def test_read_write_dta5(self):
-        skip_if_not_little_endian()
+        # skip_if_not_little_endian()
 
         original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
                              columns=['float_miss', 'double_miss', 'byte_miss',
@@ -171,10 +211,13 @@ def test_read_write_dta5(self):
                                   original)
 
     def test_write_dta6(self):
-        skip_if_not_little_endian()
+        # skip_if_not_little_endian()
 
         original = self.read_csv(self.csv3)
         original.index.name = 'index'
+        original.index = original.index.astype(np.int32)
+        original['year'] = original['year'].astype(np.int32)
+        original['quarter'] = original['quarter'].astype(np.int32)
 
         with tm.ensure_clean() as path:
             original.to_stata(path, None, False)
@@ -201,7 +244,7 @@ def test_read_dta9(self):
         tm.assert_frame_equal(parsed, expected)
 
     def test_read_write_dta10(self):
-        skip_if_not_little_endian()
+        # skip_if_not_little_endian()
 
         original = DataFrame(data=[["string", "object", 1, 1.1,
                                     np.datetime64('2003-12-25')]],
@@ -209,6 +252,8 @@ def test_read_write_dta10(self):
                                       'datetime'])
         original["object"] = Series(original["object"], dtype=object)
         original.index.name = 'index'
+        original.index = original.index.astype(np.int32)
+        original['integer'] = original['integer'].astype(np.int32)
 
         with tm.ensure_clean() as path:
             original.to_stata(path, {'datetime': 'tc'}, False)
@@ -238,13 +283,14 @@ def test_encoding(self):
             self.assert_(isinstance(result, unicode))
 
     def test_read_write_dta11(self):
-        skip_if_not_little_endian()
+        # skip_if_not_little_endian()
 
         original = DataFrame([(1, 2, 3, 4)],
                              columns=['good', compat.u('b\u00E4d'), '8number',
                                       'astringwithmorethan32characters______'])
         formatted = DataFrame([(1, 2, 3, 4)],
                               columns=['good', 'b_d', '_8number',
                                        'astringwithmorethan32characters_'])
         formatted.index.name = 'index'
+        formatted = formatted.astype(np.int32)
 
         with tm.ensure_clean() as path:
             with warnings.catch_warnings(record=True) as w:
@@ -256,13 +302,14 @@ def test_read_write_dta11(self):
             tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
 
     def test_read_write_dta12(self):
-        skip_if_not_little_endian()
+        # skip_if_not_little_endian()
 
         original = DataFrame([(1, 2, 3, 4)],
                              columns=['astringwithmorethan32characters_1',
                                       'astringwithmorethan32characters_2',
                                       '+', '-'])
         formatted = DataFrame([(1, 2, 3, 4)],
                               columns=['astringwithmorethan32characters_',
                                        '_0astringwithmorethan32character',
                                        '_', '_1_'])
         formatted.index.name = 'index'
+        formatted = formatted.astype(np.int32)
 
         with tm.ensure_clean() as path:
             with warnings.catch_warnings(record=True) as w:
@@ -272,6 +319,63 @@ def test_read_write_dta12(self):
             written_and_read_again = self.read_dta(path)
             tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
 
+    def test_read_write_dta13(self):
+        s1 = Series(2**9, dtype=np.int16)
+        s2 = Series(2**17, dtype=np.int32)
+        s3 = Series(2**33, dtype=np.int64)
+        original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3})
+        original.index.name = 'index'
+
+        formatted = original
+        formatted['int64'] = formatted['int64'].astype(np.float64)
+
+        with tm.ensure_clean() as path:
+            original.to_stata(path)
+            written_and_read_again = self.read_dta(path)
+            tm.assert_frame_equal(written_and_read_again.set_index('index'),
+                                  formatted)
+
+    def test_read_write_reread_dta14(self):
+        expected = self.read_csv(self.csv14)
+        cols = ['byte_', 'int_', 'long_', 'float_', 'double_']
+        for col in cols:
+            expected[col] = expected[col].convert_objects(convert_numeric=True)
+        expected['float_'] = expected['float_'].astype(np.float32)
+        expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True)
+
+        parsed_113 = self.read_dta(self.dta14_113)
+        parsed_113.index.name = 'index'
+        parsed_114 = self.read_dta(self.dta14_114)
+        parsed_114.index.name = 'index'
+        parsed_115 = self.read_dta(self.dta14_115)
+        parsed_115.index.name = 'index'
+
+        tm.assert_frame_equal(parsed_114, parsed_113)
+        tm.assert_frame_equal(parsed_114, parsed_115)
+
+        with tm.ensure_clean() as path:
+            parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False)
+            written_and_read_again = self.read_dta(path)
+            tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114)
+
+    def test_read_write_reread_dta15(self):
+        expected = self.read_csv(self.csv15)
+        expected['byte_'] = expected['byte_'].astype(np.int8)
+        expected['int_'] = expected['int_'].astype(np.int16)
+        expected['long_'] = expected['long_'].astype(np.int32)
+        expected['float_'] = expected['float_'].astype(np.float32)
+        expected['double_'] = expected['double_'].astype(np.float64)
+        expected['date_td'] = expected['date_td'].apply(datetime.strptime, args=('%Y-%m-%d',))
+
+        parsed_113 = self.read_dta(self.dta15_113)
+        parsed_114 = self.read_dta(self.dta15_114)
+        parsed_115 = self.read_dta(self.dta15_115)
+
+        tm.assert_frame_equal(expected, parsed_114)
+        tm.assert_frame_equal(parsed_113, parsed_114)
+        tm.assert_frame_equal(parsed_114, parsed_115)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index e828bc100dfcf..3e578a5e36bb1 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -323,6 +323,25 @@ def test_fancy(self):
         for i in sl:
             self.assertEqual(i, sl[sl.get_loc(i)])
 
+    def test_empty_fancy(self):
+        empty_farr = np.array([], dtype=np.float_)
+        empty_iarr = np.array([], dtype=np.int_)
+        empty_barr = np.array([], dtype=np.bool_)
+
+        # pd.DatetimeIndex is excluded, because it overrides getitem and should
+        # be tested separately.
+        for idx in [self.strIndex, self.intIndex, self.floatIndex]:
+            empty_idx = idx.__class__([])
+            values = idx.values
+
+            self.assert_(idx[[]].identical(empty_idx))
+            self.assert_(idx[empty_iarr].identical(empty_idx))
+            self.assert_(idx[empty_barr].identical(empty_idx))
+
+            # np.ndarray only accepts ndarray of int & bool dtypes, so should
+            # Index.
+            self.assertRaises(IndexError, idx.__getitem__, empty_farr)
+
     def test_getitem(self):
         arr = np.array(self.dateIndex)
         exp = self.dateIndex[5]
@@ -762,6 +781,14 @@ def test_join_self(self):
             joined = res.join(res, how=kind)
             self.assertIs(res, joined)
 
+    def test_indexing_doesnt_change_class(self):
+        idx = Index([1, 2, 3, 'a', 'b', 'c'])
+
+        self.assert_(idx[1:3].identical(
+            pd.Index([2, 3], dtype=np.object_)))
+        self.assert_(idx[[0,1]].identical(
+            pd.Index([1, 2], dtype=np.object_)))
+
 
 class TestFloat64Index(tm.TestCase):
     _multiprocess_can_split_ = True
diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py
index 7e2144e801122..cb3fdcafd4056 100644
--- a/pandas/tests/test_stats.py
+++ b/pandas/tests/test_stats.py
@@ -12,7 +12,6 @@
                                  assert_almost_equal)
 import pandas.util.testing as tm
 
-
 class TestRank(tm.TestCase):
     _multiprocess_can_split_ = True
     s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
@@ -23,7 +22,8 @@ class TestRank(tm.TestCase):
                             3.5, 1.5, 8.0, nan, 5.5]),
         'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
         'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
-        'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6])
+        'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
+        'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
     }
 
     def test_rank_tie_methods(self):
@@ -43,6 +43,24 @@ def _check(s, expected, method='average'):
                 series = s if dtype is None else s.astype(dtype)
                 _check(series, results[method], method=method)
 
+    def test_rank_dense_method(self):
+        dtypes = ['O', 'f8', 'i8']
+        in_out = [([1], [1]),
+                  ([2], [1]),
+                  ([0], [1]),
+                  ([2,2], [1,1]),
+                  ([1,2,3], [1,2,3]),
+                  ([4,2,1], [3,2,1],),
+                  ([1,1,5,5,3], [1,1,3,3,2]),
+                  ([-5,-4,-3,-2,-1], [1,2,3,4,5])]
+
+        for ser, exp in in_out:
+            for dtype in dtypes:
+                s = Series(ser).astype(dtype)
+                result = s.rank(method='dense')
+                expected = Series(exp).astype(result.dtype)
+                assert_series_equal(result, expected)
+
     def test_rank_descending(self):
         dtypes = ['O', 'f8', 'i8']
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index f81634f45bdb2..c58447acec621 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -1406,8 +1406,6 @@ def __getitem__(self, key):
 
             return self._simple_new(result, self.name, new_offset, self.tz)
 
-    _getitem_slice = __getitem__
-
     # Try to run function on index first, and then on elements of index
     # Especially important for group-by functionality
     def map(self, f):
diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
index 337533ad29f4f..5fca119c14e83 100644
--- a/pandas/tseries/period.py
+++ b/pandas/tseries/period.py
@@ -1056,8 +1056,6 @@ def __getitem__(self, key):
 
             return PeriodIndex(result, name=self.name, freq=self.freq)
 
-    _getitem_slice = __getitem__
-
     def _format_with_header(self, header, **kwargs):
         return header + self._format_native_types(**kwargs)
diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py
index 8b348ddc6e6cc..2cfdffdc38541 100644
--- a/vb_suite/index_object.py
+++ b/vb_suite/index_object.py
@@ -46,3 +46,16 @@
 
 index_int64_intersection = Benchmark('left.intersection(right)', setup,
                                      start_date=datetime(2011, 1, 1))
+
+#----------------------------------------------------------------------
+# string index slicing
+setup = common_setup + """
+idx = tm.makeStringIndex(1000000)
+
+mask = np.arange(1000000) % 3 == 0
+series_mask = Series(mask)
+"""
+index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup)
+index_str_slice_indexer_even = Benchmark('idx[::2]', setup)
+index_str_boolean_indexer = Benchmark('idx[mask]', setup)
+index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup)
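Reviewer note, not part of the suite: the same operations can be timed ad hoc
with the stdlib while iterating on the indexing change, e.g.

    import timeit
    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm

    idx = tm.makeStringIndex(1000000)
    mask = np.arange(1000000) % 3 == 0
    series_mask = pd.Series(mask)

    print(timeit.timeit(lambda: idx[mask], number=100))         # boolean ndarray
    print(timeit.timeit(lambda: idx[series_mask], number=100))  # boolean Series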