Thanks to visit codestin.com
Credit goes to github.com

Skip to content

BUG: Fix argsort vs sort in Masked arrays #8678

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 7, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions doc/release/1.13.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,14 @@ All of the following functions in ``np.linalg`` now work when given input
arrays with a 0 in the last two dimensions: ``det``, ``slogdet``, ``pinv``,
``eigvals``, ``eigvalsh``, ``eig``, ``eigh``.

``argsort`` on masked arrays takes the same default arguments as ``sort``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
By default, ``argsort`` now places the masked values at the end of the sorted
array, in the same way that ``sort`` already did. Additionally, the
``endwith`` argument is added to ``argsort``, for consistency with ``sort``.
Note that this argument is not added at the end, so it breaks any code that
passed ``fill_value`` as a positional argument.

Changes
=======

Expand Down
12 changes: 12 additions & 0 deletions numpy/lib/tests/test_arraysetops.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,18 @@ def test_unique_axis(self):
result = np.array([[-0.0, 0.0]])
assert_array_equal(unique(data, axis=0), result, msg)

def test_unique_masked(self):
# issue 8664
x = np.array([64, 0, 1, 2, 3, 63, 63, 0, 0, 0, 1, 2, 0, 63, 0], dtype='uint8')
y = np.ma.masked_equal(x, 0)

v = np.unique(y)
v2, i, c = np.unique(y, return_index=True, return_counts=True)

msg = 'Unique returned different results when asked for index'
assert_array_equal(v.data, v2.data, msg)
assert_array_equal(v.mask, v2.mask, msg)

def _run_axis_tests(self, dtype):
data = np.array([[0, 1, 0, 0],
[1, 0, 0, 0],
Expand Down
132 changes: 59 additions & 73 deletions numpy/ma/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5230,7 +5230,8 @@ def round(self, decimals=0, out=None):
out.__setmask__(self._mask)
return out

def argsort(self, axis=None, kind='quicksort', order=None, fill_value=None):
def argsort(self, axis=None, kind='quicksort', order=None,
endwith=True, fill_value=None):
"""
Return an ndarray of indices that sort the array along the
specified axis. Masked values are filled beforehand to
Expand All @@ -5241,15 +5242,21 @@ def argsort(self, axis=None, kind='quicksort', order=None, fill_value=None):
axis : int, optional
Axis along which to sort. If None (the default), the flattened array
is used.
fill_value : var, optional
Value used to fill the array before sorting.
The default is the `fill_value` attribute of the input array.
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
Sorting algorithm.
order : list, optional
When `a` is an array with fields defined, this argument specifies
which fields to compare first, second, etc. Not all fields need be
specified.
endwith : {True, False}, optional
Whether missing values (if any) should be treated as the largest values
(True) or the smallest values (False).
When the array contains unmasked values at the same extremes of the
datatype, the ordering of these values and the masked values is
undefined.
fill_value : {var}, optional
Value used internally for the masked values.
If ``fill_value`` is not None, it supersedes ``endwith``.

Returns
-------
Expand All @@ -5259,7 +5266,7 @@ def argsort(self, axis=None, kind='quicksort', order=None, fill_value=None):

See Also
--------
sort : Describes sorting algorithms used.
MaskedArray.sort : Describes sorting algorithms used.
lexsort : Indirect stable sort with multiple keys.
ndarray.sort : Inplace sort.

Expand All @@ -5278,10 +5285,19 @@ def argsort(self, axis=None, kind='quicksort', order=None, fill_value=None):
array([1, 0, 2])

"""

if fill_value is None:
fill_value = default_fill_value(self)
d = self.filled(fill_value).view(ndarray)
return d.argsort(axis=axis, kind=kind, order=order)
if endwith:
# nan > inf
if np.issubdtype(self.dtype, np.floating):
fill_value = np.nan
else:
fill_value = minimum_fill_value(self)
else:
fill_value = maximum_fill_value(self)

filled = self.filled(fill_value)
return filled.argsort(axis=axis, kind=kind, order=order)

def argmin(self, axis=None, fill_value=None, out=None):
"""
Expand Down Expand Up @@ -5380,12 +5396,11 @@ def sort(self, axis=-1, kind='quicksort', order=None,
to compare first, second, and so on. This list does not need to
include all of the fields.
endwith : {True, False}, optional
Whether missing values (if any) should be forced in the upper indices
(at the end of the array) (True) or lower indices (at the beginning).
When the array contains unmasked values of the largest (or smallest if
False) representable value of the datatype the ordering of these values
and the masked values is undefined. To enforce the masked values are
at the end (beginning) in this case one must sort the mask.
Whether missing values (if any) should be treated as the largest values
(True) or the smallest values (False).
When the array contains unmasked values at the same extremes of the
datatype, the ordering of these values and the masked values is
undefined.
fill_value : {var}, optional
Value used internally for the masked values.
If ``fill_value`` is not None, it supersedes ``endwith``.
Expand Down Expand Up @@ -5429,35 +5444,22 @@ def sort(self, axis=-1, kind='quicksort', order=None,
"""
if self._mask is nomask:
ndarray.sort(self, axis=axis, kind=kind, order=order)
return

if self is masked:
return

sidx = self.argsort(axis=axis, kind=kind, order=order,
fill_value=fill_value, endwith=endwith)

# save memory for 1d arrays
if self.ndim == 1:
idx = sidx
else:
if self is masked:
return self
if fill_value is None:
if endwith:
# nan > inf
if np.issubdtype(self.dtype, np.floating):
filler = np.nan
else:
filler = minimum_fill_value(self)
else:
filler = maximum_fill_value(self)
else:
filler = fill_value
idx = list(np.ix_(*[np.arange(x) for x in self.shape]))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really like these shortcuts very much, the explicit meshgrid is easier to read imo
there isn't really an advantage to using ix_ is there?

Copy link
Member Author

@eric-wieser eric-wieser Feb 26, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this really a shortcut though? The two functions are independent, with neither calling the other. Meshgrid doesn't do what we need by default, but ix_ does.

Furthermore, ix_ is used as an indexer in its examples, whereas meshgrid is used to evaluate functions. The former use is what we want here.

idx[axis] = sidx

sidx = self.filled(filler).argsort(axis=axis, kind=kind,
order=order)
# save meshgrid memory for 1d arrays
if self.ndim == 1:
idx = sidx
else:
idx = np.meshgrid(*[np.arange(x) for x in self.shape], sparse=True,
indexing='ij')
idx[axis] = sidx
tmp_mask = self._mask[idx].flat
tmp_data = self._data[idx].flat
self._data.flat = tmp_data
self._mask.flat = tmp_mask
return
self[...] = self[idx]
Copy link
Member Author

@eric-wieser eric-wieser Feb 23, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory we could do a little better here, by implementing the in-place sort in terms of the non-in-place one. Right now (and before this patch), the latter does a redundant copy. So either way, best left for another PR, I think.


def min(self, axis=None, out=None, fill_value=None, keepdims=np._NoValue):
"""
Expand Down Expand Up @@ -6500,49 +6502,33 @@ def power(a, b, third=None):
result._data[invalid] = result.fill_value
return result


def argsort(a, axis=None, kind='quicksort', order=None, fill_value=None):
"Function version of the eponymous method."
if fill_value is None:
fill_value = default_fill_value(a)
d = filled(a, fill_value)
if axis is None:
return d.argsort(kind=kind, order=order)
return d.argsort(axis, kind=kind, order=order)
argsort.__doc__ = MaskedArray.argsort.__doc__

argmin = _frommethod('argmin')
argmax = _frommethod('argmax')

def argsort(a, axis=None, kind='quicksort', order=None, endwith=True, fill_value=None):
"Function version of the eponymous method."
a = np.asanyarray(a)

if isinstance(a, MaskedArray):
return a.argsort(axis=axis, kind=kind, order=order,
endwith=endwith, fill_value=fill_value)
else:
return a.argsort(axis=axis, kind=kind, order=order)
argsort.__doc__ = MaskedArray.argsort.__doc__

def sort(a, axis=-1, kind='quicksort', order=None, endwith=True, fill_value=None):
"Function version of the eponymous method."
a = narray(a, copy=True, subok=True)
a = np.array(a, copy=True, subok=True)
if axis is None:
a = a.flatten()
axis = 0
if fill_value is None:
if endwith:
# nan > inf
if np.issubdtype(a.dtype, np.floating):
filler = np.nan
else:
filler = minimum_fill_value(a)
else:
filler = maximum_fill_value(a)
else:
filler = fill_value

sindx = filled(a, filler).argsort(axis=axis, kind=kind, order=order)

# save meshgrid memory for 1d arrays
if a.ndim == 1:
indx = sindx
if isinstance(a, MaskedArray):
a.sort(axis=axis, kind=kind, order=order,
endwith=endwith, fill_value=fill_value)
else:
indx = np.meshgrid(*[np.arange(x) for x in a.shape], sparse=True,
indexing='ij')
indx[axis] = sindx
return a[indx]
a.sort(axis=axis, kind=kind, order=order)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a choice we need to make here. Right now (and before this patch), np.ma.sort sometimes returns an ma.array, and sometimes an ndarray.

Should we change it to always promote to ma.array, for consistency with the other functions? (Which do this as of #8665 )

return a
sort.__doc__ = MaskedArray.sort.__doc__


Expand Down
14 changes: 14 additions & 0 deletions numpy/ma/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3031,6 +3031,20 @@ def test_sort(self):
assert_equal(sortedx._data, [1, 2, -2, -1, 0])
assert_equal(sortedx._mask, [1, 1, 0, 0, 0])

def test_argsort_matches_sort(self):
x = array([1, 4, 2, 3], mask=[0, 1, 0, 0], dtype=np.uint8)

for kwargs in [dict(),
dict(endwith=True),
dict(endwith=False),
dict(fill_value=2),
dict(fill_value=2, endwith=True),
dict(fill_value=2, endwith=False)]:
sortedx = sort(x, **kwargs)
argsortedx = x[argsort(x, **kwargs)]
assert_equal(sortedx._data, argsortedx._data)
assert_equal(sortedx._mask, argsortedx._mask)

def test_sort_2d(self):
# Check sort of 2D array.
# 2D array w/o mask
Expand Down