[WIP]: Added assert_consistent_docs() and related tests #10323


Closed (wants to merge 15 commits)

20 changes: 19 additions & 1 deletion sklearn/metrics/tests/test_classification.py
@@ -23,7 +23,8 @@
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_not_equal
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import if_numpydoc, ignore_warnings
from sklearn.utils.testing import assert_consistent_docs
from sklearn.utils.mocking import MockDataFrame

from sklearn.metrics import accuracy_score
@@ -1628,3 +1629,20 @@ def test_brier_score_loss():
    # calculate even if only single class in y_true (#6980)
    assert_almost_equal(brier_score_loss([0], [0.5]), 0.25)
    assert_almost_equal(brier_score_loss([1], [0.5]), 0.25)


@if_numpydoc
def test_docstring():
    # Test for consistency among docstrings of different metrics
    assert_consistent_docs([precision_recall_fscore_support, precision_score,
                            recall_score, f1_score, fbeta_score],
                           include_returns=False,
                           exclude_params=['labels', 'average', 'beta'])
Member:
why not beta?

Member:
We want to test average for precision_score, recall_score, f1_score, fbeta_score. Can use a separate assertion, I suppose.

Contributor Author (@amanp10, Feb 27, 2018):
We have an inconsistency in parameter beta.

For precision_recall_fscore_support:

    beta : float, 1.0 by default
        The strength of recall versus precision in the F-score.

For fbeta_score:

    beta : float
        Weight of precision in harmonic mean.
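For illustration, a separate assertion along the lines suggested above could look roughly like this; it reuses the API added in this PR but is not part of the diff:

    # Check 'average' across the four metric wrappers, leaving 'beta' out
    # since its wording differs from precision_recall_fscore_support:
    assert_consistent_docs([precision_score, recall_score, f1_score,
                            fbeta_score],
                           include_params=['average'],
                           include_returns=False)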


    error_str = ("Parameter 'labels' of 'precision_score' has inconsistent "
Member:
I assume we don't want this inconsistency to exist? The docs should be fixed then.

Contributor Author:
In the precision_score description there seems to be an addition:

    .. versionchanged:: 0.17
       parameter *labels* improved for multiclass problem.

Should this be added to precision_recall_fscore_support? If yes, would this PR be appropriate?

Member:
I suppose so.

"description with that of 'precision_recall_fscore_support'.")
assert_raise_message(AssertionError, error_str, assert_consistent_docs,
[precision_recall_fscore_support, precision_score,
recall_score, f1_score, fbeta_score],
include_returns=False,
exclude_params=['average', 'beta'])
179 changes: 179 additions & 0 deletions sklearn/utils/testing.py
@@ -699,6 +699,27 @@ def run_test(*args, **kwargs):
    return run_test


def if_numpydoc(func):
    """
    Decorator to check if numpydoc is available and python version is
    atleast 3.5.
Member:
  • at least

    Meant for testing docstrings.
    """
    @wraps(func)
    def run_test(*args, **kwargs):
        try:
            import numpydoc  # noqa
            numpydoc.docscrape.NumpyDocString("Test Docstring")
            assert sys.version_info >= (3, 5)
        except (ImportError, AssertionError):
            raise SkipTest("numpydoc is required to test the docstrings, "
                           "as well as python version >= 3.5")
        else:
            return func(*args, **kwargs)

    return run_test
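For context, a minimal usage sketch of the decorator, mirroring the tests added elsewhere in this PR (the test name is made up, and the snippet assumes this branch is installed):

    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from sklearn.utils.testing import assert_consistent_docs, if_numpydoc

    @if_numpydoc
    def test_regression_metric_docs():
        # Skipped automatically when numpydoc is missing or Python < 3.5.
        assert_consistent_docs([mean_absolute_error, mean_squared_error],
                               include_params=['y_true', 'y_pred'],
                               include_returns=False)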


def clean_warning_registry():
"""Safe way to reset warnings."""
warnings.resetwarnings()
@@ -882,3 +903,161 @@ def check_docstring_parameters(func, doc=None, ignore=None, class_name=None):
            if n1 != n2:
                incorrect += [func_name + ' ' + n1 + ' != ' + n2]
    return incorrect


def _check_matching_docstrings(doc_list, type_dict, type_name, object_name,
                               include, exclude):
    """
    Check whether a docstring element (Parameter/Attribute/Return) having the
    same name as a key in ``type_dict`` also has the same type definition and
    description as that in ``type_dict``.

    If a matching key is not found in ``type_dict``, the docstring element is
    added to it, with its name as the key and a dictionary of its type
    definition and description as the value.

    """
    for name, type_definition, description in doc_list:
        if exclude is not None and name in exclude:
            pass
        elif include is not True and name not in include:
Member:
This will raise TypeError if include=False

Contributor Author:
I think we are going with include and exclude validation at the very beginning, so it won't be necessary here.

            pass
        else:
            # remove all whitespace
            type_definition = " ".join(type_definition.split())
            description = [" ".join(s.split()) for s in description]
            description = list(filter(None, description))

            if name in type_dict:
                u_dict = type_dict[name]
                msg1 = (type_name + " '" + name + "' of '" + object_name +
Member:
Using one (.format) or another (%) kind of string formatting would be clearer.

                        "' has inconsistent type definition with that of '" +
                        u_dict['object'] + "'.")
                msg2 = (type_name + " '" + name + "' of '" + object_name +
                        "' has inconsistent description with that of '" +
                        u_dict['object'] + "'.")

                assert u_dict['type_definition'] == type_definition, msg1
                assert u_dict['description'] == description, msg2
            else:
                add_dict = {'type_definition': type_definition,
                            'description': description,
                            'object': object_name}
                type_dict[name] = add_dict

    return type_dict
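Picking up the reviewer's formatting suggestion above, the message construction could be written with str.format instead of concatenation; a sketch only, not part of the diff:

    msg1 = ("{} '{}' of '{}' has inconsistent type definition with that of "
            "'{}'.".format(type_name, name, object_name, u_dict['object']))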


def assert_consistent_docs(objects,
                           include_params=True, exclude_params=None,
                           include_attribs=True, exclude_attribs=None,
                           include_returns=True, exclude_returns=None):
    """
    Checks consistency between the docstrings of ``objects``.

    Checks if types and descriptions of Parameters/Attributes/Returns are
Member:
We need to clarify the behaviour when one of the params/attribs/returns is present in one object and not another. Do we just ignore it and only compare the pairs where they are common? I think so, but this should be documented.

Contributor Author:
Yes. We compare only those with the same name, else do nothing. I will document it.

    identical across ``objects``. Raises an AssertionError otherwise.

    Parameters
    ----------
    objects : collection
        The collection (list, set, etc.) of objects, which may be either
        ``NumpyDocString`` instances or objects (classes, functions,
        descriptors) with docstrings that can be parsed by numpydoc.

    include_params : collection, False or True (default)
        Collection of Parameters to be included. True to include all
        parameters.

    exclude_params : collection or None (default)
        Collection of Parameters to be excluded. Set only if
        ``include_params`` is True.

    include_attribs : collection, False or True (default)
        Collection of Attributes to be included. True to include all
        attributes.

    exclude_attribs : collection or None (default)
        Collection of Attributes to be excluded. Set only if
        ``include_attribs`` is True.

    include_returns : collection, False or True (default)
        Collection of Returns to be included. True to include all returns.

    exclude_returns : collection or None (default)
        Collection of Returns to be excluded. Set only if ``include_returns``
        is True.

    Notes
    -----
    This function asserts that any Parameters/Returns/Attributes entries
    having the same name among the ``objects`` docstrings also have the same
    type specification and description.
    It compares only entries having the same name across docstrings; an entry
    that is unique to one docstring is not compared with anything.

    Examples
    --------
    >>> from sklearn.metrics import (mean_absolute_error, mean_squared_error,
    ...                              median_absolute_error)
    >>> from sklearn.utils.testing import assert_consistent_docs
    ... # doctest: +SKIP
    >>> assert_consistent_docs([mean_absolute_error, mean_squared_error],
    ...     include_params=['y_true', 'y_pred', 'sample_weight'],
    ...     include_attribs=False, include_returns=False)  # doctest: +SKIP
    >>> assert_consistent_docs([median_absolute_error, mean_squared_error],
    ...     include_params=True, include_attribs=False, include_returns=False)
    ... # doctest: +NORMALIZE_WHITESPACE, +SKIP
    Traceback (most recent call last):
    ...
    AssertionError: Parameter 'y_true' of 'mean_squared_error' has inconsistent
    type definition with that of 'median_absolute_error'.

    """
    if ((exclude_params and include_params is not True) or
            (exclude_attribs and include_attribs is not True) or
            (exclude_returns and include_returns is not True)):
        raise TypeError("exclude_ argument can be set only if include_"
                        " argument is True.")

    from numpydoc import docscrape

Member:
Should validate that include and exclude make sense.

Contributor Author:
It would be helpful and do no harm; I think we should add it.
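A sketch of what such extra validation might look like, purely illustrative and not part of this diff (the accepted types and error wording are assumptions):

    # Reject values that are neither booleans nor collections of names.
    for arg_name, value in [('include_params', include_params),
                            ('include_attribs', include_attribs),
                            ('include_returns', include_returns)]:
        if not (value is True or value is False or
                isinstance(value, (list, tuple, set, frozenset))):
            raise TypeError("%s must be True, False or a collection of "
                            "names, got %r." % (arg_name, value))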

    # Dictionaries of all the different Parameters/Attributes/Returns found
    param_dict = {}
    attrib_dict = {}
    return_dict = {}

    i = 1  # sequence number of the object in the collection
Member:
use enumerate(objects, 1) instead
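That is, the loop header below could instead be written as (illustrative sketch):

    for i, u in enumerate(objects, 1):
        ...  # loop body unchanged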

    for u in objects:
        if isinstance(u, docscrape.NumpyDocString):
            doc = u
            name = 'Object '+str(i)
Member:
space around +. I think we should allow the user to pass in names somehow...

Member:
Perhaps objects can be (name, NumpyDocString) pairs.

Contributor Author:
That would be appropriate for NumpyDocString objects. So now objects can be a callable (function, class, etc.) or a tuple of type (string, NumpyDocString). Am I right?

        elif (inspect.isdatadescriptor(u) or inspect.isfunction(u) or
              inspect.isclass(u)):
            doc = docscrape.NumpyDocString(inspect.getdoc(u))
            name = u.__name__
        else:
            raise TypeError("Object passed is not a Function, Class, "
                            "Descriptor or NumpyDocString.")
        i = i + 1

        # check for inconsistency in Parameters
        if include_params is not False:
            param_dict = _check_matching_docstrings(doc['Parameters'],
                                                    param_dict, 'Parameter',
                                                    name, include_params,
                                                    exclude_params)

        # check for inconsistency in Attributes
        if include_attribs is not False:
            attrib_dict = _check_matching_docstrings(doc['Attributes'],
                                                     attrib_dict, 'Attribute',
                                                     name, include_attribs,
                                                     exclude_attribs)

        # check for inconsistency in Returns
        if include_returns is not False:
            return_dict = _check_matching_docstrings(doc['Returns'],
                                                     return_dict, 'Return',
                                                     name, include_returns,
                                                     exclude_returns)

148 changes: 147 additions & 1 deletion sklearn/utils/tests/test_testing.py
@@ -1,5 +1,6 @@
import warnings
import unittest
import inspect
import sys
import numpy as np
from scipy import sparse
@@ -16,14 +17,15 @@
    assert_warns,
    assert_no_warnings,
    assert_equal,
    assert_consistent_docs,
    set_random_state,
    assert_raise_message,
    ignore_warnings,
    check_docstring_parameters,
    assert_allclose_dense_sparse,
    assert_raises_regex)

from sklearn.utils.testing import SkipTest
from sklearn.utils.testing import if_numpydoc, SkipTest
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

@@ -491,3 +493,147 @@ def test_check_docstring_parameters():
        'type definition for param: "c " (type definition was "")',
        'sklearn.utils.tests.test_testing.f_check_param_definition There was '
        'no space between the param name and colon ("d:int")'])


def func_doc1(y_true, y_pred, sample_weight):
    """Dummy function for docstring testing.

    Parameters
    ----------
    y_true : 1d array-like
        Ground truth (correct) target values.

    y_pred : 1d array-like
        Estimated targets as returned by a classifier.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    fbeta_score : float or array of float, shape = [n_unique_labels]
        F-beta score of the positive class in binary classification.

    precision : float or array of float, shape = [n_unique_labels]
        Precision values.

    recall : float or array of float, shape = [n_unique_labels]

    """
    pass
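For orientation, this is roughly what numpydoc exposes for such a docstring, which is the structure the new helpers iterate over (illustrative; exact output depends on the numpydoc version):

    from numpydoc import docscrape
    import inspect

    doc = docscrape.NumpyDocString(inspect.getdoc(func_doc1))
    # doc['Parameters'] is a list of (name, type_definition, description)
    # entries, e.g. ('y_true', '1d array-like',
    #                ['Ground truth (correct) target values.'])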


def func_doc2(y_true, y_pred, sample_weight):
    """Dummy function for docstring testing.

    Parameters
    ----------
    y_true : 1d array-like
        Ground truth (correct) target values.

    y_pred : 1d array-like
        Estimated targets as returned by a classifier.

    sample_weight : array-like of shape = [n_samples], optional

    Returns
    -------
    fbeta_score : float or array of float, shape = [n_unique_labels]
        F-beta score of the positive class in binary classification.

    precision : float or array of float
        Precision values.

    """
    pass


@if_numpydoc
def test_assert_consistent_docs():
    # Testing invalid object type
    assert_raises(TypeError, assert_consistent_docs, ["Object1", "Object2"])

    # Testing with dummy functions
    assert_consistent_docs([func_doc1, func_doc2],
                           include_params=['y_true', 'y_pred'],
                           include_returns=False)

    from numpydoc import docscrape  # using NumpyDocString object for tests

    doc1 = docscrape.NumpyDocString(inspect.getdoc(func_doc1))
    doc2 = docscrape.NumpyDocString(inspect.getdoc(func_doc2))

    # Test error messages on mismatch
    error_msg1 = ("Parameter 'sample_weight' of 'Object 2' has inconsistent"
                  " description with that of 'Object 1'.")
    assert_raise_message(AssertionError, error_msg1, assert_consistent_docs,
                         [doc1, doc2], include_params=['sample_weight'],
                         include_returns=False)  # description mismatch

    error_msg2 = ("Return 'precision' of 'Object 2' has inconsistent type"
                  " definition with that of 'Object 1'.")
    assert_raise_message(AssertionError, error_msg2, assert_consistent_docs,
                         [doc1, doc2], include_returns=['precision'],
                         include_params=False)  # type definition mismatch

    doc3 = doc1  # both doc1 and doc3 return 'recall' whereas doc2 does not
Member:
A thought inspired by this: I wonder if we should raise an error/warning if an explicitly included name is only in one of the input docstrings...

Contributor Author (@amanp10, Feb 27, 2018):
I am not sure it would be very necessary. Also, if a name is present only in a few of the objects, maybe there should be a warning for that as well.
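Purely to illustrate the idea floated above, such a warning could be sketched as follows (the helper name and wording are assumptions, not part of the PR):

    import warnings

    def _warn_if_missing(requested, documented_names, object_name, kind):
        # Warn when an explicitly requested name is absent from one
        # object's docstring.
        missing = set(requested) - set(documented_names)
        if missing:
            warnings.warn("%s(s) %r not found in the docstring of %r."
                          % (kind, sorted(missing), object_name))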

    assert_consistent_docs([doc1, doc2, doc3],
                           include_returns=['fbeta_score', 'recall'],
                           include_attribs=False, include_params=False)

    # Test for incorrect usage
    test_list1 = doc1['Parameters']
    test_list2 = doc2['Parameters']
    doc1['Parameters'] = doc2['Parameters'] = []
    doc1['Returns'] = doc2['Returns'] = []
    type_list = ['Parameters', 'Attributes', 'Returns']
    include_list = ['include_params', 'include_attribs', 'include_returns']
    exclude_list = ['exclude_params', 'exclude_attribs', 'exclude_returns']

    for typ, include, exclude in zip(type_list, include_list, exclude_list):
        doc1[typ] = test_list1
        doc2[typ] = test_list2

        # Passing lists to both include_ and exclude_ arguments
        kwargs = {include: ['sample_weight'], exclude: ['sample_weight']}
        assert_raises(TypeError, assert_consistent_docs, [doc1, doc2],
                      **kwargs)

        # Passing list to exclude_ argument when include_ is set to False
        kwargs = {include: False, exclude: ['sample_weight']}
        assert_raises(TypeError, assert_consistent_docs, [doc1, doc2],
                      **kwargs)
        doc1[typ] = doc2[typ] = []


@if_numpydoc
def test_precedence_assert_consistent_docs():
    # Test order of error reporting
    from numpydoc import docscrape

    doc1 = docscrape.NumpyDocString(inspect.getdoc(func_doc1))
    doc2 = docscrape.NumpyDocString(inspect.getdoc(func_doc2))

    doc1['Attributes'] = doc1['Returns'] = doc1['Parameters']
    doc2['Attributes'] = doc2['Returns'] = doc2['Parameters']

    # Mismatch in Parameter reported first
    error_msg = ("Parameter 'sample_weight' of 'Object 2' has inconsistent"
                 " description with that of 'Object 1'.")
    assert_raise_message(AssertionError, error_msg, assert_consistent_docs,
                         [doc1, doc2], include_params=True,
                         include_returns=True, include_attribs=True)

    # Mismatch in Attribute reported second
    error_msg = ("Attribute 'sample_weight' of 'Object 2' has inconsistent"
                 " description with that of 'Object 1'.")
    assert_raise_message(AssertionError, error_msg, assert_consistent_docs,
                         [doc1, doc2], include_params=False,
                         include_returns=True, include_attribs=True)

    # Mismatch in Returns reported last
    error_msg = ("Return 'sample_weight' of 'Object 2' has inconsistent"
                 " description with that of 'Object 1'.")
    assert_raise_message(AssertionError, error_msg, assert_consistent_docs,
                         [doc1, doc2], include_params=False,
                         include_returns=True, include_attribs=False)