pruned venvs
This commit is contained in:
@@ -1,380 +0,0 @@
|
||||
"""
|
||||
==========================================
|
||||
Statistical functions (:mod:`scipy.stats`)
|
||||
==========================================
|
||||
|
||||
.. module:: scipy.stats
|
||||
|
||||
This module contains a large number of probability distributions as
|
||||
well as a growing library of statistical functions.
|
||||
|
||||
Each univariate distribution is an instance of a subclass of `rv_continuous`
|
||||
(`rv_discrete` for discrete distributions):
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rv_continuous
|
||||
rv_discrete
|
||||
rv_histogram
|
||||
|
||||
Continuous distributions
|
||||
========================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
alpha -- Alpha
|
||||
anglit -- Anglit
|
||||
arcsine -- Arcsine
|
||||
argus -- Argus
|
||||
beta -- Beta
|
||||
betaprime -- Beta Prime
|
||||
bradford -- Bradford
|
||||
burr -- Burr (Type III)
|
||||
burr12 -- Burr (Type XII)
|
||||
cauchy -- Cauchy
|
||||
chi -- Chi
|
||||
chi2 -- Chi-squared
|
||||
cosine -- Cosine
|
||||
crystalball -- Crystalball
|
||||
dgamma -- Double Gamma
|
||||
dweibull -- Double Weibull
|
||||
erlang -- Erlang
|
||||
expon -- Exponential
|
||||
exponnorm -- Exponentially Modified Normal
|
||||
exponweib -- Exponentiated Weibull
|
||||
exponpow -- Exponential Power
|
||||
f -- F (Snedecor F)
|
||||
fatiguelife -- Fatigue Life (Birnbaum-Saunders)
|
||||
fisk -- Fisk
|
||||
foldcauchy -- Folded Cauchy
|
||||
foldnorm -- Folded Normal
|
||||
frechet_r -- Deprecated. Alias for weibull_min
|
||||
frechet_l -- Deprecated. Alias for weibull_max
|
||||
genlogistic -- Generalized Logistic
|
||||
gennorm -- Generalized normal
|
||||
genpareto -- Generalized Pareto
|
||||
genexpon -- Generalized Exponential
|
||||
genextreme -- Generalized Extreme Value
|
||||
gausshyper -- Gauss Hypergeometric
|
||||
gamma -- Gamma
|
||||
gengamma -- Generalized gamma
|
||||
genhalflogistic -- Generalized Half Logistic
|
||||
gilbrat -- Gibrat
|
||||
gompertz -- Gompertz (Truncated Gumbel)
|
||||
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
|
||||
gumbel_l -- Left Sided Gumbel, etc.
|
||||
halfcauchy -- Half Cauchy
|
||||
halflogistic -- Half Logistic
|
||||
halfnorm -- Half Normal
|
||||
halfgennorm -- Generalized Half Normal
|
||||
hypsecant -- Hyperbolic Secant
|
||||
invgamma -- Inverse Gamma
|
||||
invgauss -- Inverse Gaussian
|
||||
invweibull -- Inverse Weibull
|
||||
johnsonsb -- Johnson SB
|
||||
johnsonsu -- Johnson SU
|
||||
kappa4 -- Kappa 4 parameter
|
||||
kappa3 -- Kappa 3 parameter
|
||||
ksone -- Kolmogorov-Smirnov one-sided (no stats)
|
||||
kstwobign -- Kolmogorov-Smirnov two-sided test for Large N (no stats)
|
||||
laplace -- Laplace
|
||||
levy -- Levy
|
||||
levy_l
|
||||
levy_stable
|
||||
logistic -- Logistic
|
||||
loggamma -- Log-Gamma
|
||||
loglaplace -- Log-Laplace (Log Double Exponential)
|
||||
lognorm -- Log-Normal
|
||||
lomax -- Lomax (Pareto of the second kind)
|
||||
maxwell -- Maxwell
|
||||
mielke -- Mielke's Beta-Kappa
|
||||
moyal -- Moyal
|
||||
nakagami -- Nakagami
|
||||
ncx2 -- Non-central chi-squared
|
||||
ncf -- Non-central F
|
||||
nct -- Non-central Student's T
|
||||
norm -- Normal (Gaussian)
|
||||
norminvgauss -- Normal Inverse Gaussian
|
||||
pareto -- Pareto
|
||||
pearson3 -- Pearson type III
|
||||
powerlaw -- Power-function
|
||||
powerlognorm -- Power log normal
|
||||
powernorm -- Power normal
|
||||
rdist -- R-distribution
|
||||
reciprocal -- Reciprocal
|
||||
rayleigh -- Rayleigh
|
||||
rice -- Rice
|
||||
recipinvgauss -- Reciprocal Inverse Gaussian
|
||||
semicircular -- Semicircular
|
||||
skewnorm -- Skew normal
|
||||
t -- Student's T
|
||||
trapz -- Trapezoidal
|
||||
triang -- Triangular
|
||||
truncexpon -- Truncated Exponential
|
||||
truncnorm -- Truncated Normal
|
||||
tukeylambda -- Tukey-Lambda
|
||||
uniform -- Uniform
|
||||
vonmises -- Von-Mises (Circular)
|
||||
vonmises_line -- Von-Mises (Line)
|
||||
wald -- Wald
|
||||
weibull_min -- Minimum Weibull (see Frechet)
|
||||
weibull_max -- Maximum Weibull (see Frechet)
|
||||
wrapcauchy -- Wrapped Cauchy
|
||||
|
||||
Multivariate distributions
|
||||
==========================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
multivariate_normal -- Multivariate normal distribution
|
||||
matrix_normal -- Matrix normal distribution
|
||||
dirichlet -- Dirichlet
|
||||
wishart -- Wishart
|
||||
invwishart -- Inverse Wishart
|
||||
multinomial -- Multinomial distribution
|
||||
special_ortho_group -- SO(N) group
|
||||
ortho_group -- O(N) group
|
||||
unitary_group -- U(N) group
|
||||
random_correlation -- random correlation matrices
|
||||
|
||||
Discrete distributions
|
||||
======================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
bernoulli -- Bernoulli
|
||||
binom -- Binomial
|
||||
boltzmann -- Boltzmann (Truncated Discrete Exponential)
|
||||
dlaplace -- Discrete Laplacian
|
||||
geom -- Geometric
|
||||
hypergeom -- Hypergeometric
|
||||
logser -- Logarithmic (Log-Series, Series)
|
||||
nbinom -- Negative Binomial
|
||||
planck -- Planck (Discrete Exponential)
|
||||
poisson -- Poisson
|
||||
randint -- Discrete Uniform
|
||||
skellam -- Skellam
|
||||
zipf -- Zipf
|
||||
yulesimon -- Yule-Simon
|
||||
|
||||
An overview of statistical functions is given below.
|
||||
Several of these functions have a similar version in
|
||||
`scipy.stats.mstats` that works with masked arrays.
|
||||
|
||||
Summary statistics
|
||||
==================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
describe -- Descriptive statistics
|
||||
gmean -- Geometric mean
|
||||
hmean -- Harmonic mean
|
||||
kurtosis -- Fisher or Pearson kurtosis
|
||||
mode -- Modal value
|
||||
moment -- Central moment
|
||||
skew -- Skewness
|
||||
kstat --
|
||||
kstatvar --
|
||||
tmean -- Truncated arithmetic mean
|
||||
tvar -- Truncated variance
|
||||
tmin --
|
||||
tmax --
|
||||
tstd --
|
||||
tsem --
|
||||
variation -- Coefficient of variation
|
||||
find_repeats
|
||||
trim_mean
|
||||
iqr
|
||||
sem
|
||||
bayes_mvs
|
||||
mvsdist
|
||||
entropy
|
||||
|
||||
Frequency statistics
|
||||
====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
cumfreq
|
||||
itemfreq
|
||||
percentileofscore
|
||||
scoreatpercentile
|
||||
relfreq
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
binned_statistic -- Compute a binned statistic for a set of data.
|
||||
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
|
||||
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
|
||||
|
||||
Correlation functions
|
||||
=====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
f_oneway
|
||||
pearsonr
|
||||
spearmanr
|
||||
pointbiserialr
|
||||
kendalltau
|
||||
weightedtau
|
||||
linregress
|
||||
siegelslopes
|
||||
theilslopes
|
||||
|
||||
Statistical tests
|
||||
=================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_1samp
|
||||
ttest_ind
|
||||
ttest_ind_from_stats
|
||||
ttest_rel
|
||||
kstest
|
||||
chisquare
|
||||
power_divergence
|
||||
ks_2samp
|
||||
mannwhitneyu
|
||||
tiecorrect
|
||||
rankdata
|
||||
ranksums
|
||||
wilcoxon
|
||||
kruskal
|
||||
friedmanchisquare
|
||||
brunnermunzel
|
||||
combine_pvalues
|
||||
jarque_bera
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ansari
|
||||
bartlett
|
||||
levene
|
||||
shapiro
|
||||
anderson
|
||||
anderson_ksamp
|
||||
binom_test
|
||||
fligner
|
||||
median_test
|
||||
mood
|
||||
skewtest
|
||||
kurtosistest
|
||||
normaltest
|
||||
|
||||
Transformations
|
||||
===============
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
boxcox
|
||||
boxcox_normmax
|
||||
boxcox_llf
|
||||
yeojohnson
|
||||
yeojohnson_normmax
|
||||
yeojohnson_llf
|
||||
obrientransform
|
||||
sigmaclip
|
||||
trimboth
|
||||
trim1
|
||||
zmap
|
||||
zscore
|
||||
|
||||
Statistical distances
|
||||
=====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
wasserstein_distance
|
||||
energy_distance
|
||||
|
||||
Random variate generation
|
||||
=========================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rvs_ratio_uniforms
|
||||
|
||||
Circular statistical functions
|
||||
==============================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
circmean
|
||||
circvar
|
||||
circstd
|
||||
|
||||
Contingency table functions
|
||||
===========================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chi2_contingency
|
||||
contingency.expected_freq
|
||||
contingency.margins
|
||||
fisher_exact
|
||||
|
||||
Plot-tests
|
||||
==========
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ppcc_max
|
||||
ppcc_plot
|
||||
probplot
|
||||
boxcox_normplot
|
||||
yeojohnson_normplot
|
||||
|
||||
|
||||
Masked statistics functions
|
||||
===========================
|
||||
|
||||
.. toctree::
|
||||
|
||||
stats.mstats
|
||||
|
||||
|
||||
Univariate and multivariate kernel density estimation (:mod:`scipy.stats.kde`)
|
||||
==============================================================================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
gaussian_kde
|
||||
|
||||
For many more statistics-related functions, install the software R and the
|
||||
interface package rpy.
|
||||
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from .stats import *
|
||||
from .distributions import *
|
||||
from .morestats import *
|
||||
from ._binned_statistic import *
|
||||
from .kde import gaussian_kde
|
||||
from . import mstats
|
||||
from .contingency import chi2_contingency
|
||||
from ._multivariate import *
|
||||
|
||||
# Public API: every name present in the module namespace at this point that
# does not start with an underscore (drops private names as well as dunders).
__all__ = [s for s in dir() if not s.startswith("_")]
|
||||
|
||||
# Bound after __all__ has been computed, so the lines below do not add
# `PytestTester` or `test` to __all__.
from scipy._lib._testutils import PytestTester
|
||||
test = PytestTester(__name__)
|
||||
# Drop the helper class name from the module namespace; `test` stays bound.
del PytestTester
|
||||
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
@@ -1,619 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from scipy._lib.six import callable, xrange
|
||||
from scipy._lib._numpy_compat import suppress_warnings
|
||||
from collections import namedtuple
|
||||
|
||||
__all__ = ['binned_statistic',
|
||||
'binned_statistic_2d',
|
||||
'binned_statistic_dd']
|
||||
|
||||
|
||||
# Result container returned by `binned_statistic`: the per-bin statistic, the
# bin edges, and the bin index assigned to each input point.
BinnedStatisticResult = namedtuple(
    'BinnedStatisticResult',
    ['statistic', 'bin_edges', 'binnumber'],
)
|
||||
|
||||
|
||||
def binned_statistic(x, values, statistic='mean',
                     bins=10, range=None):
    """
    Compute a binned statistic for one or more sets of data.

    A histogram partitions the data space into bins and reports how many
    points land in each bin.  This function generalizes that idea: instead
    of only counting, it can compute the sum, mean, median, or another
    statistic of the values (or of each set of values) inside each bin.

    Parameters
    ----------
    x : (N,) array_like
        A sequence of values to be binned.
    values : (N,) array_like or list of (N,) array_like
        The data on which the statistic will be computed.  This must be
        the same shape as `x`, or a set of sequences - each the same shape
        as `x`.  If `values` is a set of sequences, the statistic will be
        computed on each independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'median' : compute the median of values for points within each
            bin.  Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * 'min' : compute the minimum of values for points within each
            bin.  Empty bins will be represented by NaN.
          * 'max' : compute the maximum of values for points within each
            bin.  Empty bins will be represented by NaN.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic.  This
            function will be called on the values in each bin.  Empty bins
            will be represented by function([]), or NaN if this returns an
            error.

    bins : int or sequence of scalars, optional
        If `bins` is an int, it defines the number of equal-width bins in
        the given range (10 by default).  If `bins` is a sequence, it
        defines the bin edges, including the rightmost edge, allowing for
        non-uniform bin widths.  Values in `x` that are smaller than lowest
        bin edge are assigned to bin number 0, values beyond the highest bin
        are assigned to ``bins[-1]``.  If the bin edges are specified, the
        number of bins will be, (nx = len(bins)-1).
    range : (float, float) or [(float, float)], optional
        The lower and upper range of the bins.  If not provided, range
        is simply ``(x.min(), x.max())``.  Values outside the range are
        ignored.

    Returns
    -------
    statistic : array
        The values of the selected statistic in each bin.
    bin_edges : array of dtype float
        Return the bin edges ``(length(statistic)+1)``.
    binnumber : 1-D ndarray of ints
        Indices of the bins (corresponding to `bin_edges`) in which each
        value of `x` belongs.  Same length as `values`.  A binnumber of `i`
        means the corresponding value is between
        (bin_edges[i-1], bin_edges[i]).

    See Also
    --------
    numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd

    Notes
    -----
    All but the last (righthand-most) bin is half-open.  In other words, if
    `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including
    1, but excluding 2) and the second ``[2, 3)``.  The last bin, however,
    is ``[3, 4]``, which *includes* 4.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    First some basic examples:

    Create two evenly spaced bins in the range of the given sample, and sum
    the corresponding values in each of those bins:

    >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
    >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
    (array([ 4. ,  4.5]), array([ 1.,  4.,  7.]), array([1, 1, 1, 2, 2]))

    Multiple arrays of values can also be passed.  The statistic is
    calculated on each set independently:

    >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
    >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
    (array([[ 4. ,  4.5], [ 8. ,  9. ]]), array([ 1.,  4.,  7.]),
        array([1, 1, 1, 2, 2]))

    >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
    ...                        bins=3)
    (array([ 1.,  2.,  4.]), array([ 1.,  2.,  3.,  4.]),
        array([1, 2, 1, 2, 3]))

    As a second example, we now generate some random data of sailing boat
    speed as a function of wind speed, and then determine how fast our boat
    is for certain wind speeds:

    >>> windspeed = 8 * np.random.rand(500)
    >>> boatspeed = .3 * windspeed**.5 + .2 * np.random.rand(500)
    >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
    ...                 boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
    >>> plt.figure()
    >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
    >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
    ...            label='binned statistic of data')
    >>> plt.legend()

    Now we can use ``binnumber`` to select all datapoints with a windspeed
    below 1:

    >>> low_boatspeed = boatspeed[binnumber == 0]

    As a final example, we will use ``bin_edges`` and ``binnumber`` to make
    a plot of a distribution that shows the mean and distribution around
    that mean per bin, on top of a regular histogram and the probability
    distribution function:

    >>> x = np.linspace(0, 5, num=500)
    >>> x_pdf = stats.maxwell.pdf(x)
    >>> samples = stats.maxwell.rvs(size=10000)

    >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
    ...         statistic='mean', bins=25)
    >>> bin_width = (bin_edges[1] - bin_edges[0])
    >>> bin_centers = bin_edges[1:] - bin_width/2

    >>> plt.figure()
    >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
    ...          alpha=0.2, label='histogram of data')
    >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
    >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
    ...            label='binned statistic of data')
    >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
    >>> plt.legend(fontsize=10)
    >>> plt.show()

    """
    # A scalar `bins` has no len(); count it as a single entry.
    try:
        n_entries = len(bins)
    except TypeError:
        n_entries = 1

    # Anything other than a single entry is taken as one explicit sequence of
    # edges and wrapped so the N-dimensional routine sees it as dimension 0.
    if n_entries != 1:
        bins = [np.asarray(bins, float)]

    # A bare (low, high) pair is wrapped into the per-dimension list form.
    if range is not None and len(range) == 2:
        range = [range]

    stat, edge_list, binnumbers = binned_statistic_dd(
        [x], values, statistic, bins, range)

    return BinnedStatisticResult(stat, edge_list[0], binnumbers)
|
||||
|
||||
|
||||
# Result container returned by `binned_statistic_2d`: the per-bin statistic,
# the edges along each of the two dimensions, and the bin index per point.
BinnedStatistic2dResult = namedtuple(
    'BinnedStatistic2dResult',
    ['statistic', 'x_edge', 'y_edge', 'binnumber'],
)
|
||||
|
||||
|
||||
def binned_statistic_2d(x, y, values, statistic='mean',
                        bins=10, range=None, expand_binnumbers=False):
    """
    Compute a bidimensional binned statistic for one or more sets of data.

    A two-dimensional histogram partitions the plane into bins and reports
    how many points land in each bin.  This function generalizes that idea:
    instead of only counting, it can compute the sum, mean, median, or
    another statistic of the values (or of each set of values) inside each
    bin.

    Parameters
    ----------
    x : (N,) array_like
        A sequence of values to be binned along the first dimension.
    y : (N,) array_like
        A sequence of values to be binned along the second dimension.
    values : (N,) array_like or list of (N,) array_like
        The data on which the statistic will be computed.  This must be
        the same shape as `x`, or a list of sequences - each with the same
        shape as `x`.  If `values` is such a list, the statistic will be
        computed on each independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

          * 'mean' : compute the mean of values for points within each bin.
            Empty bins will be represented by NaN.
          * 'median' : compute the median of values for points within each
            bin.  Empty bins will be represented by NaN.
          * 'count' : compute the count of points within each bin.  This is
            identical to an unweighted histogram.  `values` array is not
            referenced.
          * 'sum' : compute the sum of values for points within each bin.
            This is identical to a weighted histogram.
          * 'min' : compute the minimum of values for points within each
            bin.  Empty bins will be represented by NaN.
          * 'max' : compute the maximum of values for points within each
            bin.  Empty bins will be represented by NaN.
          * function : a user-defined function which takes a 1D array of
            values, and outputs a single numerical statistic.  This
            function will be called on the values in each bin.  Empty bins
            will be represented by function([]), or NaN if this returns an
            error.

    bins : int or [int, int] or array_like or [array, array], optional
        The bin specification:

          * the number of bins for the two dimensions (nx = ny = bins),
          * the number of bins in each dimension (nx, ny = bins),
          * the bin edges for the two dimensions (x_edge = y_edge = bins),
          * the bin edges in each dimension (x_edge, y_edge = bins).

        If the bin edges are specified, the number of bins will be,
        (nx = len(x_edge)-1, ny = len(y_edge)-1).

    range : (2,2) array_like, optional
        The leftmost and rightmost edges of the bins along each dimension
        (if not specified explicitly in the `bins` parameters):
        [[xmin, xmax], [ymin, ymax]].  All values outside of this range will
        be considered outliers and not tallied in the histogram.
    expand_binnumbers : bool, optional
        'False' (default): the returned `binnumber` is a shape (N,) array of
        linearized bin indices.
        'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
        ndarray, where each row gives the bin numbers in the corresponding
        dimension.
        See the `binnumber` returned value, and the `Examples` section.

        .. versionadded:: 0.17.0

    Returns
    -------
    statistic : (nx, ny) ndarray
        The values of the selected statistic in each two-dimensional bin.
    x_edge : (nx + 1) ndarray
        The bin edges along the first dimension.
    y_edge : (ny + 1) ndarray
        The bin edges along the second dimension.
    binnumber : (N,) array of ints or (2,N) ndarray of ints
        This assigns to each input point an integer that represents the
        bin in which this observation falls.  The representation depends on
        the `expand_binnumbers` argument.  See `Notes` for details.

    See Also
    --------
    numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd

    Notes
    -----
    Binedges:
    All but the last (righthand-most) bin is half-open.  In other words, if
    `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including
    1, but excluding 2) and the second ``[2, 3)``.  The last bin, however,
    is ``[3, 4]``, which *includes* 4.

    `binnumber`:
    This returned argument assigns to each input point an integer that
    represents the bin in which it belongs.  The representation depends on
    the `expand_binnumbers` argument.  If 'False' (default): The returned
    `binnumber` is a shape (N,) array of linearized indices mapping each
    point to its corresponding bin (using row-major ordering).
    If 'True': The returned `binnumber` is a shape (2,N) ndarray where
    each row indicates bin placements for each dimension respectively.  In
    each dimension, a binnumber of `i` means the corresponding value is
    between (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> from scipy import stats

    Calculate the counts with explicit bin-edges:

    >>> x = [0.1, 0.1, 0.1, 0.6]
    >>> y = [2.1, 2.6, 2.1, 2.1]
    >>> binx = [0.0, 0.5, 1.0]
    >>> biny = [2.0, 2.5, 3.0]
    >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx,biny])
    >>> ret.statistic
    array([[ 2.,  1.],
           [ 1.,  0.]])

    The bin in which each sample is placed is given by the `binnumber`
    returned parameter.  By default, these are the linearized bin indices:

    >>> ret.binnumber
    array([5, 6, 5, 9])

    The bin indices can also be expanded into separate entries for each
    dimension using the `expand_binnumbers` parameter:

    >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx,biny],
    ...                                 expand_binnumbers=True)
    >>> ret.binnumber
    array([[1, 1, 1, 2],
           [1, 2, 1, 1]])

    Which shows that the first three elements belong in the xbin 1, and the
    fourth into xbin 2; and so on for y.

    """
    # The bin handling below follows np.histogram2d.
    # A scalar `bins` has no len(); count it as a single entry.
    try:
        n_entries = len(bins)
    except TypeError:
        n_entries = 1

    # Neither a single entry nor a per-dimension pair: treat `bins` as one
    # edge sequence shared by both dimensions.
    if n_entries not in (1, 2):
        shared_edges = np.asarray(bins, float)
        bins = [shared_edges, shared_edges]

    stat, edge_list, binnumbers = binned_statistic_dd(
        [x, y], values, statistic, bins, range,
        expand_binnumbers=expand_binnumbers)

    x_edge, y_edge = edge_list[0], edge_list[1]
    return BinnedStatistic2dResult(stat, x_edge, y_edge, binnumbers)
|
||||
|
||||
|
||||
# Result container returned by `binned_statistic_dd`: the per-bin statistic,
# the bin edges, and the bin index assigned to each input point.
BinnedStatisticddResult = namedtuple(
    'BinnedStatisticddResult',
    ['statistic', 'bin_edges', 'binnumber'],
)
|
||||
|
||||
|
||||
def binned_statistic_dd(sample, values, statistic='mean',
|
||||
bins=10, range=None, expand_binnumbers=False):
|
||||
"""
|
||||
Compute a multidimensional binned statistic for a set of data.
|
||||
|
||||
This is a generalization of a histogramdd function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values within each bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sample : array_like
|
||||
Data to histogram passed as a sequence of D arrays of length N, or
|
||||
as an (N,D) array.
|
||||
values : (N,) array_like or list of (N,) array_like
|
||||
The data on which the statistic will be computed. This must be
|
||||
the same shape as `sample`, or a list of sequences - each with the
|
||||
same shape as `sample`. If `values` is such a list, the statistic
|
||||
will be computed on each independently.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* 'min' : compute the minimum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'max' : compute the maximum of values for point within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : sequence or int, optional
|
||||
The bin specification must be in one of the following forms:
|
||||
|
||||
* A sequence of arrays describing the bin edges along each dimension.
|
||||
* The number of bins for each dimension (nx, ny, ... = bins).
|
||||
* The number of bins for all dimensions (nx = ny = ... = bins).
|
||||
|
||||
range : sequence, optional
|
||||
A sequence of lower and upper bin edges to be used if the edges are
|
||||
not given explicitly in `bins`. Defaults to the minimum and maximum
|
||||
values along each dimension.
|
||||
expand_binnumbers : bool, optional
|
||||
'False' (default): the returned `binnumber` is a shape (N,) array of
|
||||
linearized bin indices.
|
||||
'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
|
||||
ndarray, where each row gives the bin numbers in the corresponding
|
||||
dimension.
|
||||
See the `binnumber` returned value, and the `Examples` section of
|
||||
`binned_statistic_2d`.
|
||||
|
||||
.. versionadded:: 0.17.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : ndarray, shape(nx1, nx2, nx3,...)
|
||||
The values of the selected statistic in each two-dimensional bin.
|
||||
bin_edges : list of ndarrays
|
||||
A list of D arrays describing the (nxi + 1) bin edges for each
|
||||
dimension.
|
||||
binnumber : (N,) array of ints or (D,N) ndarray of ints
|
||||
This assigns to each element of `sample` an integer that represents the
|
||||
bin in which this observation falls. The representation depends on the
|
||||
`expand_binnumbers` argument. See `Notes` for details.
|
||||
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
|
||||
|
||||
Notes
|
||||
-----
|
||||
Binedges:
|
||||
All but the last (righthand-most) bin is half-open in each dimension. In
|
||||
other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
|
||||
``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
|
||||
last bin, however, is ``[3, 4]``, which *includes* 4.
|
||||
|
||||
`binnumber`:
|
||||
This returned argument assigns to each element of `sample` an integer that
|
||||
represents the bin in which it belongs. The representation depends on the
|
||||
`expand_binnumbers` argument. If 'False' (default): The returned
|
||||
`binnumber` is a shape (N,) array of linearized indices mapping each
|
||||
element of `sample` to its corresponding bin (using row-major ordering).
|
||||
If 'True': The returned `binnumber` is a shape (D,N) ndarray where
|
||||
each row indicates bin placements for each dimension respectively. In each
|
||||
dimension, a binnumber of `i` means the corresponding value is between
|
||||
(bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
"""
|
||||
known_stats = ['mean', 'median', 'count', 'sum', 'std','min','max']
|
||||
if not callable(statistic) and statistic not in known_stats:
|
||||
raise ValueError('invalid statistic %r' % (statistic,))
|
||||
|
||||
# `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
|
||||
# `Dlen` is the length of elements along each dimension.
|
||||
# This code is based on np.histogramdd
|
||||
try:
|
||||
# `sample` is an ND-array.
|
||||
Dlen, Ndim = sample.shape
|
||||
except (AttributeError, ValueError):
|
||||
# `sample` is a sequence of 1D arrays.
|
||||
sample = np.atleast_2d(sample).T
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
# Store initial shape of `values` to preserve it in the output
|
||||
values = np.asarray(values)
|
||||
input_shape = list(values.shape)
|
||||
# Make sure that `values` is 2D to iterate over rows
|
||||
values = np.atleast_2d(values)
|
||||
Vdim, Vlen = values.shape
|
||||
|
||||
# Make sure `values` match `sample`
|
||||
if(statistic != 'count' and Vlen != Dlen):
|
||||
raise AttributeError('The number of `values` elements must match the '
|
||||
'length of each `sample` dimension.')
|
||||
|
||||
nbin = np.empty(Ndim, int) # Number of bins in each dimension
|
||||
edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
|
||||
dedges = Ndim * [None] # Spacing between edges (will be 2D array)
|
||||
|
||||
try:
|
||||
M = len(bins)
|
||||
if M != Ndim:
|
||||
raise AttributeError('The dimension of bins must be equal '
|
||||
'to the dimension of the sample x.')
|
||||
except TypeError:
|
||||
bins = Ndim * [bins]
|
||||
|
||||
# Select range for each dimension
|
||||
# Used only if number of bins is given.
|
||||
if range is None:
|
||||
smin = np.atleast_1d(np.array(sample.min(axis=0), float))
|
||||
smax = np.atleast_1d(np.array(sample.max(axis=0), float))
|
||||
else:
|
||||
smin = np.zeros(Ndim)
|
||||
smax = np.zeros(Ndim)
|
||||
for i in xrange(Ndim):
|
||||
smin[i], smax[i] = range[i]
|
||||
|
||||
# Make sure the bins have a finite width.
|
||||
for i in xrange(len(smin)):
|
||||
if smin[i] == smax[i]:
|
||||
smin[i] = smin[i] - .5
|
||||
smax[i] = smax[i] + .5
|
||||
|
||||
# Create edge arrays
|
||||
for i in xrange(Ndim):
|
||||
if np.isscalar(bins[i]):
|
||||
nbin[i] = bins[i] + 2 # +2 for outlier bins
|
||||
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1)
|
||||
else:
|
||||
edges[i] = np.asarray(bins[i], float)
|
||||
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
|
||||
dedges[i] = np.diff(edges[i])
|
||||
|
||||
nbin = np.asarray(nbin)
|
||||
|
||||
# Compute the bin number each sample falls into, in each dimension
|
||||
sampBin = [
|
||||
np.digitize(sample[:, i], edges[i])
|
||||
for i in xrange(Ndim)
|
||||
]
|
||||
|
||||
# Using `digitize`, values that fall on an edge are put in the right bin.
|
||||
# For the rightmost bin, we want values equal to the right
|
||||
# edge to be counted in the last bin, and not as an outlier.
|
||||
for i in xrange(Ndim):
|
||||
# Find the rounding precision
|
||||
decimal = int(-np.log10(dedges[i].min())) + 6
|
||||
# Find which points are on the rightmost edge.
|
||||
on_edge = np.where(np.around(sample[:, i], decimal) ==
|
||||
np.around(edges[i][-1], decimal))[0]
|
||||
# Shift these points one bin to the left.
|
||||
sampBin[i][on_edge] -= 1
|
||||
|
||||
# Compute the sample indices in the flattened statistic matrix.
|
||||
binnumbers = np.ravel_multi_index(sampBin, nbin)
|
||||
|
||||
result = np.empty([Vdim, nbin.prod()], float)
|
||||
|
||||
if statistic == 'mean':
|
||||
result.fill(np.nan)
|
||||
flatcount = np.bincount(binnumbers, None)
|
||||
a = flatcount.nonzero()
|
||||
for vv in xrange(Vdim):
|
||||
flatsum = np.bincount(binnumbers, values[vv])
|
||||
result[vv, a] = flatsum[a] / flatcount[a]
|
||||
elif statistic == 'std':
|
||||
result.fill(0)
|
||||
flatcount = np.bincount(binnumbers, None)
|
||||
a = flatcount.nonzero()
|
||||
for vv in xrange(Vdim):
|
||||
flatsum = np.bincount(binnumbers, values[vv])
|
||||
flatsum2 = np.bincount(binnumbers, values[vv] ** 2)
|
||||
result[vv, a] = np.sqrt(flatsum2[a] / flatcount[a] -
|
||||
(flatsum[a] / flatcount[a]) ** 2)
|
||||
elif statistic == 'count':
|
||||
result.fill(0)
|
||||
flatcount = np.bincount(binnumbers, None)
|
||||
a = np.arange(len(flatcount))
|
||||
result[:, a] = flatcount[np.newaxis, :]
|
||||
elif statistic == 'sum':
|
||||
result.fill(0)
|
||||
for vv in xrange(Vdim):
|
||||
flatsum = np.bincount(binnumbers, values[vv])
|
||||
a = np.arange(len(flatsum))
|
||||
result[vv, a] = flatsum
|
||||
elif statistic == 'median':
|
||||
result.fill(np.nan)
|
||||
for i in np.unique(binnumbers):
|
||||
for vv in xrange(Vdim):
|
||||
result[vv, i] = np.median(values[vv, binnumbers == i])
|
||||
elif statistic == 'min':
|
||||
result.fill(np.nan)
|
||||
for i in np.unique(binnumbers):
|
||||
for vv in xrange(Vdim):
|
||||
result[vv, i] = np.min(values[vv, binnumbers == i])
|
||||
elif statistic == 'max':
|
||||
result.fill(np.nan)
|
||||
for i in np.unique(binnumbers):
|
||||
for vv in xrange(Vdim):
|
||||
result[vv, i] = np.max(values[vv, binnumbers == i])
|
||||
elif callable(statistic):
|
||||
with np.errstate(invalid='ignore'), suppress_warnings() as sup:
|
||||
sup.filter(RuntimeWarning)
|
||||
try:
|
||||
null = statistic([])
|
||||
except Exception:
|
||||
null = np.nan
|
||||
result.fill(null)
|
||||
for i in np.unique(binnumbers):
|
||||
for vv in xrange(Vdim):
|
||||
result[vv, i] = statistic(values[vv, binnumbers == i])
|
||||
|
||||
# Shape into a proper matrix
|
||||
result = result.reshape(np.append(Vdim, nbin))
|
||||
|
||||
# Remove outliers (indices 0 and -1 for each bin-dimension).
|
||||
core = tuple([slice(None)] + Ndim * [slice(1, -1)])
|
||||
result = result[core]
|
||||
|
||||
# Unravel binnumbers into an ndarray, each row the bins for each dimension
|
||||
if(expand_binnumbers and Ndim > 1):
|
||||
binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
|
||||
|
||||
if np.any(result.shape[1:] != nbin - 2):
|
||||
raise RuntimeError('Internal Shape Error')
|
||||
|
||||
# Reshape to have output (`reulst`) match input (`values`) shape
|
||||
result = result.reshape(input_shape[:-1] + list(nbin-2))
|
||||
|
||||
return BinnedStatisticddResult(result, edges, binnumbers)
|
||||
@@ -1,27 +0,0 @@
|
||||
"""
|
||||
Statistics-related constants.
|
||||
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# Machine epsilon for Python floats: the gap between 1.0 and the next
# larger representable float (np.finfo(float).eps).
|
||||
_EPS = np.finfo(float).eps
|
||||
|
||||
# The largest [in magnitude] finite floating value (np.finfo(float).max).
|
||||
_XMAX = np.finfo(float).max
|
||||
|
||||
# The log of the largest usable floating value; useful for knowing
|
||||
# when exp(something) will overflow
|
||||
_LOGXMAX = np.log(_XMAX)
|
||||
|
||||
# The smallest positive normalized floating value (np.finfo(float).tiny).
|
||||
_XMIN = np.finfo(float).tiny
|
||||
|
||||
# Euler-Mascheroni constant, equal to -special.psi(1)
|
||||
_EULER = 0.577215664901532860606512090082402431042
|
||||
|
||||
# Apery's constant, equal to special.zeta(3, 1)
|
||||
_ZETA3 = 1.202056903159594285399738161511449990765
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,969 +0,0 @@
|
||||
#
|
||||
# Author: Travis Oliphant 2002-2011 with contributions from
|
||||
# SciPy Developers 2004-2011
|
||||
#
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from scipy import special
|
||||
from scipy.special import entr, logsumexp, betaln, gammaln as gamln
|
||||
from scipy._lib._numpy_compat import broadcast_to
|
||||
from scipy._lib._util import _lazywhere
|
||||
|
||||
from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ._distn_infrastructure import (
|
||||
rv_discrete, _ncx2_pdf, _ncx2_cdf, get_distribution_names)
|
||||
|
||||
|
||||
class binom_gen(rv_discrete):
    r"""A binomial discrete random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `binom` is:

    .. math::

       f(k) = \binom{n}{k} p^k (1-p)^{n-k}

    for ``k`` in ``{0, 1,..., n}``.

    `binom` takes ``n`` and ``p`` as shape parameters.

    %(after_notes)s

    %(example)s

    """
    def _rvs(self, n, p):
        # Delegate sampling to the instance's random state.
        rng = self._random_state
        return rng.binomial(n, p, self._size)

    def _argcheck(self, n, p):
        # The upper end of the support is the number of trials.
        self.b = n
        trials_ok = (n >= 0)
        return trials_ok & (p >= 0) & (p <= 1)

    def _logpmf(self, x, n, p):
        successes = floor(x)
        # log of the binomial coefficient choose(n, k) via log-gamma.
        log_choose = (gamln(n+1) - (gamln(successes+1) +
                                    gamln(n-successes+1)))
        return (log_choose + special.xlogy(successes, p) +
                special.xlog1py(n-successes, -p))

    def _pmf(self, x, n, p):
        # binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k)
        log_prob = self._logpmf(x, n, p)
        return exp(log_prob)

    def _cdf(self, x, n, p):
        return special.bdtr(floor(x), n, p)

    def _sf(self, x, n, p):
        return special.bdtrc(floor(x), n, p)

    def _ppf(self, q, n, p):
        # The continuous inverse may overshoot by one; check the
        # candidate just below it (never below zero) before accepting.
        upper = ceil(special.bdtrik(q, n, p))
        lower = np.maximum(upper - 1, 0)
        cdf_lower = special.bdtr(lower, n, p)
        return np.where(cdf_lower >= q, lower, upper)

    def _stats(self, n, p, moments='mv'):
        q = 1.0 - p
        mu = n * p
        var = n * p * q
        # Skewness and kurtosis are only computed on request.
        g1 = (q - p) / sqrt(var) if 's' in moments else None
        g2 = (1.0 - 6*p*q) / var if 'k' in moments else None
        return mu, var, g1, g2

    def _entropy(self, n, p):
        # Sum -p*log(p) over the whole support 0..n.
        support = np.r_[0:n + 1]
        return np.sum(entr(self._pmf(support, n, p)), axis=0)


binom = binom_gen(name='binom')
|
||||
|
||||
|
||||
class bernoulli_gen(binom_gen):
    r"""A Bernoulli discrete random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `bernoulli` is:

    .. math::

       f(k) = \begin{cases}1-p  &\text{if } k = 0\\
                           p    &\text{if } k = 1\end{cases}

    for :math:`k` in :math:`\{0, 1\}`.

    `bernoulli` takes :math:`p` as shape parameter.

    %(after_notes)s

    %(example)s

    """
    # Every method below is the binomial counterpart with n fixed to 1.

    def _rvs(self, p):
        n_trials = 1
        return binom_gen._rvs(self, n_trials, p)

    def _argcheck(self, p):
        lower_ok = (p >= 0)
        upper_ok = (p <= 1)
        return lower_ok & upper_ok

    def _logpmf(self, x, p):
        return binom._logpmf(x, 1, p)

    def _pmf(self, x, p):
        # bernoulli.pmf(k) = 1-p  if k = 0
        #                  = p    if k = 1
        return binom._pmf(x, 1, p)

    def _cdf(self, x, p):
        return binom._cdf(x, 1, p)

    def _sf(self, x, p):
        return binom._sf(x, 1, p)

    def _ppf(self, q, p):
        return binom._ppf(q, 1, p)

    def _stats(self, p):
        return binom._stats(1, p)

    def _entropy(self, p):
        # Two outcomes only: -p*log(p) - (1-p)*log(1-p).
        success_term = entr(p)
        failure_term = entr(1-p)
        return success_term + failure_term


bernoulli = bernoulli_gen(b=1, name='bernoulli')
|
||||
|
||||
|
||||
class nbinom_gen(rv_discrete):
    r"""A negative binomial discrete random variable.

    %(before_notes)s

    Notes
    -----
    Negative binomial distribution describes a sequence of i.i.d. Bernoulli
    trials, repeated until a predefined, non-random number of successes occurs.

    The probability mass function of the number of failures for `nbinom` is:

    .. math::

       f(k) = \binom{k+n-1}{n-1} p^n (1-p)^k

    for :math:`k \ge 0`.

    `nbinom` takes :math:`n` and :math:`p` as shape parameters where n is the
    number of successes, whereas p is the probability of a single success.

    %(after_notes)s

    %(example)s

    """
    def _rvs(self, n, p):
        rng = self._random_state
        return rng.negative_binomial(n, p, self._size)

    def _argcheck(self, n, p):
        successes_ok = (n > 0)
        return successes_ok & (p >= 0) & (p <= 1)

    def _pmf(self, x, n, p):
        # nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k
        log_prob = self._logpmf(x, n, p)
        return exp(log_prob)

    def _logpmf(self, x, n, p):
        # log of choose(x+n-1, n-1), written with log-gamma so that
        # non-integer n is supported.
        log_choose = gamln(n+x) - gamln(x+1) - gamln(n)
        return log_choose + n*log(p) + special.xlog1py(x, -p)

    def _cdf(self, x, n, p):
        failures = floor(x)
        return special.betainc(n, failures+1, p)

    def _sf_skip(self, x, n, p):
        # skip because special.nbdtrc doesn't work for 0<n<1
        failures = floor(x)
        return special.nbdtrc(failures, n, p)

    def _ppf(self, q, n, p):
        # Check the integer just below the continuous inverse (clipped at
        # zero) and keep it when its cdf already reaches q.
        upper = ceil(special.nbdtrik(q, n, p))
        lower = (upper-1).clip(0.0, np.inf)
        cdf_lower = self._cdf(lower, n, p)
        return np.where(cdf_lower >= q, lower, upper)

    def _stats(self, n, p):
        Q = 1.0 / p
        P = Q - 1.0
        mu = n*P
        var = n*P*Q
        g1 = (Q+P)/sqrt(var)
        g2 = (1.0 + 6*P*Q) / var
        return mu, var, g1, g2


nbinom = nbinom_gen(name='nbinom')
|
||||
|
||||
|
||||
class geom_gen(rv_discrete):
    r"""A geometric discrete random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `geom` is:

    .. math::

        f(k) = (1-p)^{k-1} p

    for :math:`k \ge 1`.

    `geom` takes :math:`p` as shape parameter.

    %(after_notes)s

    %(example)s

    """
    def _rvs(self, p):
        rng = self._random_state
        return rng.geometric(p, size=self._size)

    def _argcheck(self, p):
        upper_ok = (p <= 1)
        lower_ok = (p >= 0)
        return upper_ok & lower_ok

    def _pmf(self, k, p):
        # geom.pmf(k) = (1-p)**(k-1)*p
        failures_prob = np.power(1-p, k-1)
        return failures_prob * p

    def _logpmf(self, k, p):
        return special.xlog1py(k - 1, -p) + log(p)

    def _cdf(self, x, p):
        # 1 - (1-p)**k, evaluated through log1p/expm1.
        trials = floor(x)
        return -expm1(log1p(-p)*trials)

    def _sf(self, x, p):
        log_tail = self._logsf(x, p)
        return np.exp(log_tail)

    def _logsf(self, x, p):
        trials = floor(x)
        return trials*log1p(-p)

    def _ppf(self, q, p):
        # Step back by one when the previous integer already reaches q,
        # but never step below the candidate when it is not positive.
        upper = ceil(log1p(-q) / log1p(-p))
        cdf_lower = self._cdf(upper-1, p)
        step_back = (cdf_lower >= q) & (upper > 0)
        return np.where(step_back, upper-1, upper)

    def _stats(self, p):
        mu = 1.0/p
        qr = 1.0-p
        var = qr / p / p
        g1 = (2.0-p) / sqrt(qr)
        g2 = np.polyval([1, -6, 6], p)/qr
        return mu, var, g1, g2


geom = geom_gen(a=1, name='geom', longname="A geometric")
|
||||
|
||||
|
||||
class hypergeom_gen(rv_discrete):
    r"""A hypergeometric discrete random variable.

    The hypergeometric distribution models drawing objects from a bin.
    `M` is the total number of objects, `n` is total number of Type I objects.
    The random variate represents the number of Type I objects in `N` drawn
    without replacement from the total population.

    %(before_notes)s

    Notes
    -----
    The symbols used to denote the shape parameters (`M`, `n`, and `N`) are not
    universally accepted.  See the Examples for a clarification of the
    definitions used here.

    The probability mass function is defined as,

    .. math:: p(k, M, n, N) = \frac{\binom{n}{k} \binom{M - n}{N - k}}
                                   {\binom{M}{N}}

    for :math:`k \in [\max(0, N - M + n), \min(n, N)]`, where the binomial
    coefficients are defined as,

    .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.

    %(after_notes)s

    Examples
    --------
    >>> from scipy.stats import hypergeom
    >>> import matplotlib.pyplot as plt

    Suppose we have a collection of 20 animals, of which 7 are dogs.  Then if
    we want to know the probability of finding a given number of dogs if we
    choose at random 12 of the 20 animals, we can initialize a frozen
    distribution and plot the probability mass function:

    >>> [M, n, N] = [20, 7, 12]
    >>> rv = hypergeom(M, n, N)
    >>> x = np.arange(0, n+1)
    >>> pmf_dogs = rv.pmf(x)

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, pmf_dogs, 'bo')
    >>> ax.vlines(x, 0, pmf_dogs, lw=2)
    >>> ax.set_xlabel('# of dogs in our group of chosen animals')
    >>> ax.set_ylabel('hypergeom PMF')
    >>> plt.show()

    Instead of using a frozen distribution we can also use `hypergeom`
    methods directly.  To for example obtain the cumulative distribution
    function, use:

    >>> prb = hypergeom.cdf(x, M, n, N)

    And to generate random numbers:

    >>> R = hypergeom.rvs(M, n, N, size=10)

    """
    def _rvs(self, M, n, N):
        # numpy's sampler takes (ngood, nbad, nsample).
        return self._random_state.hypergeometric(n, M-n, N, size=self._size)

    def _argcheck(self, M, n, N):
        cond = (M > 0) & (n >= 0) & (N >= 0)
        cond &= (n <= M) & (N <= M)
        # The support depends on the shape parameters.
        self.a = np.maximum(N-(M-n), 0)
        self.b = np.minimum(n, N)
        return cond

    def _logpmf(self, k, M, n, N):
        # Log of comb(good, k) * comb(bad, N-k) / comb(tot, N), with each
        # binomial coefficient expressed through log-beta functions.
        tot, good = M, n
        bad = tot - good
        result = (betaln(good+1, 1) + betaln(bad+1, 1) + betaln(tot-N+1, N+1) -
                  betaln(k+1, good-k+1) - betaln(N-k+1, bad-N+k+1) -
                  betaln(tot+1, 1))
        return result

    def _pmf(self, k, M, n, N):
        # same as the following but numerically more precise
        # return comb(good, k) * comb(bad, N-k) / comb(tot, N)
        return exp(self._logpmf(k, M, n, N))

    def _stats(self, M, n, N):
        # tot, good, sample_size = M, n, N
        # "wikipedia".replace('N', 'M').replace('n', 'N').replace('K', 'n')
        M, n, N = 1.*M, 1.*n, 1.*N
        m = M - n
        p = n/M
        mu = N*p

        var = m*n*N*(M - N)*1.0/(M*M*(M-1))
        g1 = (m - n)*(M-2*N) / (M-2.0) * sqrt((M-1.0) / (m*n*N*(M-N)))

        g2 = M*(M+1) - 6.*N*(M-N) - 6.*n*m
        g2 *= (M-1)*M*M
        g2 += 6.*n*N*(M-N)*m*(5.*M-6)
        g2 /= n * N * (M-N) * m * (M-2.) * (M-3.)
        return mu, var, g1, g2

    def _entropy(self, M, n, N):
        # The public `pmf` is used on purpose: it returns 0 outside the
        # support, so a negative lower index contributes nothing.
        k = np.r_[N - (M - n):min(n, N) + 1]
        vals = self.pmf(k, M, n, N)
        return np.sum(entr(vals), axis=0)

    def _sf(self, k, M, n, N):
        """More precise calculation, 1 - cdf doesn't cut it.

        Returns ``P(X > k)`` with the broadcast shape of the arguments.
        """
        # `k` can be an array, in which case sf() passes M, n and N as
        # arrays of the same shape.  Broadcasting first also makes direct
        # calls with scalars work.  Each element is then integrated
        # separately.
        k, M, n, N = np.broadcast_arrays(k, M, n, N)
        res = []
        for quant, tot, good, draw in zip(k.ravel(), M.ravel(),
                                          n.ravel(), N.ravel()):
            # Manual integration over probability mass function. More accurate
            # than integrate.quad.  The quantile is floored so that the sum
            # always runs over integer points of the support.
            k2 = np.arange(floor(quant) + 1, draw + 1)
            res.append(np.sum(self._pmf(k2, tot, good, draw)))
        return np.asarray(res, dtype=float).reshape(k.shape)

    def _logsf(self, k, M, n, N):
        """
        More precise calculation than log(sf)

        Returns ``log(P(X > k))`` with the broadcast shape of the arguments.
        """
        k, M, n, N = np.broadcast_arrays(k, M, n, N)
        res = []
        for quant, tot, good, draw in zip(k.ravel(), M.ravel(),
                                          n.ravel(), N.ravel()):
            # Integration over probability mass function using logsumexp
            k2 = np.arange(floor(quant) + 1, draw + 1)
            if k2.size == 0:
                # Nothing lies above the quantile: the tail mass is zero.
                res.append(-np.inf)
            else:
                res.append(logsumexp(self._logpmf(k2, tot, good, draw)))
        return np.asarray(res, dtype=float).reshape(k.shape)


hypergeom = hypergeom_gen(name='hypergeom')
|
||||
|
||||
|
||||
# FIXME: Fails _cdfvec
|
||||
class logser_gen(rv_discrete):
    r"""A Logarithmic (Log-Series, Series) discrete random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `logser` is:

    .. math::

        f(k) = - \frac{p^k}{k \log(1-p)}

    for :math:`k \ge 1`.

    `logser` takes :math:`p` as shape parameter.

    %(after_notes)s

    %(example)s

    """
    def _rvs(self, p):
        # looks wrong for p>0.5, too few k=1
        # trying to use generic is worse, no k=1 at all
        rng = self._random_state
        return rng.logseries(p, size=self._size)

    def _argcheck(self, p):
        # p must lie strictly inside (0, 1).
        positive = (p > 0)
        below_one = (p < 1)
        return positive & below_one

    def _pmf(self, k, p):
        # logser.pmf(k) = - p**k / (k*log(1-p))
        numerator = -np.power(p, k) * 1.0
        return numerator / k / special.log1p(-p)

    def _stats(self, p):
        r = special.log1p(-p)

        # Raw (non-central) moments of order 1..4.
        mu = p / (p - 1.0) / r
        mu2p = -p / r / (p - 1.0)**2
        mu3p = -p / r * (1.0+p) / (1.0 - p)**3
        mu4p = -p / r * (
            1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4)

        # Central moments and the standardized shape statistics.
        var = mu2p - mu*mu
        mu3 = mu3p - 3*mu*mu2p + 2*mu**3
        mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4

        g1 = mu3 / np.power(var, 1.5)
        g2 = mu4 / var**2 - 3.0
        return mu, var, g1, g2


logser = logser_gen(a=1, name='logser', longname='A logarithmic')
|
||||
|
||||
|
||||
class poisson_gen(rv_discrete):
    r"""A Poisson discrete random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `poisson` is:

    .. math::

        f(k) = \exp(-\mu) \frac{\mu^k}{k!}

    for :math:`k \ge 0`.

    `poisson` takes :math:`\mu` as shape parameter.

    %(after_notes)s

    %(example)s

    """

    # Override rv_discrete._argcheck to allow mu=0.
    def _argcheck(self, mu):
        return mu >= 0

    def _rvs(self, mu):
        rng = self._random_state
        return rng.poisson(mu, self._size)

    def _logpmf(self, k, mu):
        # k*log(mu) - log(k!) - mu; xlogy returns 0 when k == 0.
        return special.xlogy(k, mu) - gamln(k + 1) - mu

    def _pmf(self, k, mu):
        # poisson.pmf(k) = exp(-mu) * mu**k / k!
        log_prob = self._logpmf(k, mu)
        return exp(log_prob)

    def _cdf(self, x, mu):
        return special.pdtr(floor(x), mu)

    def _sf(self, x, mu):
        return special.pdtrc(floor(x), mu)

    def _ppf(self, q, mu):
        # Check the integer just below the continuous inverse (never
        # below zero) and keep it when its cdf already reaches q.
        upper = ceil(special.pdtrik(q, mu))
        lower = np.maximum(upper - 1, 0)
        cdf_lower = special.pdtr(lower, mu)
        return np.where(cdf_lower >= q, lower, upper)

    def _stats(self, mu):
        var = mu
        mu_arr = np.asarray(mu)
        positive = mu_arr > 0
        # Skewness and excess kurtosis are filled with inf where mu == 0.
        g1 = _lazywhere(positive, (mu_arr,), lambda x: sqrt(1.0/x), np.inf)
        g2 = _lazywhere(positive, (mu_arr,), lambda x: 1.0/x, np.inf)
        return mu, var, g1, g2


poisson = poisson_gen(name="poisson", longname='A Poisson')
|
||||
|
||||
|
||||
class planck_gen(rv_discrete):
    r"""A Planck discrete exponential random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `planck` is:

    .. math::

        f(k) = (1-\exp(-\lambda)) \exp(-\lambda k)

    for :math:`k \ge 0` and :math:`\lambda > 0`.

    `planck` takes :math:`\lambda` as shape parameter.

    %(after_notes)s

    %(example)s

    """
    # NOTE: every ``1 - exp(-x)`` / ``exp(x) - 1`` term below is evaluated
    # with ``expm1``.  The naive forms lose all significant digits when
    # ``lambda_`` is small (the cdf collapses to 0 and the mean to inf).

    def _argcheck(self, lambda_):
        return lambda_ > 0

    def _pmf(self, k, lambda_):
        # planck.pmf(k) = (1-exp(-lambda_)) * exp(-lambda_*k)
        return -expm1(-lambda_)*exp(-lambda_*k)

    def _cdf(self, x, lambda_):
        # 1 - exp(-lambda_*(k+1))
        k = floor(x)
        return -expm1(-lambda_*(k+1))

    def _sf(self, x, lambda_):
        return np.exp(self._logsf(x, lambda_))

    def _logsf(self, x, lambda_):
        k = floor(x)
        return -lambda_*(k+1)

    def _ppf(self, q, lambda_):
        # Check the integer just below the continuous inverse (clipped to
        # the lower support bound) and keep it when its cdf reaches q.
        vals = ceil(-1.0/lambda_ * log1p(-q)-1)
        vals1 = (vals-1).clip(self.a, np.inf)
        temp = self._cdf(vals1, lambda_)
        return np.where(temp >= q, vals1, vals)

    def _stats(self, lambda_):
        mu = 1/expm1(lambda_)
        var = exp(-lambda_)/(expm1(-lambda_))**2
        g1 = 2*cosh(lambda_/2.0)
        g2 = 4+2*cosh(lambda_)
        return mu, var, g1, g2

    def _entropy(self, lambda_):
        # C is the normalizing factor 1 - exp(-lambda_).
        C = -expm1(-lambda_)
        return lambda_*exp(-lambda_)/C - log(C)


planck = planck_gen(a=0, name='planck', longname='A discrete exponential ')
|
||||
|
||||
|
||||
class boltzmann_gen(rv_discrete):
    r"""A Boltzmann (Truncated Discrete Exponential) random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `boltzmann` is:

    .. math::

        f(k) = (1-\exp(-\lambda)) \exp(-\lambda k) / (1-\exp(-\lambda N))

    for :math:`k = 0,..., N-1`.

    `boltzmann` takes :math:`\lambda > 0` and :math:`N > 0` as shape parameters.

    %(after_notes)s

    %(example)s

    """
    def _argcheck(self, lambda_, N):
        # The support is 0..N-1, so it depends on the shape parameter N.
        self.a = 0
        self.b = N - 1
        rate_ok = (lambda_ > 0)
        return rate_ok & (N > 0)

    def _pmf(self, k, lambda_, N):
        # boltzmann.pmf(k) =
        #               (1-exp(-lambda_)*exp(-lambda_*k)/(1-exp(-lambda_*N))
        normalizer = (1-exp(-lambda_))/(1-exp(-lambda_*N))
        return normalizer*exp(-lambda_*k)

    def _cdf(self, x, lambda_, N):
        k = floor(x)
        numerator = (1-exp(-lambda_*(k+1)))
        return numerator/(1-exp(-lambda_*N))

    def _ppf(self, q, lambda_, N):
        # Undo the truncation normalization, invert the untruncated cdf,
        # then check the integer just below (clipped at zero).
        qnew = q*(1-exp(-lambda_*N))
        upper = ceil(-1.0/lambda_ * log(1-qnew)-1)
        lower = (upper-1).clip(0.0, np.inf)
        cdf_lower = self._cdf(lower, lambda_, N)
        return np.where(cdf_lower >= q, lower, upper)

    def _stats(self, lambda_, N):
        z = exp(-lambda_)
        zN = exp(-lambda_*N)
        mu = z/(1.0-z)-N*zN/(1-zN)
        var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2
        ratio = (1-zN)/(1-z)
        scale = (z*ratio**2 - N*N*zN)
        g1 = (z*(1+z)*ratio**3 - N**3*zN*(1+zN)) / scale**(1.5)
        g2 = (z*(1+4*z+z*z)*ratio**4 - N**4 * zN*(1+4*zN+zN*zN)) / scale / scale
        return mu, var, g1, g2


boltzmann = boltzmann_gen(name='boltzmann',
                          longname='A truncated discrete exponential ')
|
||||
|
||||
|
||||
class randint_gen(rv_discrete):
    r"""A uniform discrete random variable.

    %(before_notes)s

    Notes
    -----
    Every integer from ``low`` up to, but not including, ``high`` is
    equally likely.  The probability mass function for `randint` is:

    .. math::

        f(k) = \frac{1}{high - low}

    for ``k = low, ..., high - 1``.

    `randint` takes ``low`` and ``high`` as shape parameters.

    %(after_notes)s

    %(example)s

    """
    def _argcheck(self, low, high):
        # Record the end points of the support (low, ..., high - 1) on the
        # instance, then report whether the range is non-empty.
        self.a, self.b = low, high - 1
        return (low < high)

    def _pmf(self, k, low, high):
        # randint.pmf(k) = 1./(high - low) inside the support, 0 elsewhere
        in_support = (low <= k) & (k < high)
        flat_mass = np.ones_like(k) / (high - low)
        return np.where(in_support, flat_mass, 0.)

    def _cdf(self, x, low, high):
        # Count the support points up to floor(x) and divide by their total.
        return (floor(x) - low + 1.) / (high - low)

    def _ppf(self, q, low, high):
        upper = ceil(q * (high - low) + low) - 1
        # Also try the next lower integer, clipped to [low, high], and use it
        # whenever its CDF already reaches q.
        lower = (upper - 1).clip(low, high)
        lower_reaches_q = self._cdf(lower, low, high) >= q
        return np.where(lower_reaches_q, lower, upper)

    def _stats(self, low, high):
        lo, hi = np.asarray(low), np.asarray(high)
        width = hi - lo
        width_sq = width * width
        mean = (hi + lo - 1.0) / 2
        variance = (width_sq - 1) / 12.0
        kurt = -6.0/5.0 * (width_sq + 1.0) / (width_sq - 1.0)
        return mean, variance, 0.0, kurt

    def _rvs(self, low, high):
        """An array of *size* random integers >= ``low`` and < ``high``."""
        shape = self._size
        if shape is not None:
            # RandomState.randint() does not broadcast its arguments, so
            # stretch both bounds to the requested shape here.  The
            # vectorized sampler below then needs no `size` argument.
            low, high = (broadcast_to(bound, shape) for bound in (low, high))
        draw = np.vectorize(self._random_state.randint, otypes=[np.int_])
        return draw(low, high)

    def _entropy(self, low, high):
        # Entropy of a uniform law over (high - low) points.
        return log(high - low)
|
||||
|
||||
|
||||
# Module-level instance of randint_gen, bound to the name `randint`.
randint = randint_gen(name='randint', longname='A discrete uniform '
                      '(random integer)')
|
||||
|
||||
|
||||
# FIXME: problems sampling.
|
||||
class zipf_gen(rv_discrete):
    r"""A Zipf discrete random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `zipf` is:

    .. math::

        f(k, a) = \frac{1}{\zeta(a) k^a}

    for :math:`k \ge 1`.

    `zipf` takes :math:`a` as shape parameter. :math:`\zeta` is the
    Riemann zeta function (`scipy.special.zeta`)

    %(after_notes)s

    %(example)s

    """
    def _rvs(self, a):
        # Sampling is delegated to the instance's random state.
        sampler = self._random_state.zipf
        return sampler(a, size=self._size)

    def _argcheck(self, a):
        # Only exponents strictly above 1 are accepted.
        return a > 1

    def _pmf(self, k, a):
        # zipf.pmf(k, a) = 1/(zeta(a) * k**a)
        normaliser = special.zeta(a, 1)
        return 1.0 / normaliser / k**a

    def _munp(self, n, a):
        # The n-th raw moment is a ratio of zeta values where a > n + 1;
        # everywhere else np.inf is returned.
        def zeta_ratio(a, n):
            return special.zeta(a - n, 1) / special.zeta(a, 1)

        return _lazywhere(a > n + 1, (a, n), zeta_ratio, np.inf)
|
||||
|
||||
|
||||
# Module-level instance of zipf_gen, bound to the name `zipf`.
# NOTE(review): `a=1` is presumably the lower end of the support (the pmf
# above is stated for k >= 1) -- confirm against rv_discrete.
zipf = zipf_gen(a=1, name='zipf', longname='A Zipf')
|
||||
|
||||
|
||||
class dlaplace_gen(rv_discrete):
    r"""A Laplacian discrete random variable.

    %(before_notes)s

    Notes
    -----
    The probability mass function for `dlaplace` is:

    .. math::

        f(k) = \tanh(a/2) \exp(-a |k|)

    for integers :math:`k` and :math:`a > 0`.

    `dlaplace` takes :math:`a` as shape parameter.

    %(after_notes)s

    %(example)s

    """
    def _pmf(self, k, a):
        # dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k))
        scale = tanh(a/2.0)
        return scale * exp(-a * abs(k))

    def _cdf(self, x, a):
        # Separate closed forms are used for k >= 0 and for k < 0.
        def right_branch(k, a):
            return 1.0 - exp(-a * k) / (exp(a) + 1)

        def left_branch(k, a):
            return exp(a * (k+1)) / (exp(a) + 1)

        k = floor(x)
        return _lazywhere(k >= 0, (k, a), f=right_branch, f2=left_branch)

    def _ppf(self, q, a):
        const = 1 + exp(a)
        # Pick which branch of the CDF to invert from the size of q.
        use_left = q < 1.0 / (1 + exp(-a))
        upper = ceil(np.where(use_left,
                              log(q*const) / a - 1,
                              -log((1-q) * const) / a))
        # Use the next lower integer whenever its CDF already reaches q.
        lower = upper - 1
        return np.where(self._cdf(lower, a) >= q, lower, upper)

    def _stats(self, a):
        ea = exp(a)
        second = 2.*ea/(ea-1.)**2
        fourth = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4
        # The first and third returned statistics are fixed at zero.
        return 0., second, 0., fourth/second**2 - 3.

    def _entropy(self, a):
        return a / sinh(a) - log(tanh(a/2.0))
|
||||
|
||||
|
||||
# Module-level instance of dlaplace_gen, bound to the name `dlaplace`.
# NOTE(review): `a=-np.inf` is presumably the lower end of the support (the
# pmf above is stated for all integers k) -- confirm against rv_discrete.
dlaplace = dlaplace_gen(a=-np.inf,
                        name='dlaplace', longname='A discrete Laplacian')
|
||||
|
||||
|
||||
class skellam_gen(rv_discrete):
    r"""A Skellam discrete random variable.

    %(before_notes)s

    Notes
    -----
    Probability distribution of the difference of two correlated or
    uncorrelated Poisson random variables.

    Let :math:`k_1` and :math:`k_2` be two Poisson-distributed r.v. with
    expected values :math:`\lambda_1` and :math:`\lambda_2`. Then,
    :math:`k_1 - k_2` follows a Skellam distribution with parameters
    :math:`\mu_1 = \lambda_1 - \rho \sqrt{\lambda_1 \lambda_2}` and
    :math:`\mu_2 = \lambda_2 - \rho \sqrt{\lambda_1 \lambda_2}`, where
    :math:`\rho` is the correlation coefficient between :math:`k_1` and
    :math:`k_2`. If the two Poisson-distributed r.v. are independent then
    :math:`\rho = 0`.

    Parameters :math:`\mu_1` and :math:`\mu_2` must be strictly positive.

    For details see: https://en.wikipedia.org/wiki/Skellam_distribution

    `skellam` takes :math:`\mu_1` and :math:`\mu_2` as shape parameters.

    %(after_notes)s

    %(example)s

    """
    def _rvs(self, mu1, mu2):
        # Difference of two Poisson draws; the mu1 sample is drawn first.
        rng = self._random_state
        count = self._size
        first = rng.poisson(mu1, count)
        second = rng.poisson(mu2, count)
        return first - second

    def _pmf(self, x, mu1, mu2):
        # Both branches are evaluated for every x and then selected by sign.
        for_negative = _ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2
        for_nonnegative = _ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2
        # ncx2.pdf() returns nan's for extremely low probabilities
        return np.where(x < 0, for_negative, for_nonnegative)

    def _cdf(self, x, mu1, mu2):
        k = floor(x)
        for_negative = _ncx2_cdf(2*mu2, -2*k, 2*mu1)
        for_nonnegative = 1 - _ncx2_cdf(2*mu1, 2*(k+1), 2*mu2)
        return np.where(k < 0, for_negative, for_nonnegative)

    def _stats(self, mu1, mu2):
        centre = mu1 - mu2
        spread = mu1 + mu2
        return centre, spread, centre / sqrt((spread)**3), 1 / spread
|
||||
|
||||
|
||||
# Module-level instance of skellam_gen, bound to the name `skellam`.
# NOTE(review): `a=-np.inf` is presumably the lower end of the support (a
# difference of two Poisson counts can be any integer) -- confirm.
skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam')
|
||||
|
||||
|
||||
class yulesimon_gen(rv_discrete):
    r"""A Yule-Simon discrete random variable.

    %(before_notes)s

    Notes
    -----

    The probability mass function for the `yulesimon` is:

    .. math::

        f(k) =  \alpha B(k, \alpha+1)

    for :math:`k=1,2,3,...`, where :math:`\alpha>0`.
    Here :math:`B` refers to the `scipy.special.beta` function.

    The sampling of random variates is based on pg 553, Section 6.3 of [1]_.
    Our notation maps to the referenced logic via :math:`\alpha=a-1`.

    For details see the wikipedia entry [2]_.

    References
    ----------
    .. [1] Devroye, Luc. "Non-uniform Random Variate Generation",
         (1986) Springer, New York.

    .. [2] https://en.wikipedia.org/wiki/Yule-Simon_distribution

    %(after_notes)s

    %(example)s

    """
    def _rvs(self, alpha):
        # Two independent standard exponentials are combined as in [1].
        E1 = self._random_state.standard_exponential(self._size)
        E2 = self._random_state.standard_exponential(self._size)
        ans = ceil(-E1 / log1p(-exp(-E2 / alpha)))
        return ans

    def _pmf(self, x, alpha):
        return alpha * special.beta(x, alpha + 1)

    def _argcheck(self, alpha):
        return (alpha > 0)

    def _logpmf(self, x, alpha):
        return log(alpha) + special.betaln(x, alpha + 1)

    def _cdf(self, x, alpha):
        return 1 - x * special.beta(x, alpha + 1)

    def _sf(self, x, alpha):
        return x * special.beta(x, alpha + 1)

    def _logsf(self, x, alpha):
        return log(x) + special.betaln(x, alpha + 1)

    def _stats(self, alpha):
        """Return mean, variance, skewness and excess kurtosis.

        A statistic is ``inf`` when its own moment diverges and ``nan``
        when a lower moment it depends on already diverges.
        """
        mu = np.where(alpha <= 1, np.inf, alpha / (alpha - 1))
        mu2 = np.where(alpha > 2,
                       alpha**2 / ((alpha - 2.0) * (alpha - 1)**2),
                       np.inf)
        mu2 = np.where(alpha <= 1, np.nan, mu2)
        g1 = np.where(alpha > 3,
                      sqrt(alpha - 2) * (alpha + 1)**2 / (alpha * (alpha - 3)),
                      np.inf)
        g1 = np.where(alpha <= 2, np.nan, g1)
        # The cubic term carries a factor 11.  Without it the result can
        # fall below -2 (e.g. -6.2 at alpha=5), which no distribution's
        # excess kurtosis can do; the exact value at alpha=5 is 118.8.
        g2 = np.where(alpha > 4,
                      (alpha + 3) + (11 * alpha**3 - 49 * alpha - 22) / (alpha *
                      (alpha - 4) * (alpha - 3)), np.inf)
        g2 = np.where(alpha <= 2, np.nan, g2)
        return mu, mu2, g1, g2
|
||||
|
||||
|
||||
# Module-level instance of yulesimon_gen, bound to the name `yulesimon`.
# NOTE(review): `a=1` is presumably the lower end of the support (the pmf
# above is stated for k = 1, 2, 3, ...) -- confirm against rv_discrete.
yulesimon = yulesimon_gen(name='yulesimon', a=1)
|
||||
|
||||
|
||||
# Collect names of classes and objects in this module.
# The namespace is copied into a list first, so the snapshot is not affected
# by the names assigned below.
pairs = list(globals().items())
# NOTE(review): get_distribution_names is defined elsewhere; presumably it
# returns the names of the rv_discrete instances and of their generator
# classes found in `pairs` -- confirm.
_distn_names, _distn_gen_names = get_distribution_names(pairs, rv_discrete)

# The module exports exactly the two collected name lists.
__all__ = _distn_names + _distn_gen_names
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,130 +0,0 @@
|
||||
"""
|
||||
Sane parameters for stats.distributions.
|
||||
"""
|
||||
|
||||
# Each entry is [distribution name, tuple of shape-parameter values].
# An empty tuple means that no shape parameters are supplied.
# A name may appear several times with different parameter sets
# (e.g. gengamma, kappa4, truncnorm).
distcont = [
    ['alpha', (3.5704770516650459,)],
    ['anglit', ()],
    ['arcsine', ()],
    ['argus', (1.0,)],
    ['beta', (2.3098496451481823, 0.62687954300963677)],
    ['betaprime', (5, 6)],
    ['bradford', (0.29891359763170633,)],
    ['burr', (10.5, 4.3)],
    ['burr12', (10, 4)],
    ['cauchy', ()],
    ['chi', (78,)],
    ['chi2', (55,)],
    ['cosine', ()],
    ['crystalball', (2.0, 3.0)],
    ['dgamma', (1.1023326088288166,)],
    ['dweibull', (2.0685080649914673,)],
    ['erlang', (10,)],
    ['expon', ()],
    ['exponnorm', (1.5,)],
    ['exponpow', (2.697119160358469,)],
    ['exponweib', (2.8923945291034436, 1.9505288745913174)],
    ['f', (29, 18)],
    ['fatiguelife', (29,)], # correction numargs = 1
    ['fisk', (3.0857548622253179,)],
    ['foldcauchy', (4.7164673455831894,)],
    ['foldnorm', (1.9521253373555869,)],
    ['frechet_l', (3.6279911255583239,)],
    ['frechet_r', (1.8928171603534227,)],
    ['gamma', (1.9932305483800778,)],
    ['gausshyper', (13.763771604130699, 3.1189636648681431,
                    2.5145980350183019, 5.1811649903971615)], # veryslow
    ['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
    ['genextreme', (-0.1,)],
    ['gengamma', (4.4162385429431925, 3.1193091679242761)],
    ['gengamma', (4.4162385429431925, -3.1193091679242761)],
    ['genhalflogistic', (0.77274727809929322,)],
    ['genlogistic', (0.41192440799679475,)],
    ['gennorm', (1.2988442399460265,)],
    ['halfgennorm', (0.6748054997000371,)],
    ['genpareto', (0.1,)], # use case with finite moments
    ['gilbrat', ()],
    ['gompertz', (0.94743713075105251,)],
    ['gumbel_l', ()],
    ['gumbel_r', ()],
    ['halfcauchy', ()],
    ['halflogistic', ()],
    ['halfnorm', ()],
    ['hypsecant', ()],
    ['invgamma', (4.0668996136993067,)],
    ['invgauss', (0.14546264555347513,)],
    ['invweibull', (10.58,)],
    ['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
    ['johnsonsu', (2.554395574161155, 2.2482281679651965)],
    ['kappa4', (0.0, 0.0)],
    ['kappa4', (-0.1, 0.1)],
    ['kappa4', (0.0, 0.1)],
    ['kappa4', (0.1, 0.0)],
    ['kappa3', (1.0,)],
    ['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956
    ['kstwobign', ()],
    ['laplace', ()],
    ['levy', ()],
    ['levy_l', ()],
    ['levy_stable', (1.8, -0.5)],
    ['loggamma', (0.41411931826052117,)],
    ['logistic', ()],
    ['loglaplace', (3.2505926592051435,)],
    ['lognorm', (0.95368226960575331,)],
    ['lomax', (1.8771398388773268,)],
    ['maxwell', ()],
    ['mielke', (10.4, 3.6)],
    ['moyal', ()],
    ['nakagami', (4.9673794866666237,)],
    ['ncf', (27, 27, 0.41578441799226107)],
    ['nct', (14, 0.24045031331198066)],
    ['ncx2', (21, 1.0560465975116415)],
    ['norm', ()],
    ['norminvgauss', (1., 0.5)],
    ['pareto', (2.621716532144454,)],
    ['pearson3', (0.1,)],
    ['powerlaw', (1.6591133289905851,)],
    ['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
    ['powernorm', (4.4453652254590779,)],
    ['rayleigh', ()],
    ['rdist', (0.9,)], # feels also slow
    ['recipinvgauss', (0.63004267809369119,)],
    ['reciprocal', (0.0062309367010521255, 1.0062309367010522)],
    ['rice', (0.7749725210111873,)],
    ['semicircular', ()],
    ['skewnorm', (4.0,)],
    ['t', (2.7433514990818093,)],
    ['trapz', (0.2, 0.8)],
    ['triang', (0.15785029824528218,)],
    ['truncexpon', (4.6907725456810478,)],
    ['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
    ['truncnorm', (0.1, 2.)],
    ['tukeylambda', (3.1321477856738267,)],
    ['uniform', ()],
    ['vonmises', (3.9939042581071398,)],
    ['vonmises_line', (3.9939042581071398,)],
    ['wald', ()],
    ['weibull_max', (2.8687961709100187,)],
    ['weibull_min', (1.7866166930421596,)],
    ['wrapcauchy', (0.031071279018614728,)]]
|
||||
|
||||
|
||||
# Same layout as the continuous table: [distribution name, tuple of
# shape-parameter values].  Some names appear more than once with
# different parameter sets (hypergeom, nbinom).
distdiscrete = [
    ['bernoulli',(0.3,)],
    ['binom', (5, 0.4)],
    ['boltzmann',(1.4, 19)],
    ['dlaplace', (0.8,)], # 0.5
    ['geom', (0.5,)],
    ['hypergeom',(30, 12, 6)],
    ['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921
    ['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921
    ['logser', (0.6,)], # re-enabled, numpy ticket:921
    ['nbinom', (5, 0.5)],
    ['nbinom', (0.4, 0.4)], # from tickets: 583
    ['planck', (0.51,)], # 4.1
    ['poisson', (0.6,)],
    ['randint', (7, 31)],
    ['skellam', (15, 8)],
    ['zipf', (6.5,)],
    ['yulesimon',(11.0,)]
]
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,169 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
from scipy._lib._util import check_random_state
|
||||
|
||||
|
||||
def rvs_ratio_uniforms(pdf, umax, vmin, vmax, size=1, c=0, random_state=None):
    """
    Draw random samples from a probability density function with the
    ratio-of-uniforms method.

    Parameters
    ----------
    pdf : callable
        A function with signature `pdf(x)` that is the probability
        density function of the distribution.
    umax : float
        The upper bound of the bounding rectangle in the u-direction.
    vmin : float
        The lower bound of the bounding rectangle in the v-direction.
    vmax : float
        The upper bound of the bounding rectangle in the v-direction.
    size : int or tuple of ints, optional
        Defining number of random variates (default is 1).
    c : float, optional.
        Shift parameter of ratio-of-uniforms method, see Notes. Default is 0.
    random_state : int or np.random.RandomState instance, optional
        If already a RandomState instance, use it.
        If seed is an int, return a new RandomState instance seeded with seed.
        If None, use np.random.RandomState. Default is None.

    Returns
    -------
    rvs : ndarray
        The random variates distributed according to the probability
        distribution defined by the pdf.

    Raises
    ------
    ValueError
        If ``vmin >= vmax`` or ``umax <= 0``.
    RuntimeError
        If not a single variate has been accepted after at least 50000
        candidate draws.

    Notes
    -----
    Given a univariate probability density function `pdf` and a constant `c`,
    define the set ``A = {(u, v) : 0 < u <= sqrt(pdf(v/u + c))}``.
    If `(U, V)` is a random vector uniformly distributed over `A`,
    then `V/U + c` follows a distribution according to `pdf`.

    The above result (see [1]_, [2]_) can be used to sample random variables
    using only the pdf, i.e. no inversion of the cdf is required. Typical
    choices of `c` are zero or the mode of `pdf`. The set `A` is a subset of
    the rectangle ``R = [0, umax] x [vmin, vmax]`` where

    - ``umax = sup sqrt(pdf(x))``
    - ``vmin = inf (x - c) sqrt(pdf(x))``
    - ``vmax = sup (x - c) sqrt(pdf(x))``

    In particular, these values are finite if `pdf` is bounded and
    ``x**2 * pdf(x)`` is bounded (i.e. subquadratic tails).
    One can generate `(U, V)` uniformly on `R` and return
    `V/U + c` if `(U, V)` are also in `A` which can be directly
    verified.

    Intuitively, the method works well if `A` fills up most of the
    enclosing rectangle such that the probability is high that `(U, V)`
    lies in `A` whenever it lies in `R` as the number of required
    iterations becomes too large otherwise. To be more precise, note that
    the expected number of iterations to draw `(U, V)` uniformly
    distributed on `R` such that `(U, V)` is also in `A` is given by
    the ratio ``area(R) / area(A) = 2 * umax * (vmax - vmin)``, using the fact
    that the area of `A` is equal to 1/2 (Theorem 7.1 in [1]_). A warning
    is displayed if this ratio is larger than 20. Moreover, if the sampling
    fails to generate a single random variate after 50000 iterations (i.e.
    not a single draw is in `A`), an exception is raised.

    If the bounding rectangle is not correctly specified (i.e. if it does not
    contain `A`), the algorithm samples from a distribution different from
    the one given by `pdf`. It is therefore recommended to perform a
    test such as `stats.kstest` as a check.

    References
    ----------
    .. [1] L. Devroye, "Non-Uniform Random Variate Generation",
       Springer-Verlag, 1986.

    .. [2] W. Hoermann and J. Leydold, "Generating generalized inverse Gaussian
       random variates", Statistics and Computing, 24(4), p. 547--557, 2014.

    .. [3] A.J. Kinderman and J.F. Monahan, "Computer Generation of Random
       Variables Using the Ratio of Uniform Deviates",
       ACM Transactions on Mathematical Software, 3(3), p. 257--260, 1977.

    Examples
    --------
    >>> from scipy import stats

    Simulate normally distributed random variables. It is easy to compute the
    bounding rectangle explicitly in that case.

    >>> f = stats.norm.pdf
    >>> v_bound = np.sqrt(f(np.sqrt(2))) * np.sqrt(2)
    >>> umax, vmin, vmax = np.sqrt(f(0)), -v_bound, v_bound
    >>> np.random.seed(12345)
    >>> rvs = stats.rvs_ratio_uniforms(f, umax, vmin, vmax, size=2500)

    The K-S test confirms that the random variates are indeed normally
    distributed (normality is not rejected at 5% significance level):

    >>> stats.kstest(rvs, 'norm')[1]
    0.3420173467307603

    The exponential distribution provides another example where the bounding
    rectangle can be determined explicitly.

    >>> np.random.seed(12345)
    >>> rvs = stats.rvs_ratio_uniforms(lambda x: np.exp(-x), umax=1,
    ...                                vmin=0, vmax=2*np.exp(-1), size=1000)
    >>> stats.kstest(rvs, 'expon')[1]
    0.928454552559516

    Sometimes it can be helpful to use a non-zero shift parameter `c`, see e.g.
    [2]_ above in the case of the generalized inverse Gaussian distribution.

    """
    # --- argument validation (the vmin/vmax check comes first) ---
    if vmin >= vmax:
        raise ValueError("vmin must be smaller than vmax.")

    if umax <= 0:
        raise ValueError("umax must be positive.")

    # Rejection constant (see [1]): expected candidates per accepted variate.
    expected_tries = 2 * (vmax - vmin) * umax
    if expected_tries > 20:
        warnings.warn(
            ("The expected number of iterations to generate a single random "
             "number from the desired distribution is larger than {}, "
             "potentially causing bad performance.".format(int(expected_tries))),
            RuntimeWarning)

    out_shape = tuple(np.atleast_1d(size))
    total = np.prod(out_shape)  # samples are stored flat, reshaped on return

    generator = check_random_state(random_state)
    samples = np.zeros(total)
    filled = 0
    attempt = 1

    # Each pass draws one candidate per still-missing sample.  The expected
    # runtime is finite; to rule out an endless loop an exception is raised
    # when nothing at all has been accepted after 50000 candidates.  Even
    # with a rejection constant of 1000 the probability of that event is
    # (1-1/1000)**50000, which is of order 10e-22.
    while True:
        missing = total - filled
        # uniform points in the rectangle [0, umax] x [vmin, vmax]
        u = umax * generator.random_sample(size=missing)
        v = vmin + (vmax - vmin) * generator.random_sample(size=missing)
        # rejection step: keep the ratios whose point falls inside A
        candidates = v / u + c
        inside = (u**2 <= pdf(candidates))
        n_inside = np.sum(inside)
        if n_inside > 0:
            n_store = min(n_inside, missing)
            samples[filled:(filled + n_store)] = candidates[inside][0:n_store]
            filled += n_store
        if filled >= total:
            return np.reshape(samples, out_shape)
        if (filled == 0) and (attempt*total >= 50000):
            raise RuntimeError(
                ("Not a single random variate could be generated in {} "
                 "attempts. The ratio of uniforms method does not appear "
                 "to work for the provided parameters. Please check the "
                 "pdf and the bounds.".format(attempt*total)))
        attempt += 1
|
||||
BIN
Binary file not shown.
@@ -1,389 +0,0 @@
|
||||
from collections import namedtuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from . import distributions
|
||||
|
||||
|
||||
__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
|
||||
|
||||
LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
                                                   'rvalue', 'pvalue',
                                                   'stderr'))


def linregress(x, y=None):
    """
    Calculate a linear least-squares regression for two sets of measurements.

    Parameters
    ----------
    x, y : array_like
        Two sets of measurements.  Both arrays should have the same length.
        If only x is given (and y=None), then it must be a two-dimensional
        array where one dimension has length 2.  The two sets of measurements
        are then found by splitting the array along the length-2 dimension.

    Returns
    -------
    slope : float
        slope of the regression line
    intercept : float
        intercept of the regression line
    rvalue : float
        correlation coefficient
    pvalue : float
        two-sided p-value for a hypothesis test whose null hypothesis is
        that the slope is zero, using Wald Test with t-distribution of
        the test statistic.
    stderr : float
        Standard error of the estimated gradient.

    Raises
    ------
    ValueError
        If only `x` is given and it cannot be split into two sets of
        measurements, or if either input is empty.

    See also
    --------
    :func:`scipy.optimize.curve_fit` : Use non-linear
     least squares to fit a function to data.
    :func:`scipy.optimize.leastsq` : Minimize the sum of
     squares of a set of equations.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from scipy import stats

    Generate some data:

    >>> np.random.seed(12345678)
    >>> x = np.random.random(10)
    >>> y = 1.6*x + np.random.random(10)

    Perform the linear regression:

    >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    >>> print("slope: %f    intercept: %f" % (slope, intercept))
    slope: 1.944864    intercept: 0.268578

    To get coefficient of determination (r_squared):

    >>> print("r-squared: %f" % r_value**2)
    r-squared: 0.735498

    Plot the data along with the fitted line:

    >>> plt.plot(x, y, 'o', label='original data')
    >>> plt.plot(x, intercept + slope*x, 'r', label='fitted line')
    >>> plt.legend()
    >>> plt.show()

    """
    # Keeps the denominator of the t statistic away from zero when |r| == 1.
    TINY = 1.0e-20
    if y is None:  # x is a (2, N) or (N, 2) shaped array_like
        x = np.asarray(x)
        # The ndim guards make 0-D and 1-D input reach the ValueError below
        # instead of failing with an IndexError on the shape lookup.
        if x.ndim >= 1 and x.shape[0] == 2:
            x, y = x
        elif x.ndim >= 2 and x.shape[1] == 2:
            x, y = x.T
        else:
            msg = ("If only `x` is given as input, it has to be of shape "
                   "(2, N) or (N, 2), provided shape was %s" % str(x.shape))
            raise ValueError(msg)
    else:
        x = np.asarray(x)
        y = np.asarray(y)

    if x.size == 0 or y.size == 0:
        raise ValueError("Inputs must not be empty.")

    n = len(x)
    xmean = np.mean(x, None)
    ymean = np.mean(y, None)

    # average sum of squares:
    ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
    r_num = ssxym
    r_den = np.sqrt(ssxm * ssym)
    if r_den == 0.0:
        r = 0.0
    else:
        r = r_num / r_den
        # test for numerical error propagation
        if r > 1.0:
            r = 1.0
        elif r < -1.0:
            r = -1.0

    df = n - 2
    slope = r_num / ssxm
    intercept = ymean - slope*xmean
    if n == 2:
        # handle case when only two points are passed in
        if y[0] == y[1]:
            prob = 1.0
        else:
            prob = 0.0
        sterrest = 0.0
    else:
        t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
        prob = 2 * distributions.t.sf(np.abs(t), df)
        sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)

    return LinregressResult(slope, intercept, r, prob, sterrest)
|
||||
|
||||
|
||||
def theilslopes(y, x=None, alpha=0.95):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression.  It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".

    Returns
    -------
    medslope : float
        Theil slope.
    medintercept : float
        Intercept of the Theil line, as ``median(y) - medslope*median(x)``.
    lo_slope : float
        Lower bound of the confidence interval on `medslope`.
    up_slope : float
        Upper bound of the confidence interval on `medslope`.

    See also
    --------
    siegelslopes : a similar technique using repeated medians

    Notes
    -----
    The implementation of `theilslopes` follows [1]_. The intercept is
    not defined in [1]_, and here it is defined as ``median(y) -
    medslope*median(x)``, which is given in [3]_. Other definitions of
    the intercept exist in the literature. A confidence interval for
    the intercept is not given as this question is not addressed in
    [1]_.

    References
    ----------
    .. [1] P.K. Sen, "Estimates of the regression coefficient based on Kendall's tau",
           J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
    .. [2] H. Theil, "A rank-invariant method of linear and polynomial
           regression analysis I, II and III",  Nederl. Akad. Wetensch., Proc.
           53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
    .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
           John Wiley and Sons, New York, pp. 493.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    >>> x = np.linspace(-5, 5, num=150)
    >>> y = x + np.random.normal(size=x.size)
    >>> y[11:15] += 10  # add outliers
    >>> y[-5:] -= 7

    Compute the slope, intercept and 90% confidence interval.  For comparison,
    also compute the least-squares fit with `linregress`:

    >>> res = stats.theilslopes(y, x, 0.90)
    >>> lsq_res = stats.linregress(x, y)

    Plot the results. The Theil-Sen regression line is shown in red, with the
    dashed red lines illustrating the confidence interval of the slope (note
    that the dashed red lines are not the confidence interval of the regression
    as the confidence interval of the intercept is not included). The green
    line shows the least-squares fit for comparison.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, y, 'b.')
    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
    >>> ax.plot(x, res[1] + res[2] * x, 'r--')
    >>> ax.plot(x, res[1] + res[3] * x, 'r--')
    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
    >>> plt.show()

    """
    # Work on private flat copies of x and y, because _find_repeats is
    # allowed to overwrite the arrays it is given.
    y = np.array(y).flatten()
    if x is None:
        x = np.arange(len(y), dtype=float)
    else:
        x = np.array(x, dtype=float).flatten()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))

    # Pairwise differences; only pairs with a positive x difference give a slope.
    dx = x[:, np.newaxis] - x
    dy = y[:, np.newaxis] - y
    forward = dx > 0
    slopes = np.sort(dy[forward] / dx[forward])

    medslope = np.median(slopes)
    medinter = np.median(y) - medslope * np.median(x)

    # Confidence interval: alpha and 1 - alpha select the same interval.
    tail = 1. - alpha if alpha > 0.5 else alpha
    z = distributions.norm.ppf(tail / 2.)

    # Tie counts of x and y for the variance formula (2.6) in Sen (1968).
    # These calls come after the medians above because they may sort x and y.
    _, x_ties = _find_repeats(x)
    _, y_ties = _find_repeats(y)

    def _term(k):
        return k * (k-1) * (2*k + 5)

    n_slopes = len(slopes)   # N in Sen (1968)
    n_points = len(y)   # n in Sen (1968)
    # Equation 2.6 in Sen (1968):
    sigsq = 1/18. * (_term(n_points) -
                     sum(_term(k) for k in x_ties) -
                     sum(_term(k) for k in y_ties))
    sigma = np.sqrt(sigsq)

    # Positions of the interval bounds inside the sorted slopes.
    upper_idx = min(int(np.round((n_slopes - z*sigma)/2.)), len(slopes)-1)
    lower_idx = max(int(np.round((n_slopes + z*sigma)/2.)) - 1, 0)
    return medslope, medinter, slopes[lower_idx], slopes[upper_idx]
|
||||
|
||||
|
||||
def _find_repeats(arr):
|
||||
# This function assumes it may clobber its input.
|
||||
if len(arr) == 0:
|
||||
return np.array(0, np.float64), np.array(0, np.intp)
|
||||
|
||||
# XXX This cast was previously needed for the Fortran implementation,
|
||||
# should we ditch it?
|
||||
arr = np.asarray(arr, np.float64).ravel()
|
||||
arr.sort()
|
||||
|
||||
# Taken from NumPy 1.9's np.unique.
|
||||
change = np.concatenate(([True], arr[1:] != arr[:-1]))
|
||||
unique = arr[change]
|
||||
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
|
||||
freq = np.diff(change_idx)
|
||||
atleast2 = freq > 1
|
||||
return unique[atleast2], freq[atleast2]
|
||||
|
||||
|
||||
def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Compute the Siegel repeated-median estimator for points (x, y).

    A straight line is fitted to the points (x, y) by robust linear
    regression based on repeated medians (see [1]_). The estimator
    tolerates outliers, with an asymptotic breakdown point of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, ``arange(len(y))`` is used.
    method : {'hierarchical', 'separate'}
        If 'hierarchical' (the default), the intercept is estimated using
        the already estimated slope ``medslope``.
        If 'separate', the intercept is estimated independently of the
        slope. See Notes for details.

    Returns
    -------
    medslope : float
        Estimated slope of the regression line.
    medintercept : float
        Estimated intercept of the regression line.

    Raises
    ------
    ValueError
        If `method` is not one of the two supported values, or if `x` is
        given and its length differs from that of `y`.

    See also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    Let ``n = len(y)``. For every point ``(x[j], y[j])`` the median ``m_j``
    of the slopes to all other points with a different abscissa is
    computed; ``medslope`` is the median of all ``m_j``.

    [1]_ describes two ways to obtain the intercept, selected with
    ``method``. The hierarchical variant takes ``medintercept`` as the
    median of ``y - medslope*x``. The separate variant computes, for each
    point ``(x[j], y[j])``, the median ``i_j`` of the intercepts of the
    lines through that point and each of the other points;
    ``medintercept`` is then the median of the ``i_j``.

    The implementation evaluates `n` medians of vectors of size up to `n`,
    which can be slow for large inputs. Faster algorithms exist (see
    [2]_) but are not implemented here.

    References
    ----------
    .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
           Biometrika, Vol. 69, pp. 242-244, 1982.

    .. [2] A. Stein and M. Werman, "Finding the repeated median regression
           line", Proceedings of the Third Annual ACM-SIAM Symposium on
           Discrete Algorithms, pp. 409-413, 1992.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    >>> x = np.linspace(-5, 5, num=150)
    >>> y = x + np.random.normal(size=x.size)
    >>> y[11:15] += 10  # add outliers
    >>> y[-5:] -= 7

    Compute the slope and intercept. For comparison, also compute the
    least-squares fit with `linregress`:

    >>> res = stats.siegelslopes(y, x)
    >>> lsq_res = stats.linregress(x, y)

    Plot the results. The Siegel regression line is shown in red. The green
    line shows the least-squares fit for comparison.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, y, 'b.')
    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
    >>> plt.show()

    """
    if method not in ('hierarchical', 'separate'):
        raise ValueError("method can only be 'hierarchical' or 'separate'")
    want_separate = (method == 'separate')

    y = np.asarray(y).ravel()
    if x is not None:
        x = np.asarray(x, dtype=float).ravel()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
    else:
        x = np.arange(len(y), dtype=float)

    # Row j holds the coordinate differences between point j and every point.
    dx = x[:, np.newaxis] - x
    dy = y[:, np.newaxis] - y

    point_slopes = []
    point_intercepts = []
    for j, (dx_row, dy_row) in enumerate(zip(dx, dy)):
        # Pairs sharing the same abscissa define no slope; leave them out.
        usable = dx_row != 0
        run = dx_row[usable]
        point_slopes.append(np.median(dy_row[usable] / run))
        if want_separate:
            # Intercept of the line through points j and k is
            # (y[k]*x[j] - y[j]*x[k]) / (x[j] - x[k]).
            cross = y*x[j] - y[j]*x
            point_intercepts.append(np.median(cross[usable] / run))

    medslope = np.median(np.asarray(point_slopes))
    if want_separate:
        medinter = np.median(np.asarray(point_intercepts))
    else:
        medinter = np.median(y - medslope*x)

    return medslope, medinter
|
||||
@@ -1,201 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy import poly1d
|
||||
from scipy.special import beta
|
||||
|
||||
|
||||
# The following code was used to generate the Pade coefficients for the
|
||||
# Tukey Lambda variance function. Version 0.17 of mpmath was used.
|
||||
#---------------------------------------------------------------------------
|
||||
# import mpmath as mp
|
||||
#
|
||||
# mp.mp.dps = 60
|
||||
#
|
||||
# one = mp.mpf(1)
|
||||
# two = mp.mpf(2)
|
||||
#
|
||||
# def mpvar(lam):
|
||||
# if lam == 0:
|
||||
# v = mp.pi**2 / mp.mpf(3)
|
||||
# else:
|
||||
# v = (two / lam**2) * (one / (one + two*lam) -
|
||||
# mp.beta(lam + one, lam + one))
|
||||
# return v
|
||||
#
|
||||
# t = mp.taylor(mpvar, 0, 8)
|
||||
# p, q = mp.pade(t, 4, 4)
|
||||
# print("p =", [mp.fp.mpf(c) for c in p])
|
||||
# print("q =", [mp.fp.mpf(c) for c in q])
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
# Numerator (pc) and denominator (qc) coefficients of the Pade approximant
# to the Tukey Lambda variance, listed from the constant term upwards.
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
                       -0.5370742306855439, 0.17292046290190008,
                       -0.02371146284628187]
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
                       1.7660926747377275, 0.2643989311168465]

# numpy.poly1d wants the highest-order coefficient first, hence the
# reversed copies of the lists above.
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])


def tukeylambda_variance(lam):
    """Variance of the Tukey Lambda distribution.

    Parameters
    ----------
    lam : array_like
        The lambda values at which to compute the variance.

    Returns
    -------
    v : ndarray
        The variance, with the same shape as `lam` (0-d for scalar input).
        For lam < -0.5 the variance is not defined and np.nan is returned.
        For lam = -0.5, np.inf is returned.

    Notes
    -----
    For ``abs(lam) < 0.075`` the [4,4] Pade approximation is evaluated.
    Everywhere else the standard formula
    (https://en.wikipedia.org/wiki/Tukey_lambda_distribution) is used. The
    standard formula has a removable discontinuity at lambda = 0 and loses
    numerical accuracy close to it, which is why the Pade approximation
    takes over there.
    """
    lam = np.asarray(lam)
    out_shape = lam.shape
    lam = np.atleast_1d(lam).astype(np.float64)

    # Below this absolute value of lam the Pade approximation is used.
    pade_cutoff = 0.075

    # Partition the inputs into the four evaluation regimes.
    undefined = lam < -0.5                  # variance does not exist
    infinite = lam == -0.5                  # variance diverges
    near_zero = np.abs(lam) < pade_cutoff   # Pade approximation
    regular = ~(undefined | infinite | near_zero)   # explicit formula

    v = np.empty_like(lam)
    v[undefined] = np.nan
    v[infinite] = np.inf

    lam_small = lam[near_zero]
    if lam_small.size > 0:
        v[near_zero] = (_tukeylambda_var_p(lam_small) /
                        _tukeylambda_var_q(lam_small))

    lam_reg = lam[regular]
    if lam_reg.size > 0:
        v[regular] = (2.0 / lam_reg**2) * (1.0 / (1.0 + 2 * lam_reg) -
                                           beta(lam_reg + 1, lam_reg + 1))

    # Restore the caller's shape; scalar input yields a 0-d array.
    v.shape = out_shape
    return v
|
||||
|
||||
|
||||
# The following code was used to generate the Pade coefficients for the
|
||||
# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
|
||||
#---------------------------------------------------------------------------
|
||||
# import mpmath as mp
|
||||
#
|
||||
# mp.mp.dps = 60
|
||||
#
|
||||
# one = mp.mpf(1)
|
||||
# two = mp.mpf(2)
|
||||
# three = mp.mpf(3)
|
||||
# four = mp.mpf(4)
|
||||
#
|
||||
# def mpkurt(lam):
|
||||
# if lam == 0:
|
||||
# k = mp.mpf(6)/5
|
||||
# else:
|
||||
# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
|
||||
# three*mp.beta(two*lam+one, two*lam+one))
|
||||
# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
|
||||
# k = numer / denom - three
|
||||
# return k
|
||||
#
|
||||
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
|
||||
# # taylor function and we request a degree 9 Taylor polynomial, we actually
|
||||
# # get degree 8.
|
||||
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
|
||||
# t = [mp.chop(c, tol=1e-15) for c in t]
|
||||
# p, q = mp.pade(t, 4, 4)
|
||||
# print("p =", [mp.fp.mpf(c) for c in p])
|
||||
# print("q =", [mp.fp.mpf(c) for c in q])
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
# Numerator (pc) and denominator (qc) coefficients of the Pade approximant
# to the Tukey Lambda kurtosis, listed from the constant term upwards.
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
                        0.20601184383406815, 4.59796302262789]
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
                        0.43075235247853005, -2.789746758009912]

# numpy.poly1d wants the highest-order coefficient first, hence the
# reversed copies of the lists above.
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])


def tukeylambda_kurtosis(lam):
    """Kurtosis of the Tukey Lambda distribution.

    Parameters
    ----------
    lam : array_like
        The lambda values at which to compute the kurtosis.

    Returns
    -------
    k : ndarray
        The kurtosis (the explicit formula subtracts 3), with the same
        shape as `lam` (0-d for scalar input). For lam < -0.25 the
        kurtosis is not defined and np.nan is returned. For lam = -0.25,
        np.inf is returned.

    Notes
    -----
    For ``abs(lam) < 0.055`` a Pade approximation is evaluated; everywhere
    else the explicit formula in terms of the beta function is used.
    """
    lam = np.asarray(lam)
    out_shape = lam.shape
    lam = np.atleast_1d(lam).astype(np.float64)

    # Below this absolute value of lam the Pade approximation is used.
    pade_cutoff = 0.055

    # Partition the inputs into the four evaluation regimes.
    undefined = lam < -0.25                 # kurtosis does not exist
    infinite = lam == -0.25                 # kurtosis diverges
    near_zero = np.abs(lam) < pade_cutoff   # Pade approximation
    regular = ~(undefined | infinite | near_zero)   # explicit formula

    k = np.empty_like(lam)
    k[undefined] = np.nan
    k[infinite] = np.inf

    lam_small = lam[near_zero]
    if lam_small.size > 0:
        k[near_zero] = (_tukeylambda_kurt_p(lam_small) /
                        _tukeylambda_kurt_q(lam_small))

    lam_reg = lam[regular]
    if lam_reg.size > 0:
        numer = (1.0 / (4 * lam_reg + 1) -
                 4 * beta(3 * lam_reg + 1, lam_reg + 1) +
                 3 * beta(2 * lam_reg + 1, 2 * lam_reg + 1))
        denom = 2 * (1.0/(2 * lam_reg + 1) -
                     beta(lam_reg + 1, lam_reg + 1))**2
        k[regular] = numer / denom - 3

    # The result is always a numpy array; restoring the shape makes a
    # scalar `lam` come back as a 0-d array.
    k.shape = out_shape
    return k
|
||||
@@ -1,274 +0,0 @@
|
||||
"""Some functions for working with contingency tables (i.e. cross tabulations).
|
||||
"""
|
||||
|
||||
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from functools import reduce
|
||||
import numpy as np
|
||||
from .stats import power_divergence
|
||||
|
||||
|
||||
__all__ = ['margins', 'expected_freq', 'chi2_contingency']
|
||||
|
||||
|
||||
def margins(a):
    """Return a list of the marginal sums of the array `a`.

    Parameters
    ----------
    a : ndarray
        The array for which to compute the marginal sums.

    Returns
    -------
    margsums : list of ndarrays
        A list of length `a.ndim`. `margsums[k]` is obtained by summing
        `a` over every axis other than `k`. It keeps the number of
        dimensions of `a`, with every axis except axis `k` having
        length 1.

    Examples
    --------
    >>> a = np.arange(12).reshape(2, 6)
    >>> a
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11]])
    >>> m0, m1 = margins(a)
    >>> m0
    array([[15],
           [51]])
    >>> m1
    array([[ 6,  8, 10, 12, 14, 16]])

    >>> b = np.arange(24).reshape(2,3,4)
    >>> m0, m1, m2 = margins(b)
    >>> m0
    array([[[ 66]],
           [[210]]])
    >>> m1
    array([[[ 60],
            [ 92],
            [124]]])
    >>> m2
    array([[[60, 66, 72, 78]]])
    """
    axes = list(range(a.ndim))
    # For each axis to keep, collapse all the remaining axes;
    # apply_over_axes leaves the collapsed axes in place with length 1.
    return [np.apply_over_axes(np.sum, a,
                               [other for other in axes if other != kept])
            for kept in axes]
|
||||
|
||||
|
||||
def expected_freq(observed):
    """
    Compute the expected frequencies from a contingency table.

    For an n-dimensional contingency table of observed frequencies, the
    expected frequencies are derived from the marginal sums of the table,
    assuming that the groups associated with each dimension are
    independent.

    Parameters
    ----------
    observed : array_like
        The table of observed frequencies.  (A 1-D array is accepted, but
        that case is trivial.  Generally `observed` is at least 2-D.)

    Returns
    -------
    expected : ndarray of float64
        The expected frequencies, based on the marginal sums of the table.
        Same shape as `observed`.

    Examples
    --------
    >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
    >>> from scipy.stats import expected_freq
    >>> expected_freq(observed)
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    """
    # `observed` is usually an integer array.  With many dimensions or
    # large counts the products below could overflow, so do all of the
    # arithmetic in floating point.
    table = np.asarray(observed, dtype=np.float64)

    # One marginal sum per axis; each keeps the table's number of
    # dimensions, so their product broadcasts to the table's shape.
    marginals = margins(table)
    joint = reduce(np.multiply, marginals)

    # Under independence: product of the marginals over total**(ndim - 1).
    grand_total = table.sum()
    ndim = table.ndim
    expected = joint / grand_total ** (ndim - 1)
    return expected
|
||||
|
||||
|
||||
def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables in a contingency table.

    This function computes the chi-square statistic and p-value for the
    hypothesis test of independence of the observed frequencies in the
    contingency table [1]_ `observed`.  The expected frequencies are computed
    based on the marginal sums under the assumption of independence; see
    `scipy.stats.contingency.expected_freq`.  The number of degrees of
    freedom is (expressed using numpy functions and attributes)::

        dof = observed.size - sum(observed.shape) + observed.ndim - 1


    Parameters
    ----------
    observed : array_like
        The contingency table. The table contains the observed frequencies
        (i.e. number of occurrences) in each category.  In the two-dimensional
        case, the table is often described as an "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates' correction
        for continuity.  The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.  The
        adjustment is never larger than the distance between the observed
        and the expected value, so an observed value is never moved past
        its expected value.
    lambda_ : float or str, optional.
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic [2]_.  `lambda_` allows a statistic from the
        Cressie-Read power divergence family [3]_ to be used instead.  See
        `power_divergence` for details.

    Returns
    -------
    chi2 : float
        The test statistic.
    p : float
        The p-value of the test
    dof : int
        Degrees of freedom
    expected : ndarray, same shape as `observed`
        The expected frequencies, based on the marginal sums of the table.

    Raises
    ------
    ValueError
        If `observed` contains a negative value, has size 0, or if the
        computed table of expected frequencies contains a zero.

    See Also
    --------
    contingency.expected_freq
    fisher_exact
    chisquare
    power_divergence

    Notes
    -----
    An often quoted guideline for the validity of this calculation is that
    the test should be used only if the observed and expected frequencies
    in each cell are at least 5.

    This is a test for the independence of different categories of a
    population. The test is only meaningful when the dimension of
    `observed` is two or more.  Applying the test to a one-dimensional
    table will always result in `expected` equal to `observed` and a
    chi-square statistic equal to 0.

    This function does not handle masked arrays, because the calculation
    does not make sense with missing values.

    Like stats.chisquare, this function computes a chi-square statistic;
    the convenience this function provides is to figure out the expected
    frequencies and degrees of freedom from the given contingency table.
    If these were already known, and if the Yates' correction was not
    required, one could use stats.chisquare.  That is, if one calls::

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    then the following is true::

        (chi2, p) == stats.chisquare(obs.ravel(), f_exp=ex.ravel(),
                                     ddof=obs.size - 1 - dof)

    The `lambda_` argument was added in version 0.13.0 of scipy.

    References
    ----------
    .. [1] "Contingency table",
           https://en.wikipedia.org/wiki/Contingency_table
    .. [2] "Pearson's chi-squared test",
           https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
           pp. 440-464.

    Examples
    --------
    A two-way example (2 x 3):

    >>> from scipy.stats import chi2_contingency
    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
    >>> chi2_contingency(obs)
    (2.7777777777777777,
     0.24935220877729619,
     2,
     array([[ 12.,  12.,  16.],
            [ 18.,  18.,  24.]]))

    Perform the test using the log-likelihood ratio (i.e. the "G-test")
    instead of Pearson's chi-squared statistic.

    >>> g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
    >>> g, p
    (2.7688587616781319, 0.25046668010954165)

    A four-way example (2 x 2 x 2 x 2):

    >>> obs = np.array(
    ...     [[[[12, 17],
    ...        [11, 16]],
    ...       [[11, 12],
    ...        [15, 16]]],
    ...      [[[23, 15],
    ...        [30, 22]],
    ...       [[14, 17],
    ...        [15, 16]]]])
    >>> chi2_contingency(obs)
    (8.7584514426741897,
     0.64417725029295503,
     11,
     array([[[[ 14.15462386,  14.15462386],
              [ 16.49423111,  16.49423111]],
             [[ 11.2461395 ,  11.2461395 ],
              [ 13.10500554,  13.10500554]]],
            [[[ 19.5591166 ,  19.5591166 ],
              [ 22.79202844,  22.79202844]],
             [[ 15.54012004,  15.54012004],
              [ 18.10873492,  18.10873492]]]]))
    """
    observed = np.asarray(observed)
    if np.any(observed < 0):
        raise ValueError("All values in `observed` must be nonnegative.")
    if observed.size == 0:
        raise ValueError("No data; `observed` has size 0.")

    expected = expected_freq(observed)
    if np.any(expected == 0):
        # Include one of the positions where expected is zero in
        # the exception message.
        zeropos = list(zip(*np.nonzero(expected == 0)))[0]
        raise ValueError("The internally computed table of expected "
                         "frequencies has a zero element at %s." % (zeropos,))

    # The degrees of freedom
    dof = expected.size - sum(expected.shape) + expected.ndim - 1

    if dof == 0:
        # Degenerate case; this occurs when `observed` is 1D (or, more
        # generally, when it has only one nontrivial dimension).  In this
        # case, we also have observed == expected, so chi2 is 0.
        chi2 = 0.0
        p = 1.0
    else:
        if dof == 1 and correction:
            # Adjust `observed` according to Yates' correction for
            # continuity.  The shift is capped at the actual difference:
            # an unconditional 0.5 step would push a cell that is closer
            # than 0.5 to its expected value past it, increasing the
            # statistic instead of reducing it.
            diff = expected - observed
            magnitude = np.minimum(0.5, np.abs(diff))
            observed = observed + magnitude * np.sign(diff)

        chi2, p = power_divergence(observed, expected,
                                   ddof=observed.size - 1 - dof, axis=None,
                                   lambda_=lambda_)

    return chi2, p, dof, expected
|
||||
@@ -1,24 +0,0 @@
|
||||
#
|
||||
# Author: Travis Oliphant 2002-2011 with contributions from
|
||||
# SciPy Developers 2004-2011
|
||||
#
|
||||
# NOTE: To look at history using `git blame`, use `git blame -M -C -C`
|
||||
# instead of `git blame -Lxxx,+x`.
|
||||
#
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from ._distn_infrastructure import (entropy, rv_discrete, rv_continuous,
|
||||
rv_frozen)
|
||||
|
||||
from . import _continuous_distns
|
||||
from . import _discrete_distns
|
||||
|
||||
from ._continuous_distns import *
|
||||
from ._discrete_distns import *
|
||||
|
||||
# For backwards compatibility e.g. pymc expects distributions.__all__.
|
||||
__all__ = ['entropy', 'rv_discrete', 'rv_continuous', 'rv_histogram']
|
||||
|
||||
# Add only the distribution names, not the *_gen names.
|
||||
__all__ += _continuous_distns._distn_names
|
||||
__all__ += _discrete_distns._distn_names
|
||||
@@ -1,625 +0,0 @@
|
||||
#-------------------------------------------------------------------------------
|
||||
#
|
||||
# Define classes for (uni/multi)-variate kernel density estimation.
|
||||
#
|
||||
# Currently, only Gaussian kernels are implemented.
|
||||
#
|
||||
# Written by: Robert Kern
|
||||
#
|
||||
# Date: 2004-08-09
|
||||
#
|
||||
# Modified: 2005-02-10 by Robert Kern.
|
||||
# Contributed to Scipy
|
||||
# 2005-10-07 by Robert Kern.
|
||||
# Some fixes to match the new scipy_core
|
||||
#
|
||||
# Copyright 2004-2005 by Enthought, Inc.
|
||||
#
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
# Standard library imports.
|
||||
import warnings
|
||||
|
||||
# Scipy imports.
|
||||
from scipy._lib.six import callable, string_types
|
||||
from scipy import linalg, special
|
||||
from scipy.special import logsumexp
|
||||
from scipy._lib._numpy_compat import cov
|
||||
|
||||
from numpy import (atleast_2d, reshape, zeros, newaxis, dot, exp, pi, sqrt,
|
||||
ravel, power, atleast_1d, squeeze, sum, transpose, ones)
|
||||
import numpy as np
|
||||
from numpy.random import choice, multivariate_normal
|
||||
|
||||
# Local imports.
|
||||
from . import mvn
|
||||
|
||||
|
||||
__all__ = ['gaussian_kde']
|
||||
|
||||
|
||||
class gaussian_kde(object):
|
||||
"""Representation of a kernel-density estimate using Gaussian kernels.
|
||||
|
||||
Kernel density estimation is a way to estimate the probability density
|
||||
function (PDF) of a random variable in a non-parametric way.
|
||||
`gaussian_kde` works for both uni-variate and multi-variate data. It
|
||||
includes automatic bandwidth determination. The estimation works best for
|
||||
a unimodal distribution; bimodal or multi-modal distributions tend to be
|
||||
oversmoothed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset : array_like
|
||||
Datapoints to estimate from. In case of univariate data this is a 1-D
|
||||
array, otherwise a 2-D array with shape (# of dims, # of data).
|
||||
bw_method : str, scalar or callable, optional
|
||||
The method used to calculate the estimator bandwidth. This can be
|
||||
'scott', 'silverman', a scalar constant or a callable. If a scalar,
|
||||
this will be used directly as `kde.factor`. If a callable, it should
|
||||
take a `gaussian_kde` instance as only parameter and return a scalar.
|
||||
If None (default), 'scott' is used. See Notes for more details.
|
||||
weights : array_like, optional
|
||||
weights of datapoints. This must be the same shape as dataset.
|
||||
If None (default), the samples are assumed to be equally weighted
|
||||
|
||||
Attributes
|
||||
----------
|
||||
dataset : ndarray
|
||||
The dataset with which `gaussian_kde` was initialized.
|
||||
d : int
|
||||
Number of dimensions.
|
||||
n : int
|
||||
Number of datapoints.
|
||||
neff : int
|
||||
Effective number of datapoints.
|
||||
|
||||
.. versionadded:: 1.2.0
|
||||
factor : float
|
||||
The bandwidth factor, obtained from `kde.covariance_factor`, with which
|
||||
the covariance matrix is multiplied.
|
||||
covariance : ndarray
|
||||
The covariance matrix of `dataset`, scaled by the calculated bandwidth
|
||||
(`kde.factor`).
|
||||
inv_cov : ndarray
|
||||
The inverse of `covariance`.
|
||||
|
||||
Methods
|
||||
-------
|
||||
evaluate
|
||||
__call__
|
||||
integrate_gaussian
|
||||
integrate_box_1d
|
||||
integrate_box
|
||||
integrate_kde
|
||||
pdf
|
||||
logpdf
|
||||
resample
|
||||
set_bandwidth
|
||||
covariance_factor
|
||||
|
||||
Notes
|
||||
-----
|
||||
Bandwidth selection strongly influences the estimate obtained from the KDE
|
||||
(much more so than the actual shape of the kernel). Bandwidth selection
|
||||
can be done by a "rule of thumb", by cross-validation, by "plug-in
|
||||
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
|
||||
uses a rule of thumb, the default is Scott's Rule.
|
||||
|
||||
Scott's Rule [1]_, implemented as `scotts_factor`, is::
|
||||
|
||||
n**(-1./(d+4)),
|
||||
|
||||
with ``n`` the number of data points and ``d`` the number of dimensions.
|
||||
In the case of unequally weighted points, `scotts_factor` becomes::
|
||||
|
||||
neff**(-1./(d+4)),
|
||||
|
||||
with ``neff`` the effective number of datapoints.
|
||||
Silverman's Rule [2]_, implemented as `silverman_factor`, is::
|
||||
|
||||
(n * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
or in the case of unequally weighted points::
|
||||
|
||||
(neff * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
Good general descriptions of kernel density estimation can be found in [1]_
|
||||
and [2]_, the mathematics for this multi-dimensional implementation can be
|
||||
found in [1]_.
|
||||
|
||||
With a set of weighted samples, the effective number of datapoints ``neff``
|
||||
is defined by::
|
||||
|
||||
neff = sum(weights)^2 / sum(weights^2)
|
||||
|
||||
as detailed in [5]_.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
|
||||
Visualization", John Wiley & Sons, New York, Chichester, 1992.
|
||||
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
|
||||
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
|
||||
Chapman and Hall, London, 1986.
|
||||
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
|
||||
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
|
||||
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
|
||||
conditional density estimation", Computational Statistics & Data
|
||||
Analysis, Vol. 36, pp. 279-298, 2001.
|
||||
.. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
|
||||
Series A (General), 132, 272
|
||||
|
||||
Examples
|
||||
--------
|
||||
Generate some random two-dimensional data:
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> def measure(n):
|
||||
... "Measurement model, return two coupled measurements."
|
||||
... m1 = np.random.normal(size=n)
|
||||
... m2 = np.random.normal(scale=0.5, size=n)
|
||||
... return m1+m2, m1-m2
|
||||
|
||||
>>> m1, m2 = measure(2000)
|
||||
>>> xmin = m1.min()
|
||||
>>> xmax = m1.max()
|
||||
>>> ymin = m2.min()
|
||||
>>> ymax = m2.max()
|
||||
|
||||
Perform a kernel density estimate on the data:
|
||||
|
||||
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
|
||||
>>> positions = np.vstack([X.ravel(), Y.ravel()])
|
||||
>>> values = np.vstack([m1, m2])
|
||||
>>> kernel = stats.gaussian_kde(values)
|
||||
>>> Z = np.reshape(kernel(positions).T, X.shape)
|
||||
|
||||
Plot the results:
|
||||
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> fig, ax = plt.subplots()
|
||||
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
|
||||
... extent=[xmin, xmax, ymin, ymax])
|
||||
>>> ax.plot(m1, m2, 'k.', markersize=2)
|
||||
>>> ax.set_xlim([xmin, xmax])
|
||||
>>> ax.set_ylim([ymin, ymax])
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
def __init__(self, dataset, bw_method=None, weights=None):
    """Store the dataset and optional weights, then set the bandwidth.

    Parameters
    ----------
    dataset : array_like
        Datapoints to estimate from; promoted to 2-D, and its two
        dimensions are stored as ``self.d`` and ``self.n``.
    bw_method : str, scalar or callable, optional
        Passed on unchanged to `set_bandwidth`.
    weights : array_like, optional
        One weight per datapoint. They are normalized to sum to one on a
        private floating point copy, so the caller's array is left
        untouched and integer weights are accepted.

    Raises
    ------
    ValueError
        If `dataset` does not have more than one element, or if `weights`
        is not one-dimensional or not of length ``n``.
    """
    self.dataset = atleast_2d(dataset)
    if not self.dataset.size > 1:
        raise ValueError("`dataset` input should have multiple elements.")

    self.d, self.n = self.dataset.shape

    if weights is not None:
        # Take a float copy before normalizing: dividing in place on the
        # caller's array would modify it behind their back, and in-place
        # true division fails outright for integer arrays.
        self._weights = atleast_1d(weights).astype(float)
        self._weights /= sum(self._weights)
        if self._weights.ndim != 1:
            raise ValueError("`weights` input should be one-dimensional.")
        if len(self._weights) != self.n:
            raise ValueError("`weights` input should be of length n")
        # Effective number of datapoints for the normalized weights.
        self._neff = 1/sum(self._weights**2)

    self.set_bandwidth(bw_method=bw_method)
|
||||
|
||||
def evaluate(self, points):
|
||||
"""Evaluate the estimated pdf on a set of points.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
points : (# of dimensions, # of points)-array
|
||||
Alternatively, a (# of dimensions,) vector can be passed in and
|
||||
treated as a single point.
|
||||
|
||||
Returns
|
||||
-------
|
||||
values : (# of points,)-array
|
||||
The values at each point.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError : if the dimensionality of the input points is different than
|
||||
the dimensionality of the KDE.
|
||||
|
||||
"""
|
||||
points = atleast_2d(points)
|
||||
|
||||
d, m = points.shape
|
||||
if d != self.d:
|
||||
if d == 1 and m == self.d:
|
||||
# points was passed in as a row vector
|
||||
points = reshape(points, (self.d, 1))
|
||||
m = 1
|
||||
else:
|
||||
msg = "points have dimension %s, dataset has dimension %s" % (d,
|
||||
self.d)
|
||||
raise ValueError(msg)
|
||||
|
||||
result = zeros((m,), dtype=float)
|
||||
|
||||
whitening = linalg.cholesky(self.inv_cov)
|
||||
scaled_dataset = dot(whitening, self.dataset)
|
||||
scaled_points = dot(whitening, points)
|
||||
|
||||
if m >= self.n:
|
||||
# there are more points than data, so loop over data
|
||||
for i in range(self.n):
|
||||
diff = scaled_dataset[:, i, newaxis] - scaled_points
|
||||
energy = sum(diff * diff, axis=0) / 2.0
|
||||
result += self.weights[i]*exp(-energy)
|
||||
else:
|
||||
# loop over points
|
||||
for i in range(m):
|
||||
diff = scaled_dataset - scaled_points[:, i, newaxis]
|
||||
energy = sum(diff * diff, axis=0) / 2.0
|
||||
result[i] = sum(exp(-energy)*self.weights, axis=0)
|
||||
|
||||
result = result * self.n / self._norm_factor
|
||||
|
||||
return result
|
||||
|
||||
__call__ = evaluate
|
||||
|
||||
def integrate_gaussian(self, mean, cov):
    """
    Multiply estimated density by a multivariate Gaussian and integrate
    over the whole space.

    Parameters
    ----------
    mean : array_like
        A 1-D array, specifying the mean of the Gaussian.
    cov : array_like
        A 2-D array, specifying the covariance matrix of the Gaussian.

    Returns
    -------
    result : scalar
        The value of the integral.

    Raises
    ------
    ValueError
        If the mean or covariance of the input Gaussian differs from
        the KDE's dimensionality.

    """
    mean = atleast_1d(squeeze(mean))
    cov = atleast_2d(cov)

    expected_mean_shape = (self.d,)
    if mean.shape != expected_mean_shape:
        raise ValueError("mean does not have dimension %s" % self.d)
    expected_cov_shape = (self.d, self.d)
    if cov.shape != expected_cov_shape:
        raise ValueError("covariance does not have dimension %s" % self.d)

    # Column vector so it broadcasts against the (d, n) dataset.
    column_mean = mean[:, newaxis]

    total_cov = self.covariance + cov

    # cho_factor raises LinAlgError if the summed covariance is not
    # symmetric positive definite.  It returns (ndarray, bool), the bool
    # flagging whether the factor is lower or upper triangular.
    chol = linalg.cho_factor(total_cov)

    offsets = self.dataset - column_mean
    solved = linalg.cho_solve(chol, offsets)

    # Product of the Cholesky diagonal is the square root of the determinant.
    root_det = np.prod(np.diagonal(chol[0]))
    normalizer = power(2 * pi, total_cov.shape[0] / 2.0) * root_det

    exponents = sum(offsets * solved, axis=0) / 2.0
    return sum(exp(-exponents)*self.weights, axis=0) / normalizer
|
||||
|
||||
def integrate_box_1d(self, low, high):
    """
    Computes the integral of a 1D pdf between two bounds.

    Parameters
    ----------
    low : scalar
        Lower bound of integration.
    high : scalar
        Upper bound of integration.

    Returns
    -------
    value : scalar
        The result of the integral.

    Raises
    ------
    ValueError
        If the KDE is over more than one dimension.

    """
    if self.d != 1:
        raise ValueError("integrate_box_1d() only handles 1D pdfs")

    # Kernel standard deviation: square root of the single covariance entry.
    sigma = ravel(sqrt(self.covariance))[0]

    # Bounds expressed in standard deviations from every data point.
    z_low = ravel((low - self.dataset) / sigma)
    z_high = ravel((high - self.dataset) / sigma)

    # Normal-CDF difference gives each kernel's mass inside [low, high].
    mass_per_kernel = special.ndtr(z_high) - special.ndtr(z_low)
    return np.sum(self.weights*mass_per_kernel)
|
||||
|
||||
def integrate_box(self, low_bounds, high_bounds, maxpts=None):
    """Computes the integral of a pdf over a rectangular interval.

    Parameters
    ----------
    low_bounds : array_like
        A 1-D array containing the lower bounds of integration.
    high_bounds : array_like
        A 1-D array containing the upper bounds of integration.
    maxpts : int, optional
        The maximum number of points to use for integration.

    Returns
    -------
    value : scalar
        The result of the integral.

    Notes
    -----
    A warning is issued when the integration routine reports (through its
    ``inform`` flag) that the point budget was not sufficient.

    """
    # Only forward `maxpts` when the caller set it, so the routine's own
    # default applies otherwise.
    extra_kwds = {} if maxpts is None else {'maxpts': maxpts}

    value, inform = mvn.mvnun_weighted(low_bounds, high_bounds,
                                       self.dataset, self.weights,
                                       self.covariance, **extra_kwds)
    if inform:
        # Report the limit that was actually in effect.  Without an explicit
        # `maxpts` the message keeps the historical figure of self.d * 1000.
        limit = self.d * 1000 if maxpts is None else maxpts
        msg = ('An integral in mvn.mvnun requires more points than %s' %
               (limit,))
        warnings.warn(msg)

    return value
|
||||
|
||||
def integrate_kde(self, other):
    """
    Computes the integral of the product of this kernel density estimate
    with another.

    Parameters
    ----------
    other : gaussian_kde instance
        The other kde.

    Returns
    -------
    value : scalar
        The result of the integral.

    Raises
    ------
    ValueError
        If the KDEs have different dimensionality.

    """
    if other.d != self.d:
        raise ValueError("KDEs are not the same dimensionality")

    # Run the Python-level loop over whichever KDE has fewer data points.
    small, large = (other, self) if other.n < self.n else (self, other)

    total_cov = small.covariance + large.covariance
    chol = linalg.cho_factor(total_cov)

    accum = 0.0
    for idx in range(small.n):
        center = small.dataset[:, idx, newaxis]
        offsets = large.dataset - center
        solved = linalg.cho_solve(chol, offsets)

        exponents = sum(offsets * solved, axis=0) / 2.0
        accum += sum(exp(-exponents)*large.weights, axis=0)*small.weights[idx]

    # Product of the Cholesky diagonal is the square root of the determinant.
    root_det = np.prod(np.diagonal(chol[0]))
    normalizer = power(2 * pi, total_cov.shape[0] / 2.0) * root_det

    accum /= normalizer
    return accum
|
||||
|
||||
def resample(self, size=None):
    """
    Randomly sample a dataset from the estimated pdf.

    Parameters
    ----------
    size : int, optional
        The number of samples to draw.  If not provided, then the size is
        the same as the effective number of samples in the underlying
        dataset.

    Returns
    -------
    resample : (self.d, `size`) ndarray
        The sampled dataset.

    """
    if size is None:
        size = int(self.neff)

    # Zero-mean noise drawn with the kernel covariance, one column per sample.
    zero_mean = zeros((self.d,), float)
    noise = transpose(multivariate_normal(zero_mean, self.covariance,
                                          size=size))

    # Pick the data point each sample is centred on, honouring the weights.
    picked = choice(self.n, size=size, p=self.weights)
    centers = self.dataset[:, picked]

    return centers + noise
|
||||
|
||||
def scotts_factor(self):
    # Bandwidth factor neff**(-1/(d+4)).  No docstring here on purpose: the
    # assignment to covariance_factor.__doc__ below sets this function's
    # __doc__, because both names refer to the same function object.
    exponent = -1./(self.d+4)
    return power(self.neff, exponent)


def silverman_factor(self):
    # Bandwidth factor (neff*(d+2)/4)**(-1/(d+4)).
    exponent = -1./(self.d+4)
    base = self.neff*(self.d+2.0)/4.0
    return power(base, exponent)


# Default method to calculate bandwidth, can be overwritten by subclass
covariance_factor = scotts_factor
covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
multiplies the data covariance matrix to obtain the kernel covariance
matrix. The default is `scotts_factor`. A subclass can overwrite this
method to provide a different method, or set it through a call to
`kde.set_bandwidth`."""
|
||||
|
||||
def set_bandwidth(self, bw_method=None):
    """Compute the estimator bandwidth with given method.

    The new bandwidth calculated after a call to `set_bandwidth` is used
    for subsequent evaluations of the estimated density.

    Parameters
    ----------
    bw_method : str, scalar or callable, optional
        The method used to calculate the estimator bandwidth.  This can be
        'scott', 'silverman', a scalar constant or a callable.  If a
        scalar, this will be used directly as `kde.factor`.  If a callable,
        it should take a `gaussian_kde` instance as only parameter and
        return a scalar.  If None (default), nothing happens; the current
        `kde.covariance_factor` method is kept.

    Raises
    ------
    ValueError
        If `bw_method` is none of the accepted kinds.

    Notes
    -----
    .. versionadded:: 0.11

    The kernel covariance is recomputed on every successful call, including
    when `bw_method` is None.

    Examples
    --------
    >>> import scipy.stats as stats
    >>> x1 = np.array([-7, -5, 1, 4, 5.])
    >>> kde = stats.gaussian_kde(x1)
    >>> xs = np.linspace(-10, 10, num=50)
    >>> y1 = kde(xs)
    >>> kde.set_bandwidth(bw_method='silverman')
    >>> y2 = kde(xs)
    >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
    >>> y3 = kde(xs)

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> ax.plot(x1, np.ones(x1.shape) / (4. * x1.size), 'bo',
    ...         label='Data points (rescaled)')
    >>> ax.plot(xs, y1, label='Scott (default)')
    >>> ax.plot(xs, y2, label='Silverman')
    >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
    >>> ax.legend()
    >>> plt.show()

    """
    if bw_method is not None:
        if bw_method == 'scott':
            self.covariance_factor = self.scotts_factor
        elif bw_method == 'silverman':
            self.covariance_factor = self.silverman_factor
        elif np.isscalar(bw_method) and not isinstance(bw_method, string_types):
            # A numeric constant is used directly as the factor.
            self._bw_method = 'use constant'
            self.covariance_factor = lambda: bw_method
        elif callable(bw_method):
            # The callable receives this instance each time the factor is
            # requested.
            self._bw_method = bw_method
            self.covariance_factor = lambda: self._bw_method(self)
        else:
            msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
                  "or a callable."
            raise ValueError(msg)

    self._compute_covariance()
|
||||
|
||||
def _compute_covariance(self):
|
||||
"""Computes the covariance matrix for each Gaussian kernel using
|
||||
covariance_factor().
|
||||
"""
|
||||
self.factor = self.covariance_factor()
|
||||
# Cache covariance and inverse covariance of the data
|
||||
if not hasattr(self, '_data_inv_cov'):
|
||||
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
|
||||
bias=False,
|
||||
aweights=self.weights))
|
||||
self._data_inv_cov = linalg.inv(self._data_covariance)
|
||||
|
||||
self.covariance = self._data_covariance * self.factor**2
|
||||
self.inv_cov = self._data_inv_cov / self.factor**2
|
||||
self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) * self.n
|
||||
|
||||
def pdf(self, x):
    """
    Evaluate the estimated pdf on a provided set of points.

    Notes
    -----
    This simply forwards `x` to `gaussian_kde.evaluate` and returns its
    result.  See the ``evaluate`` docstring for more details.

    """
    density = self.evaluate(x)
    return density
|
||||
|
||||
def logpdf(self, x):
    """
    Evaluate the log of the estimated pdf on a provided set of points.

    Parameters
    ----------
    x : (# of dimensions, # of points)-array
        Locations at which the log-density is evaluated.  A vector with
        ``# of dimensions`` entries is also accepted and is treated as a
        single point.

    Returns
    -------
    values : (# of points,)-array
        The log of the estimated density at each point.

    Raises
    ------
    ValueError
        If the dimensionality of the input points is different than the
        dimensionality of the KDE.

    """
    points = atleast_2d(x)

    d, m = points.shape
    if d != self.d:
        if d == 1 and m == self.d:
            # points was passed in as a row vector
            points = reshape(points, (self.d, 1))
            m = 1
        else:
            msg = "points have dimension %s, dataset has dimension %s" % (d,
                self.d)
            raise ValueError(msg)

    # Multiplier applied to each kernel inside the log-sum-exp: one entry
    # per data point, so non-uniform weights are honoured.
    kernel_scale = self.weights*self.n/self._norm_factor

    if m >= self.n:
        # there are more points than data, so loop over data
        energy = zeros((self.n, m), dtype=float)
        for i in range(self.n):
            diff = self.dataset[:, i, newaxis] - points
            tdiff = dot(self.inv_cov, diff)
            energy[i] = sum(diff*tdiff, axis=0) / 2.0
        # Row i of `energy` belongs to data point i, so the scale is passed
        # as a column to pair each row with its own weight.  (Using
        # ``weights[i]`` after the loop would apply the last point's weight
        # to every kernel.)
        result = logsumexp(-energy, b=kernel_scale[:, newaxis], axis=0)
    else:
        # loop over points
        result = zeros((m,), dtype=float)
        for i in range(m):
            diff = self.dataset - points[:, i, newaxis]
            tdiff = dot(self.inv_cov, diff)
            energy = sum(diff * tdiff, axis=0) / 2.0
            result[i] = logsumexp(-energy, b=kernel_scale)

    return result
|
||||
|
||||
@property
def weights(self):
    """Per-data-point weights.

    Returns the stored ``_weights`` array.  When none has been stored yet,
    uniform weights ``1/n`` are created, cached on the instance and
    returned.
    """
    if not hasattr(self, '_weights'):
        self._weights = ones(self.n)/self.n
    return self._weights
|
||||
|
||||
@property
def neff(self):
    """Effective number of data points.

    Returns the stored ``_neff`` value.  When none has been stored yet it is
    computed as ``1 / sum(weights**2)``, cached on the instance and
    returned.
    """
    if not hasattr(self, '_neff'):
        self._neff = 1/sum(self.weights**2)
    return self._neff
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,135 +0,0 @@
|
||||
"""
|
||||
===================================================================
|
||||
Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
|
||||
===================================================================
|
||||
|
||||
.. currentmodule:: scipy.stats.mstats
|
||||
|
||||
This module contains a large number of statistical functions that can
|
||||
be used with masked arrays.
|
||||
|
||||
Most of these functions are similar to those in `scipy.stats` but might
|
||||
have small differences in the API or in the algorithm used. Since this
|
||||
is a relatively new package, some API changes are still possible.
|
||||
|
||||
Summary statistics
|
||||
==================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
describe
|
||||
gmean
|
||||
hmean
|
||||
kurtosis
|
||||
mode
|
||||
mquantiles
|
||||
hdmedian
|
||||
hdquantiles
|
||||
hdquantiles_sd
|
||||
idealfourths
|
||||
plotting_positions
|
||||
meppf
|
||||
moment
|
||||
skew
|
||||
tmean
|
||||
tvar
|
||||
tmin
|
||||
tmax
|
||||
tsem
|
||||
variation
|
||||
find_repeats
|
||||
sem
|
||||
trimmed_mean
|
||||
trimmed_mean_ci
|
||||
trimmed_std
|
||||
trimmed_var
|
||||
|
||||
Frequency statistics
|
||||
====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
scoreatpercentile
|
||||
|
||||
Correlation functions
|
||||
=====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
f_oneway
|
||||
pearsonr
|
||||
spearmanr
|
||||
pointbiserialr
|
||||
kendalltau
|
||||
kendalltau_seasonal
|
||||
linregress
|
||||
siegelslopes
|
||||
theilslopes
|
||||
sen_seasonal_slopes
|
||||
|
||||
Statistical tests
|
||||
=================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_1samp
|
||||
ttest_onesamp
|
||||
ttest_ind
|
||||
ttest_rel
|
||||
chisquare
|
||||
ks_2samp
|
||||
ks_twosamp
|
||||
mannwhitneyu
|
||||
rankdata
|
||||
kruskal
|
||||
kruskalwallis
|
||||
friedmanchisquare
|
||||
brunnermunzel
|
||||
skewtest
|
||||
kurtosistest
|
||||
normaltest
|
||||
|
||||
Transformations
|
||||
===============
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
obrientransform
|
||||
trim
|
||||
trima
|
||||
trimmed_stde
|
||||
trimr
|
||||
trimtail
|
||||
trimboth
|
||||
winsorize
|
||||
zmap
|
||||
zscore
|
||||
|
||||
Other
|
||||
=====
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
argstoarray
|
||||
count_tied_groups
|
||||
msign
|
||||
compare_medians_ms
|
||||
median_cihs
|
||||
mjci
|
||||
mquantiles_cimj
|
||||
rsh
|
||||
|
||||
"""
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from .mstats_basic import *
|
||||
from .mstats_extras import *
|
||||
# Functions that support masked array input in stats but need to be kept in the
|
||||
# mstats namespace for backwards compatibility:
|
||||
from scipy.stats import gmean, hmean, zmap, zscore, chisquare
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,477 +0,0 @@
|
||||
"""
|
||||
Additional statistics functions with support for masked arrays.
|
||||
|
||||
"""
|
||||
|
||||
# Original author (2007): Pierre GF Gerard-Marchant
|
||||
|
||||
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
|
||||
__all__ = ['compare_medians_ms',
|
||||
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
|
||||
'idealfourths',
|
||||
'median_cihs','mjci','mquantiles_cimj',
|
||||
'rsh',
|
||||
'trimmed_mean_ci',]
|
||||
|
||||
|
||||
import numpy as np
|
||||
from numpy import float_, int_, ndarray
|
||||
|
||||
import numpy.ma as ma
|
||||
from numpy.ma import MaskedArray
|
||||
|
||||
from . import mstats_basic as mstats
|
||||
|
||||
from scipy.stats.distributions import norm, beta, t, binom
|
||||
|
||||
|
||||
def hdquantiles(data, prob=[.25, .5, .75], axis=None, var=False):
    """
    Computes quantile estimates with the Harrell-Davis method.

    The quantile estimates are calculated as a weighted linear combination
    of order statistics.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles.  If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdquantiles : MaskedArray
        A (p,) array of quantiles (if `var` is False), or a (2,p) array of
        quantiles and variances (if `var` is True), where ``p`` is the
        number of quantiles.

    Raises
    ------
    ValueError
        If `axis` is given and `data` has more than two dimensions.

    See Also
    --------
    hdquantiles_sd

    """
    def _hd_1D(data, prob, var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size

        # Row 0 holds the estimates, row 1 their variances.
        hd = np.empty((2, len(prob)), float)
        if n < 2:
            hd.flat = np.nan
            return hd if var else hd[0]

        v = np.arange(n+1) / float(n)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            # Weights are increments of a Beta((n+1)p, (n+1)(1-p)) cdf on a
            # uniform grid over [0, 1].
            _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0, i] = hd_mean
            hd[1, i] = np.dot(w, (xsorted-hd_mean)**2)

        # The extreme probabilities are pinned to the sample extremes.
        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float)
    # asarray + atleast_1d copies only when needed; ``np.array(...,
    # copy=False)`` raises on NumPy >= 2 whenever a copy is unavoidable
    # (e.g. for list input such as the default `prob`).
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

    return ma.fix_invalid(result, copy=False)
|
||||
|
||||
|
||||
def hdmedian(data, axis=-1, var=False):
    """
    Returns the Harrell-Davis estimate of the median along the given axis.

    Parameters
    ----------
    data : ndarray
        Data array.
    axis : int, optional
        Axis along which to compute the quantiles.  If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdmedian : MaskedArray
        The median values, i.e. the `hdquantiles` result for probability 0.5
        with length-one axes removed.  If ``var=True``, the variance is
        returned inside the masked array.  E.g. for a 1-D array the shape
        change from (1,) to (2,).

    """
    median_estimate = hdquantiles(data, [0.5], axis=axis, var=var)
    return median_estimate.squeeze()
|
||||
|
||||
|
||||
def hdquantiles_sd(data, prob=[.25, .5, .75], axis=None):
    """
    The standard error of the Harrell-Davis quantile estimates by jackknife.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int, optional
        Axis along which to compute the quantiles.  If None, use a flattened
        array.

    Returns
    -------
    hdquantiles_sd : MaskedArray
        Standard error of the Harrell-Davis quantile estimates, flattened to
        1-D.  Entries are masked where fewer than two unmasked values were
        available.

    Raises
    ------
    ValueError
        If `axis` is given and `data` has more than two dimensions.

    See Also
    --------
    hdquantiles

    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)

        hdsd = np.empty(len(prob), float)
        if n < 2:
            # The jackknife needs at least two observations.  Return right
            # away so the divisions by (n - 1) below are never reached.
            hdsd.flat = np.nan
            return hdsd

        vv = np.arange(n) / float(n-1)
        betacdf = beta.cdf

        for (i, p) in enumerate(prob):
            _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            # Leave-one-out estimates: the weights applied to the sorted
            # sample with observation k removed.
            mx_ = np.array([np.dot(w, np.delete(xsorted, k))
                            for k in range(n)], dtype=float)
            mx_var = mx_.var() * n / float(n-1)
            hdsd[i] = float(n-1) * np.sqrt(mx_var / float(n))
        return hdsd

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float)
    # asarray + atleast_1d copies only when needed; ``np.array(...,
    # copy=False)`` raises on NumPy >= 2 whenever a copy is unavoidable
    # (e.g. for list input such as the default `prob`).
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

    return ma.fix_invalid(result, copy=False).ravel()
|
||||
|
||||
|
||||
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
                    alpha=0.05, axis=None):
    """
    Selected confidence interval of the trimmed mean along the given axis.

    Parameters
    ----------
    data : array_like
        Input data.
    limits : {None, tuple}, optional
        None or a two item tuple.
        Tuple of the percentages to cut on each side of the array, with respect
        to the number of unmasked data, as floats between 0. and 1. If ``n``
        is the number of unmasked data before trimming, then
        (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
        largest data are masked.  The total number of unmasked data after
        trimming is ``n * (1. - sum(limits))``.
        The value of one limit can be set to None to indicate an open interval.

        Defaults to (0.2, 0.2).
    inclusive : (2,) tuple of boolean, optional
        If relative==False, tuple indicating whether values exactly equal to
        the absolute limits are allowed.
        If relative==True, tuple indicating whether the number of data being
        masked on each side should be rounded (True) or truncated (False).

        Defaults to (True, True).
    alpha : float, optional
        Level of the interval: the Student-t quantile at ``1 - alpha/2`` is
        used, so the two tails together hold probability `alpha`.

        Defaults to 0.05.
    axis : int, optional
        Axis along which to cut. If None, uses a flattened version of `data`.

        Defaults to None.

    Returns
    -------
    trimmed_mean_ci : (2,) ndarray
        The lower and upper confidence intervals of the trimmed data.

    """
    data = ma.array(data, copy=False)
    trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
    center = trimmed.mean(axis)
    std_err = mstats.trimmed_stde(data, limits=limits, inclusive=inclusive,
                                  axis=axis)
    # Degrees of freedom come from the number of values left after trimming.
    dof = trimmed.count(axis) - 1
    half_width = t.ppf(1-alpha/2., dof) * std_err
    return np.array((center - half_width, center + half_width))
|
||||
|
||||
|
||||
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles.  If None, use a flattened
        array.

    Returns
    -------
    mjci : ndarray
        One standard-error estimate per entry of `prob` (computed along
        `axis` when it is given).

    Raises
    ------
    ValueError
        If `data` has more than two dimensions.

    """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        # Turn each probability into an integer rank: p*n + 0.5, truncated.
        prob = (np.array(p) * n + 0.5).astype(int_)
        betacdf = beta.cdf

        mj = np.empty(len(prob), float)
        x = np.arange(1, n+1, dtype=float) / n
        y = x - 1./n
        for (i, m) in enumerate(prob):
            # Weights are increments of a Beta(m-1, n-m) cdf between
            # consecutive grid points.
            W = betacdf(x, m-1, n-m) - betacdf(y, m-1, n-m)
            C1 = np.dot(W, data)
            C2 = np.dot(W, data**2)
            # Weighted second moment minus squared weighted mean.
            mj[i] = np.sqrt(C2 - C1**2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)

    # asarray + atleast_1d copies only when needed; ``np.array(...,
    # copy=False)`` raises on NumPy >= 2 whenever a copy is unavoidable
    # (e.g. for list input such as the default `prob`).
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)
|
||||
|
||||
|
||||
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Level of the intervals.  The smaller of `alpha` and ``1 - alpha`` is
        used, and the normal quantile at ``1 - alpha/2`` sets the half-width.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval.  Of the same length as
        `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval.  Of the same length as
        `prob`.

    """
    alpha = min(alpha, 1 - alpha)
    z_crit = norm.ppf(1 - alpha/2.)
    quantiles = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    std_errs = mjci(data, prob, axis=axis)
    margin = z_crit * std_errs
    return (quantiles - margin, quantiles + margin)
|
||||
|
||||
|
||||
def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data.  Masked values are discarded.  The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Level of the intervals.  The smaller of `alpha` and ``1 - alpha`` is
        used.
    axis : int or None, optional
        Axis along which to compute the quantiles.  If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        Alpha level confidence interval.

    Raises
    ------
    ValueError
        If `axis` is given and `data` has more than two dimensions.

    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1-alpha)

        def coverage(rank):
            # Binomial(n, 0.5) probability of the counts rank .. n-rank.
            return binom.cdf(n-rank, n, 0.5) - binom.cdf(rank-1, n, 0.5)

        k = int(binom._ppf(alpha/2., n, 0.5))
        gk = coverage(k)
        if gk < 1-alpha:
            # Step one rank outwards when the coverage falls short.
            k -= 1
            gk = coverage(k)
        gkk = coverage(k+1)
        # Interpolate between the two neighbouring order-statistic intervals.
        I = (gk - 1 + alpha)/(gk - gkk)
        lambd = (n-k) * I / float(k + (n-2*k)*I)
        lower = lambd*data[k] + (1-lambd)*data[k-1]
        upper = lambd*data[n-k-1] + (1-lambd)*data[n-k]
        return (lower, upper)

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if axis is None:
        return _cihs_1D(data, alpha)

    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)
    return ma.apply_along_axis(_cihs_1D, axis, data, alpha)
|
||||
|
||||
|
||||
def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset.  Has to be of size >=7.
    group_2 : array_like
        Second dataset.  Has to be of size >=7.
    axis : int, optional
        Axis along which the medians are estimated.  If None, the arrays are
        flattened.  If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.  The value is ``1 - norm.cdf(W)``, where ``W`` is the
        absolute median difference divided by the combined standard error.

    """
    med_1 = ma.median(group_1, axis=axis)
    med_2 = ma.median(group_2, axis=axis)
    std_1 = mstats.stde_median(group_1, axis=axis)
    std_2 = mstats.stde_median(group_2, axis=axis)
    combined_se = ma.sqrt(std_1**2 + std_2**2)
    W = np.abs(med_1 - med_2) / combined_se
    return 1 - norm.cdf(W)
|
||||
|
||||
|
||||
def idealfourths(data, axis=None):
    """
    Returns an estimate of the lower and upper quartiles.

    Uses the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array.
    axis : int, optional
        Axis along which the quartiles are estimated.  If None, the arrays are
        flattened.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        Returns the two internal values that divide `data` into four parts
        using the ideal fourths algorithm either along the flattened array
        (if `axis` is None) or along `axis` of `data`.  Both values are NaN
        when fewer than three unmasked values are available.

    """
    def _idf(data):
        values = data.compressed()
        count = len(values)
        if count < 3:
            return [np.nan,np.nan]
        # Integer part selects the order statistics, fractional part is the
        # interpolation weight.
        (whole, frac) = divmod(count/4. + 5/12., 1)
        lo_idx = int(whole)
        hi_idx = count - lo_idx
        lower = (1-frac)*values[lo_idx-1] + frac*values[lo_idx]
        upper = (1-frac)*values[hi_idx] + frac*values[hi_idx-1]
        return [lower, upper]

    data = ma.sort(data, axis=axis).view(MaskedArray)
    if axis is None:
        return _idf(data)
    return ma.apply_along_axis(_idf, axis, data)
|
||||
|
||||
|
||||
def rsh(data, points=None):
    """
    Evaluates Rosenblatt's shifted histogram estimators for each data point.

    Rosenblatt's estimator is a centered finite-difference approximation to the
    derivative of the empirical cumulative distribution function.

    Parameters
    ----------
    data : sequence
        Input data, should be 1-D.  Masked values are ignored.
    points : sequence or None, optional
        Sequence of points where to evaluate Rosenblatt shifted histogram.
        If None, use the data.

    Returns
    -------
    rsh : array
        The estimator evaluated at each entry of `points`.

    Raises
    ------
    AttributeError
        If `data` is not one-dimensional.

    """
    data = ma.array(data, copy=False)
    if points is None:
        points = data
    else:
        # asarray + atleast_1d copies only when needed; ``np.array(...,
        # copy=False)`` raises on NumPy >= 2 whenever a copy is unavoidable
        # (e.g. for list input).
        points = np.atleast_1d(np.asarray(points))

    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only !")

    n = data.count()
    r = idealfourths(data, axis=None)
    # Half-width of the counting window, from the ideal-fourths spread.
    h = 1.2 * (r[-1]-r[0]) / n**(1./5)
    # Data values at or below (point + h), and strictly below (point - h).
    nhi = (data[:,None] <= points[None,:] + h).sum(0)
    nlo = (data[:,None] < points[None,:] - h).sum(0)
    return (nhi-nlo) / (2.*n*h)
|
||||
BIN
Binary file not shown.
@@ -1,38 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from os.path import join
|
||||
|
||||
|
||||
def configuration(parent_package='',top_path=None):
    """Build the numpy.distutils configuration for the ``stats`` package.

    Registers the ``tests`` data directory, the ``statlib`` library and the
    ``statlib``, ``_stats`` and ``mvn`` extensions, then returns the
    configuration object.
    """
    from numpy.distutils.misc_util import Configuration
    config = Configuration('stats', parent_package, top_path)

    config.add_data_dir('tests')

    fortran_sources = [join('statlib', '*.f')]
    config.add_library('statlib', sources=fortran_sources)

    # Extensions are registered in this order: statlib, _stats, mvn.
    extensions = (
        ('statlib', dict(sources=['statlib.pyf'],
                         f2py_options=['--no-wrap-functions'],
                         libraries=['statlib'],
                         depends=fortran_sources)),
        ('_stats', dict(sources=['_stats.c'])),
        ('mvn', dict(sources=['mvn.pyf','mvndst.f'])),
    )
    for ext_name, ext_options in extensions:
        config.add_extension(ext_name, **ext_options)

    return config


if __name__ == '__main__':
    from numpy.distutils.core import setup
    setup(**configuration(top_path='').todict())
|
||||
BIN
Binary file not shown.
File diff suppressed because it is too large
Load Diff
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,306 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import numpy.testing as npt
|
||||
from numpy.testing import assert_allclose, assert_equal
|
||||
from scipy._lib._numpy_compat import suppress_warnings
|
||||
from pytest import raises as assert_raises
|
||||
|
||||
import numpy.ma.testutils as ma_npt
|
||||
|
||||
from scipy._lib._util import getargspec_no_self as _getargspec
|
||||
from scipy import stats
|
||||
|
||||
|
||||
def check_named_results(res, attributes, ma=False):
    """Assert that positional and named access on ``res`` agree.

    Parameters
    ----------
    res : indexable object with named attributes
        Result object; ``res[i]`` is compared with ``getattr(res, name)``.
    attributes : sequence of str
        Attribute names, listed in positional order.
    ma : bool, optional
        When true, compare with ``numpy.ma.testutils.assert_equal`` instead
        of ``numpy.testing.assert_equal``.
    """
    compare = ma_npt.assert_equal if ma else npt.assert_equal
    for position, name in enumerate(attributes):
        compare(res[position], getattr(res, name))
|
||||
|
||||
|
||||
def check_normalization(distfn, args, distname):
    """Assert that the distribution integrates to one in three ways.

    Checks the zeroth moment, the expectation of the constant function 1,
    and the CDF evaluated at ``distfn.b``.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``moment``, ``expect``, ``cdf`` and the attribute ``b``.
    args : sequence
        Shape parameters passed on to the distribution methods.
    distname : str
        Distribution name; selects the tolerance and labels failures.
    """
    zeroth_moment = distfn.moment(0, *args)
    npt.assert_allclose(zeroth_moment, 1.0)

    # this is a temporary plug: either ncf or expect is problematic;
    # best be marked as a knownfail, but I've no clue how to do it.
    atol, rtol = (1e-5, 0) if distname == "ncf" else (1e-7, 1e-7)

    total_mass = distfn.expect(lambda x: 1, args=args)
    npt.assert_allclose(total_mass, 1.0, atol=atol, rtol=rtol,
                        err_msg=distname, verbose=True)

    cdf_at_b = distfn.cdf(distfn.b, *args)
    npt.assert_allclose(cdf_at_b, 1.0)
|
||||
|
||||
|
||||
def check_moment(distfn, arg, m, v, msg):
    """Assert that the first two moments match the expected mean/variance.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``moment(n, *arg)``.
    arg : sequence
        Shape parameters passed on to ``moment``.
    m : float
        Expected mean. If infinite, the first moment must be infinite too.
    v : float
        Expected variance. If infinite, the second moment must be infinite.
    msg : str
        Prefix for the failure messages.
    """
    m1 = distfn.moment(1, *arg)
    m2 = distfn.moment(2, *arg)

    if not np.isinf(m):
        npt.assert_almost_equal(m1, m, decimal=10,
                                err_msg=msg + ' - 1st moment')
    else:  # or np.isnan(m1),
        npt.assert_(np.isinf(m1),
                    msg + ' - 1st moment -infinite, m1=%s' % str(m1))

    if not np.isinf(v):
        # The variance is the second raw moment minus the squared mean.
        # (Message previously read "2ndt moment".)
        npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10,
                                err_msg=msg + ' - 2nd moment')
    else:  # or np.isnan(m2),
        npt.assert_(np.isinf(m2),
                    msg + ' - 2nd moment -infinite, m2=%s' % str(m2))
|
||||
|
||||
|
||||
def check_mean_expect(distfn, arg, m, msg):
    """Assert that ``expect`` of the identity reproduces the mean ``m``.

    Nothing is checked when ``m`` is not finite.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``expect(func, args)``.
    arg : sequence
        Shape parameters passed on to ``expect``.
    m : float
        Expected mean.
    msg : str
        Prefix for the failure message.
    """
    if not np.isfinite(m):
        return
    mean_from_expect = distfn.expect(lambda x: x, arg)
    npt.assert_almost_equal(mean_from_expect, m, decimal=5,
                            err_msg=msg + ' - 1st moment (expect)')
|
||||
|
||||
|
||||
def check_var_expect(distfn, arg, m, v, msg):
    """Assert that ``expect`` of ``x*x`` equals ``v + m*m``.

    Nothing is checked when ``v`` is not finite.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``expect(func, args)``.
    arg : sequence
        Shape parameters passed on to ``expect``.
    m : float
        Expected mean.
    v : float
        Expected variance.
    msg : str
        Prefix for the failure message.
    """
    if np.isfinite(v):
        m2 = distfn.expect(lambda x: x*x, arg)
        # Second raw moment = variance + mean**2.
        # (Message previously read "2st moment".)
        npt.assert_almost_equal(m2, v + m*m, decimal=5,
                                err_msg=msg + ' - 2nd moment (expect)')
|
||||
|
||||
|
||||
def check_skew_expect(distfn, arg, m, v, s, msg):
    """Assert that the third central moment from ``expect`` matches ``s``.

    When ``s`` is not finite, the only requirement is that it is NaN.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``expect(func, args)``.
    arg : sequence
        Shape parameters passed on to ``expect``.
    m, v, s : float
        Expected mean, variance and skewness.
    msg : str
        Prefix for the failure message.
    """
    if not np.isfinite(s):
        npt.assert_(np.isnan(s))
        return
    third_central = distfn.expect(lambda x: np.power(x-m, 3), arg)
    # skewness is the third central moment scaled by variance**1.5
    npt.assert_almost_equal(third_central, s * np.power(v, 1.5),
                            decimal=5, err_msg=msg + ' - skew')
|
||||
|
||||
|
||||
def check_kurt_expect(distfn, arg, m, v, k, msg):
    """Assert that the fourth central moment from ``expect`` matches ``k``.

    ``k`` is treated as excess kurtosis: the moment is compared against
    ``(k + 3) * v**2``.  A ``+inf`` value of ``k`` is accepted without any
    check; any other non-finite value must be NaN.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``expect(func, args)``.
    arg : sequence
        Shape parameters passed on to ``expect``.
    m, v, k : float
        Expected mean, variance and kurtosis.
    msg : str
        Prefix for the failure message.
    """
    if np.isfinite(k):
        fourth_central = distfn.expect(lambda x: np.power(x-m, 4), arg)
        npt.assert_allclose(fourth_central, (k + 3.) * np.power(v, 2),
                            atol=1e-5, rtol=1e-5,
                            err_msg=msg + ' - kurtosis')
        return
    if np.isposinf(k):
        return
    npt.assert_(np.isnan(k))
|
||||
|
||||
|
||||
def check_entropy(distfn, arg, msg):
    """Assert that ``distfn.entropy(*arg)`` is not NaN.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``entropy(*arg)``.
    arg : sequence
        Shape parameters passed on to ``entropy``.
    msg : str
        Prefix for the failure message.
    """
    entropy_value = distfn.entropy(*arg)
    entropy_is_nan = np.isnan(entropy_value)
    npt.assert_(not entropy_is_nan, msg + 'test Entropy is nan')
|
||||
|
||||
|
||||
def check_private_entropy(distfn, args, superclass):
    """Compare the distribution's own ``_entropy`` with the superclass one.

    Parameters
    ----------
    distfn : distribution object
        Instance whose ``_entropy`` is under test.
    args : sequence
        Shape parameters passed on to both implementations.
    superclass : type
        Class whose ``_entropy`` is called unbound with ``distfn`` as
        ``self`` to provide the reference value.
    """
    # distribution-specific implementation first, then the generic one
    specific = distfn._entropy(*args)
    generic = superclass._entropy(distfn, *args)
    npt.assert_allclose(specific, generic)
|
||||
|
||||
|
||||
def check_entropy_vect_scale(distfn, arg):
    """Assert that ``entropy`` broadcasts over an array-like ``scale``.

    The vectorised result must match the values obtained by calling
    ``entropy`` once per scale, for a 2-d integer array and for a plain
    list that contains a negative entry.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``entropy(*arg, scale=...)``.
    arg : sequence
        Shape parameters passed on to ``entropy``.
    """
    def compare(scale, scalars):
        # One vectorised call versus one call per individual scale.
        vectorised = distfn.entropy(*arg, scale=scale)
        one_by_one = [distfn.entropy(*arg, scale=s) for s in scalars]
        one_by_one = np.asarray(one_by_one).reshape(vectorised.shape)
        assert_allclose(vectorised, one_by_one, atol=1e-14)

    # check 2-d
    grid = np.asarray([[1, 2], [3, 4]])
    compare(grid, grid.ravel())

    # check invalid value, check cast
    as_list = [1, 2, -3]
    compare(as_list, as_list)
|
||||
|
||||
|
||||
def check_edge_support(distfn, args):
    """Assert correct behaviour at ``distfn.a`` and ``distfn.b``.

    For ``stats.rv_discrete`` instances the lower test point is
    ``distfn.a - 1``.  The cdf/sf (and, except for 'skellam' and
    'dlaplace', logcdf/logsf) must give exactly 0/1 (or -inf/0) there,
    ppf/isf at 0 and 1 must return the test points, and ppf/isf outside
    [0, 1] must be NaN.

    Parameters
    ----------
    distfn : distribution object
        Distribution under test.
    args : sequence
        Shape parameters passed on to the distribution methods.
    """
    # Make sure that x=self.a and self.b are handled correctly.
    if isinstance(distfn, stats.rv_discrete):
        lower = distfn.a - 1
    else:
        lower = distfn.a
    x = [lower, distfn.b]

    npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
    npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])

    skip_log_checks = distfn.name in ('skellam', 'dlaplace')
    if not skip_log_checks:
        # with a = -inf, log(0) generates warnings
        npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
        npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf])

    npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x)
    npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1])

    # out-of-bounds for isf & ppf
    isf_outside = distfn.isf([-1, 2], *args)
    npt.assert_(np.isnan(isf_outside).all())
    ppf_outside = distfn.ppf([-1, 2], *args)
    npt.assert_(np.isnan(ppf_outside).all())
|
||||
|
||||
|
||||
def check_named_args(distfn, x, shape_args, defaults, meths):
    """Check that shape parameters can be passed by keyword.

    First verifies that ``distfn._parse_args`` has no ``*args``/``**kwargs``
    and carries ``defaults``, and that its shape-parameter names agree in
    number with ``distfn.shapes`` and ``distfn.numargs``.  Then the shape
    arguments are moved from positional to keyword form one at a time
    (last one first), and each method in ``meths`` must keep returning the
    all-positional result.  Finally an unknown keyword must make
    ``distfn.cdf`` raise ``TypeError``.

    Parameters
    ----------
    distfn : distribution object
        Distribution under test.
    x : array_like
        Evaluation point(s) handed to every method.
    shape_args : sequence
        Shape parameter values, in positional order.
    defaults : sequence
        Expected default values of the trailing ``_parse_args`` parameters.
    meths : sequence of callables
        Methods called as ``meth(x, *shape_args)``.
    """
    ## Check calling w/ named arguments.

    # check consistency of shapes, numargs and _parse signature
    spec = _getargspec(distfn._parse_args)
    npt.assert_(spec.varargs is None)
    npt.assert_(spec.keywords is None)
    npt.assert_(list(spec.defaults) == list(defaults))

    shape_argnames = spec.args[:-len(defaults)]  # a, b, loc=0, scale=1
    declared = distfn.shapes.replace(',', ' ').split() if distfn.shapes else ''
    npt.assert_(len(declared) == distfn.numargs)
    npt.assert_(len(declared) == len(shape_argnames))

    # check calling w/ named arguments
    positional = list(shape_args)

    reference = [meth(x, *positional) for meth in meths]
    npt.assert_(np.all(np.isfinite(reference)))

    pending_names = shape_argnames[:]
    pending_values = positional[:]
    by_name = {}
    while pending_names:
        # move the last remaining positional argument into the keyword dict
        name = pending_names.pop()
        by_name[name] = pending_values.pop()
        mixed = [meth(x, *pending_values, **by_name) for meth in meths]
        npt.assert_array_equal(reference, mixed)
        if 'n' not in by_name:
            # `n` is first parameter of moment(), so can't be used as named arg
            npt.assert_equal(distfn.moment(1, *pending_values, **by_name),
                             distfn.moment(1, *positional))

    # unknown arguments should not go through:
    by_name['kaboom'] = 42
    assert_raises(TypeError, distfn.cdf, x, **by_name)
|
||||
|
||||
|
||||
def check_random_state_property(distfn, args):
    """Check the ``random_state`` attribute of a distribution *instance*.

    Samples drawn with ``random_state`` set to ``None`` (after seeding the
    global NumPy generator with 1234), to the integer 1234, and to
    ``np.random.RandomState(1234)`` must be equal.  A ``random_state``
    passed directly to ``rvs`` must give the same sample and must leave the
    instance-level state untouched.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``rvs`` and a settable ``random_state``.
    args : sequence
        Shape parameters passed on to ``rvs``.

    Notes
    -----
    This test fiddles with ``distfn.random_state``, which breaks other
    tests, so the original value is saved and restored.  The restore runs
    in a ``finally`` clause so that a failing assertion or an exception in
    ``rvs`` cannot leave the instance modified.  ``np.random.seed`` is
    called, so the global NumPy generator is reseeded as a side effect.
    """
    rndm = distfn.random_state
    try:
        # baseline: this relies on the global state
        np.random.seed(1234)
        distfn.random_state = None
        r0 = distfn.rvs(*args, size=8)

        # use an explicit instance-level random_state
        distfn.random_state = 1234
        r1 = distfn.rvs(*args, size=8)
        npt.assert_equal(r0, r1)

        distfn.random_state = np.random.RandomState(1234)
        r2 = distfn.rvs(*args, size=8)
        npt.assert_equal(r0, r2)

        # can override the instance-level random_state for an individual
        # .rvs call
        distfn.random_state = 2
        orig_state = distfn.random_state.get_state()

        r3 = distfn.rvs(*args, size=8,
                        random_state=np.random.RandomState(1234))
        npt.assert_equal(r0, r3)

        # ... and that does not alter the instance-level random_state!
        npt.assert_equal(distfn.random_state.get_state(), orig_state)
    finally:
        # restore the random_state even when a check above failed
        distfn.random_state = rndm
|
||||
|
||||
|
||||
def check_meth_dtype(distfn, arg, meths):
    """Assert that each method returns float64 for various input dtypes.

    The quartile points from ``distfn.ppf`` are cast to ``np.int_``,
    ``np.float16``, ``np.float32`` and ``np.float64``; each method in
    ``meths`` is evaluated on every cast array.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``ppf``, ``_argcheck`` and the attributes ``a``/``b``.
    arg : sequence
        Shape parameters passed on to the distribution methods.
    meths : sequence of callables
        Methods called as ``meth(x, *arg)``; the result needs a ``dtype``.
    """
    q0 = [0.25, 0.5, 0.75]
    x0 = distfn.ppf(q0, *arg)
    x_cast = [x0.astype(tp) for tp in
              (np.int_, np.float16, np.float32, np.float64)]

    for x in x_cast:
        # casting may have clipped the values, exclude those
        # NOTE(review): _argcheck is presumably called so that distfn.a and
        # distfn.b reflect `arg` before they are read -- confirm.
        distfn._argcheck(*arg)
        x = x[(distfn.a < x) & (x < distfn.b)]
        for meth in meths:
            val = meth(x, *arg)
            # np.float_ was removed in NumPy 2.0; np.float64 is the same type
            npt.assert_(val.dtype == np.float64)
|
||||
|
||||
|
||||
def check_ppf_dtype(distfn, arg):
    """Assert that ``ppf`` and ``isf`` return float64 for any float input.

    The probabilities 0.25, 0.5 and 0.75 are cast to ``np.float16``,
    ``np.float32`` and ``np.float64`` before being passed in.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``ppf`` and ``isf``.
    arg : sequence
        Shape parameters passed on to ``ppf``/``isf``.
    """
    q0 = np.asarray([0.25, 0.5, 0.75])
    q_cast = [q0.astype(tp) for tp in (np.float16, np.float32, np.float64)]
    for q in q_cast:
        for meth in [distfn.ppf, distfn.isf]:
            val = meth(q, *arg)
            # np.float_ was removed in NumPy 2.0; np.float64 is the same type
            npt.assert_(val.dtype == np.float64)
|
||||
|
||||
|
||||
def check_cmplx_deriv(distfn, arg):
    """Check derivative identities using complex-valued arguments.

    Each function is evaluated at ``x + 1e-10j``; the imaginary part
    divided by the step is taken as its derivative and compared with the
    pdf-based expression for cdf, logcdf, sf, logsf and logpdf.

    Parameters
    ----------
    distfn : distribution object
        Must accept complex arguments in pdf/cdf/sf and their log variants.
    arg : sequence
        Shape parameters passed on to the distribution methods.
    """
    # Distributions allow complex arguments.
    def complex_step(func, points, *shape):
        points = np.asarray(points)
        step = 1e-10
        shifted = func(points + step*1j, *shape)
        return (shifted/step).imag

    x0 = distfn.ppf([0.25, 0.51, 0.75], *arg)
    dtypes = (np.int_, np.float16, np.float32, np.float64)
    x_cast = [x0.astype(tp) for tp in dtypes]

    for x in x_cast:
        # casting may have clipped the values, exclude those
        distfn._argcheck(*arg)
        inside = (distfn.a < x) & (x < distfn.b)
        x = x[inside]

        pdf = distfn.pdf(x, *arg)
        cdf = distfn.cdf(x, *arg)
        sf = distfn.sf(x, *arg)

        assert_allclose(complex_step(distfn.cdf, x, *arg), pdf, rtol=1e-5)
        assert_allclose(complex_step(distfn.logcdf, x, *arg), pdf/cdf,
                        rtol=1e-5)

        assert_allclose(complex_step(distfn.sf, x, *arg), -pdf, rtol=1e-5)
        assert_allclose(complex_step(distfn.logsf, x, *arg), -pdf/sf,
                        rtol=1e-5)

        # d/dx log(pdf) == pdf' / pdf
        log_pdf_slope = complex_step(distfn.logpdf, x, *arg)
        pdf_slope = complex_step(distfn.pdf, x, *arg)
        assert_allclose(log_pdf_slope, pdf_slope / distfn.pdf(x, *arg),
                        rtol=1e-5)
|
||||
|
||||
|
||||
def check_pickling(distfn, args):
    """Check that a distribution instance pickles and unpickles.

    Special attention is paid to the ``random_state`` property: the
    instance is pickled after drawing one sample, and the next sample from
    the original and from the unpickled copy must be equal.  ``ppf`` and
    ``cdf`` are smoke-tested on both objects as well.

    Parameters
    ----------
    distfn : distribution object
        Must be picklable and provide ``rvs``, ``ppf``, ``cdf`` and a
        settable ``random_state``.
    args : sequence
        Shape parameters passed on to the distribution methods.

    Notes
    -----
    ``distfn.random_state`` is saved on entry and restored in a ``finally``
    clause, so a failing check cannot leave the instance reseeded.
    """
    # save the random_state (restore later)
    rndm = distfn.random_state
    try:
        distfn.random_state = 1234
        distfn.rvs(*args, size=8)
        s = pickle.dumps(distfn)
        r0 = distfn.rvs(*args, size=8)

        # Only data produced by pickle.dumps just above is loaded here.
        unpickled = pickle.loads(s)
        r1 = unpickled.rvs(*args, size=8)
        npt.assert_equal(r0, r1)

        # also smoke test some methods
        medians = [distfn.ppf(0.5, *args), unpickled.ppf(0.5, *args)]
        npt.assert_equal(medians[0], medians[1])
        npt.assert_equal(distfn.cdf(medians[0], *args),
                         unpickled.cdf(medians[1], *args))
    finally:
        # restore the random_state even when a check above failed
        distfn.random_state = rndm
|
||||
|
||||
|
||||
def check_rvs_broadcast(distfunc, distname, allargs, shape, shape_only, otype):
    """Assert that ``rvs`` broadcasts its arguments to ``shape``.

    Unless ``shape_only`` is true, the sample is also compared with the
    result of calling ``rvs`` element by element through ``np.vectorize``
    after reseeding the global NumPy generator with the same seed.

    Parameters
    ----------
    distfunc : distribution object
        Must provide ``rvs(*allargs)``.
    distname : str
        Distribution name, used in the failure message.
    allargs : sequence
        Arguments passed on to ``rvs``.
    shape : tuple
        Expected shape of the sample.
    shape_only : bool
        When true, only the shape is checked.
    otype : str or sequence
        Passed to ``np.vectorize`` as ``otypes``.
    """
    def draw(*params):
        return distfunc.rvs(*params)

    np.random.seed(123)
    with suppress_warnings() as sup:
        # frechet_l and frechet_r are deprecated, so all their
        # methods generate DeprecationWarnings.
        sup.filter(category=DeprecationWarning, message=".*frechet_")
        sample = distfunc.rvs(*allargs)
        assert_equal(sample.shape, shape,
                     "%s: rvs failed to broadcast" % distname)
        if shape_only:
            return
        elementwise = np.vectorize(draw, otypes=otype)
        # reseed so both approaches consume the same random stream
        np.random.seed(123)
        expected = elementwise(*allargs)
        assert_allclose(sample, expected, rtol=1e-15)
|
||||
@@ -1,108 +0,0 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: AtmWtAg (AtmWtAg.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 108)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Powell, L.J., Murphy, T.J. and Gramlich, J.W. (1982).
|
||||
"The Absolute Isotopic Abundance & Atomic Weight
|
||||
of a Reference Sample of Silver".
|
||||
NBS Journal of Research, 87, pp. 9-19.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
2 Treatments
|
||||
24 Replicates/Cell
|
||||
48 Observations
|
||||
7 Constant Leading Digits
|
||||
Average Level of Difficulty
|
||||
Observed Data
|
||||
|
||||
|
||||
Model: 3 Parameters (mu, tau_1, tau_2)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
|
||||
Between Instrument 1 3.63834187500000E-09 3.63834187500000E-09 1.59467335677930E+01
|
||||
Within Instrument 46 1.04951729166667E-08 2.28155932971014E-10
|
||||
|
||||
Certified R-Squared 2.57426544538321E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.51048314446410E-05
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Instrument AgWt
|
||||
1 107.8681568
|
||||
1 107.8681465
|
||||
1 107.8681572
|
||||
1 107.8681785
|
||||
1 107.8681446
|
||||
1 107.8681903
|
||||
1 107.8681526
|
||||
1 107.8681494
|
||||
1 107.8681616
|
||||
1 107.8681587
|
||||
1 107.8681519
|
||||
1 107.8681486
|
||||
1 107.8681419
|
||||
1 107.8681569
|
||||
1 107.8681508
|
||||
1 107.8681672
|
||||
1 107.8681385
|
||||
1 107.8681518
|
||||
1 107.8681662
|
||||
1 107.8681424
|
||||
1 107.8681360
|
||||
1 107.8681333
|
||||
1 107.8681610
|
||||
1 107.8681477
|
||||
2 107.8681079
|
||||
2 107.8681344
|
||||
2 107.8681513
|
||||
2 107.8681197
|
||||
2 107.8681604
|
||||
2 107.8681385
|
||||
2 107.8681642
|
||||
2 107.8681365
|
||||
2 107.8681151
|
||||
2 107.8681082
|
||||
2 107.8681517
|
||||
2 107.8681448
|
||||
2 107.8681198
|
||||
2 107.8681482
|
||||
2 107.8681334
|
||||
2 107.8681609
|
||||
2 107.8681101
|
||||
2 107.8681512
|
||||
2 107.8681469
|
||||
2 107.8681360
|
||||
2 107.8681254
|
||||
2 107.8681261
|
||||
2 107.8681450
|
||||
2 107.8681368
|
||||
@@ -1,85 +0,0 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: SiRstv (SiRstv.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 85)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Ehrstein, James and Croarkin, M. Carroll.
|
||||
Unpublished NIST dataset.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
5 Treatments
|
||||
5 Replicates/Cell
|
||||
25 Observations
|
||||
3 Constant Leading Digits
|
||||
Lower Level of Difficulty
|
||||
Observed Data
|
||||
|
||||
|
||||
Model: 6 Parameters (mu,tau_1, ... , tau_5)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
Between Instrument 4 5.11462616000000E-02 1.27865654000000E-02 1.18046237440255E+00
|
||||
Within Instrument 20 2.16636560000000E-01 1.08318280000000E-02
|
||||
|
||||
Certified R-Squared 1.90999039051129E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.04076068334656E-01
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Instrument Resistance
|
||||
1 196.3052
|
||||
1 196.1240
|
||||
1 196.1890
|
||||
1 196.2569
|
||||
1 196.3403
|
||||
2 196.3042
|
||||
2 196.3825
|
||||
2 196.1669
|
||||
2 196.3257
|
||||
2 196.0422
|
||||
3 196.1303
|
||||
3 196.2005
|
||||
3 196.2889
|
||||
3 196.0343
|
||||
3 196.1811
|
||||
4 196.2795
|
||||
4 196.1748
|
||||
4 196.1494
|
||||
4 196.1485
|
||||
4 195.9885
|
||||
5 196.2119
|
||||
5 196.1051
|
||||
5 196.1850
|
||||
5 196.0052
|
||||
5 196.2090
|
||||
@@ -1,249 +0,0 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: SmLs01 (SmLs01.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 249)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Simon, Stephen D. and Lesage, James P. (1989).
|
||||
"Assessing the Accuracy of ANOVA Calculations in
|
||||
Statistical Software".
|
||||
Computational Statistics & Data Analysis, 8, pp. 325-332.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
9 Treatments
|
||||
21 Replicates/Cell
|
||||
189 Observations
|
||||
1 Constant Leading Digit
|
||||
Lower Level of Difficulty
|
||||
Generated Data
|
||||
|
||||
|
||||
Model: 10 Parameters (mu,tau_1, ... , tau_9)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
|
||||
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
|
||||
|
||||
Certified R-Squared 4.82758620689655E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.00000000000000E-01
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Treatment Response
|
||||
1 1.4
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
2 1.3
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
3 1.5
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
4 1.3
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
5 1.5
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
6 1.3
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
7 1.5
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
8 1.3
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
9 1.5
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
File diff suppressed because it is too large
Load Diff
-18069
File diff suppressed because it is too large
Load Diff
@@ -1,249 +0,0 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: SmLs04 (SmLs04.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 249)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Simon, Stephen D. and Lesage, James P. (1989).
|
||||
"Assessing the Accuracy of ANOVA Calculations in
|
||||
Statistical Software".
|
||||
Computational Statistics & Data Analysis, 8, pp. 325-332.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
9 Treatments
|
||||
21 Replicates/Cell
|
||||
189 Observations
|
||||
7 Constant Leading Digits
|
||||
Average Level of Difficulty
|
||||
Generated Data
|
||||
|
||||
|
||||
Model: 10 Parameters (mu,tau_1, ... , tau_9)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
|
||||
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
|
||||
|
||||
Certified R-Squared 4.82758620689655E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.00000000000000E-01
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Treatment Response
|
||||
1 1000000.4
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
2 1000000.3
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
3 1000000.5
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
4 1000000.3
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
5 1000000.5
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
6 1000000.3
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
7 1000000.5
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
8 1000000.3
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
9 1000000.5
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
File diff suppressed because it is too large
Load Diff
-18069
File diff suppressed because it is too large
Load Diff
@@ -1,249 +0,0 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: SmLs07 (SmLs07.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 249)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Simon, Stephen D. and Lesage, James P. (1989).
|
||||
"Assessing the Accuracy of ANOVA Calculations in
|
||||
Statistical Software".
|
||||
Computational Statistics & Data Analysis, 8, pp. 325-332.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
9 Treatments
|
||||
21 Replicates/Cell
|
||||
189 Observations
|
||||
13 Constant Leading Digits
|
||||
Higher Level of Difficulty
|
||||
Generated Data
|
||||
|
||||
|
||||
Model: 10 Parameters (mu,tau_1, ... , tau_9)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
|
||||
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
|
||||
|
||||
Certified R-Squared 4.82758620689655E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.00000000000000E-01
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Treatment Response
|
||||
1 1000000000000.4
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
2 1000000000000.3
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
3 1000000000000.5
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
4 1000000000000.3
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
5 1000000000000.5
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
6 1000000000000.3
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
7 1000000000000.5
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
8 1000000000000.3
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
9 1000000000000.5
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
File diff suppressed because it is too large
Load Diff
-18069
File diff suppressed because it is too large
Load Diff
-97
@@ -1,97 +0,0 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: Norris (Norris.dat)
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 31 to 46)
|
||||
Data (lines 61 to 96)
|
||||
|
||||
Procedure: Linear Least Squares Regression
|
||||
|
||||
Reference: Norris, J., NIST.
|
||||
Calibration of Ozone Monitors.
|
||||
|
||||
Data: 1 Response Variable (y)
|
||||
1 Predictor Variable (x)
|
||||
36 Observations
|
||||
Lower Level of Difficulty
|
||||
Observed Data
|
||||
|
||||
Model: Linear Class
|
||||
2 Parameters (B0,B1)
|
||||
|
||||
y = B0 + B1*x + e
|
||||
|
||||
|
||||
|
||||
Certified Regression Statistics
|
||||
|
||||
Standard Deviation
|
||||
Parameter Estimate of Estimate
|
||||
|
||||
B0 -0.262323073774029 0.232818234301152
|
||||
B1 1.00211681802045 0.429796848199937E-03
|
||||
|
||||
Residual
|
||||
Standard Deviation 0.884796396144373
|
||||
|
||||
R-Squared 0.999993745883712
|
||||
|
||||
|
||||
Certified Analysis of Variance Table
|
||||
|
||||
Source of Degrees of Sums of Mean
|
||||
Variation Freedom Squares Squares F Statistic
|
||||
|
||||
Regression 1 4255954.13232369 4255954.13232369 5436385.54079785
|
||||
Residual 34 26.6173985294224 0.782864662630069
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: y x
|
||||
0.1 0.2
|
||||
338.8 337.4
|
||||
118.1 118.2
|
||||
888.0 884.6
|
||||
9.2 10.1
|
||||
228.1 226.5
|
||||
668.5 666.3
|
||||
998.5 996.3
|
||||
449.1 448.6
|
||||
778.9 777.0
|
||||
559.2 558.2
|
||||
0.3 0.4
|
||||
0.1 0.6
|
||||
778.1 775.5
|
||||
668.8 666.9
|
||||
339.3 338.0
|
||||
448.9 447.5
|
||||
10.8 11.6
|
||||
557.7 556.0
|
||||
228.3 228.1
|
||||
998.0 995.8
|
||||
888.8 887.6
|
||||
119.6 120.2
|
||||
0.3 0.3
|
||||
0.6 0.3
|
||||
557.6 556.8
|
||||
339.3 339.1
|
||||
888.0 887.2
|
||||
998.5 999.0
|
||||
778.9 779.0
|
||||
10.2 11.1
|
||||
117.6 118.3
|
||||
228.9 229.2
|
||||
668.4 669.1
|
||||
449.2 448.9
|
||||
0.2 0.5
|
||||
|
||||
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -1,437 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy.stats import (binned_statistic, binned_statistic_2d,
|
||||
binned_statistic_dd)
|
||||
|
||||
from scipy._lib.six import u
|
||||
from .common_tests import check_named_results
|
||||
|
||||
|
||||
class TestBinnedStatistic(object):
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
np.random.seed(9865)
|
||||
cls.x = np.random.random(100)
|
||||
cls.y = np.random.random(100)
|
||||
cls.v = np.random.random(100)
|
||||
cls.X = np.random.random((100, 3))
|
||||
cls.w = np.random.random(100)
|
||||
|
||||
def test_1d_count(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
|
||||
count2, edges2 = np.histogram(x, bins=10)
|
||||
|
||||
assert_allclose(count1, count2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_gh5927(self):
|
||||
# smoke test for gh5927 - binned_statistic was using `is` for string
|
||||
# comparison
|
||||
x = self.x
|
||||
v = self.v
|
||||
statistics = [u'mean', u'median', u'count', u'sum']
|
||||
for statistic in statistics:
|
||||
res = binned_statistic(x, v, statistic, bins=10)
|
||||
|
||||
def test_1d_result_attributes(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
res = binned_statistic(x, v, 'count', bins=10)
|
||||
attributes = ('statistic', 'bin_edges', 'binnumber')
|
||||
check_named_results(res, attributes)
|
||||
|
||||
def test_1d_sum(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
|
||||
sum2, edges2 = np.histogram(x, bins=10, weights=v)
|
||||
|
||||
assert_allclose(sum1, sum2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_mean(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_std(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_min(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'min', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.min, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_max(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'max', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.max, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_median(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_bincode(self):
|
||||
x = self.x[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
|
||||
bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
|
||||
1, 2, 1])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_allclose(bc, bc2)
|
||||
assert_allclose(bcount, count1)
|
||||
|
||||
def test_1d_range_keyword(self):
|
||||
# Regression test for gh-3063, range can be (min, max) or [(min, max)]
|
||||
np.random.seed(9865)
|
||||
x = np.arange(30)
|
||||
data = np.random.random(30)
|
||||
|
||||
mean, bins, _ = binned_statistic(x[:15], data[:15])
|
||||
mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
|
||||
mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
|
||||
|
||||
assert_allclose(mean, mean_range)
|
||||
assert_allclose(bins, bins_range)
|
||||
assert_allclose(mean, mean_range2)
|
||||
assert_allclose(bins, bins_range2)
|
||||
|
||||
def test_1d_multi_values(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
w = self.w
|
||||
|
||||
stat1v, edges1v, bc1v = binned_statistic(x, v, 'mean', bins=10)
|
||||
stat1w, edges1w, bc1w = binned_statistic(x, w, 'mean', bins=10)
|
||||
stat2, edges2, bc2 = binned_statistic(x, [v, w], 'mean', bins=10)
|
||||
|
||||
assert_allclose(stat2[0], stat1v)
|
||||
assert_allclose(stat2[1], stat1w)
|
||||
assert_allclose(edges1v, edges2)
|
||||
assert_allclose(bc1v, bc2)
|
||||
|
||||
def test_2d_count(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
count1, binx1, biny1, bc = binned_statistic_2d(
|
||||
x, y, v, 'count', bins=5)
|
||||
count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
|
||||
|
||||
assert_allclose(count1, count2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_result_attributes(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
res = binned_statistic_2d(x, y, v, 'count', bins=5)
|
||||
attributes = ('statistic', 'x_edge', 'y_edge', 'binnumber')
|
||||
check_named_results(res, attributes)
|
||||
|
||||
def test_2d_sum(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
|
||||
sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
|
||||
|
||||
assert_allclose(sum1, sum2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_mean(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_mean_unicode(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(
|
||||
x, y, v, u('mean'), bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_std(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_min(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'min', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.min, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_max(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'max', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.max, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_median(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(
|
||||
x, y, v, 'median', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(
|
||||
x, y, v, np.median, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_bincode(self):
|
||||
x = self.x[:20]
|
||||
y = self.y[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, binx1, biny1, bc = binned_statistic_2d(
|
||||
x, y, v, 'count', bins=3)
|
||||
bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
|
||||
6, 11, 16, 6, 6, 11, 8])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_allclose(bc, bc2)
|
||||
count1adj = count1[count1.nonzero()]
|
||||
assert_allclose(bcount, count1adj)
|
||||
|
||||
def test_2d_multi_values(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
w = self.w
|
||||
|
||||
stat1v, binx1v, biny1v, bc1v = binned_statistic_2d(
|
||||
x, y, v, 'mean', bins=8)
|
||||
stat1w, binx1w, biny1w, bc1w = binned_statistic_2d(
|
||||
x, y, w, 'mean', bins=8)
|
||||
stat2, binx2, biny2, bc2 = binned_statistic_2d(
|
||||
x, y, [v, w], 'mean', bins=8)
|
||||
|
||||
assert_allclose(stat2[0], stat1v)
|
||||
assert_allclose(stat2[1], stat1w)
|
||||
assert_allclose(binx1v, binx2)
|
||||
assert_allclose(biny1w, biny2)
|
||||
assert_allclose(bc1v, bc2)
|
||||
|
||||
def test_2d_binnumbers_unraveled(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat, edgesx, bcx = binned_statistic(x, v, 'mean', bins=20)
|
||||
stat, edgesy, bcy = binned_statistic(y, v, 'mean', bins=10)
|
||||
|
||||
stat2, edgesx2, edgesy2, bc2 = binned_statistic_2d(
|
||||
x, y, v, 'mean', bins=(20, 10), expand_binnumbers=True)
|
||||
|
||||
bcx3 = np.searchsorted(edgesx, x, side='right')
|
||||
bcy3 = np.searchsorted(edgesy, y, side='right')
|
||||
|
||||
# `numpy.searchsorted` is non-inclusive on right-edge, compensate
|
||||
bcx3[x == x.max()] -= 1
|
||||
bcy3[y == y.max()] -= 1
|
||||
|
||||
assert_allclose(bcx, bc2[0])
|
||||
assert_allclose(bcy, bc2[1])
|
||||
assert_allclose(bcx3, bc2[0])
|
||||
assert_allclose(bcy3, bc2[1])
|
||||
|
||||
def test_dd_count(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
|
||||
count2, edges2 = np.histogramdd(X, bins=3)
|
||||
|
||||
assert_allclose(count1, count2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_result_attributes(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
res = binned_statistic_dd(X, v, 'count', bins=3)
|
||||
attributes = ('statistic', 'bin_edges', 'binnumber')
|
||||
check_named_results(res, attributes)
|
||||
|
||||
def test_dd_sum(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
|
||||
sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
|
||||
|
||||
assert_allclose(sum1, sum2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_mean(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_std(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_min(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'min', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.min, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_max(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'max', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.max, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_median(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_bincode(self):
|
||||
X = self.X[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
|
||||
bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
|
||||
32, 36, 91, 43, 87, 81, 81])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_allclose(bc, bc2)
|
||||
count1adj = count1[count1.nonzero()]
|
||||
assert_allclose(bcount, count1adj)
|
||||
|
||||
def test_dd_multi_values(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
w = self.w
|
||||
|
||||
stat1v, edges1v, bc1v = binned_statistic_dd(X, v, np.std, bins=8)
|
||||
stat1w, edges1w, bc1w = binned_statistic_dd(X, w, np.std, bins=8)
|
||||
stat2, edges2, bc2 = binned_statistic_dd(X, [v, w], np.std, bins=8)
|
||||
|
||||
assert_allclose(stat2[0], stat1v)
|
||||
assert_allclose(stat2[1], stat1w)
|
||||
assert_allclose(edges1v, edges2)
|
||||
assert_allclose(edges1w, edges2)
|
||||
assert_allclose(bc1v, bc2)
|
||||
|
||||
def test_dd_binnumbers_unraveled(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat, edgesx, bcx = binned_statistic(X[:, 0], v, 'mean', bins=15)
|
||||
stat, edgesy, bcy = binned_statistic(X[:, 1], v, 'mean', bins=20)
|
||||
stat, edgesz, bcz = binned_statistic(X[:, 2], v, 'mean', bins=10)
|
||||
|
||||
stat2, edges2, bc2 = binned_statistic_dd(
|
||||
X, v, 'mean', bins=(15, 20, 10), expand_binnumbers=True)
|
||||
|
||||
assert_allclose(bcx, bc2[0])
|
||||
assert_allclose(bcy, bc2[1])
|
||||
assert_allclose(bcz, bc2[2])
|
||||
@@ -1,200 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import (assert_equal, assert_array_equal,
|
||||
assert_array_almost_equal, assert_approx_equal, assert_allclose)
|
||||
from pytest import raises as assert_raises
|
||||
|
||||
from scipy.special import xlogy
|
||||
from scipy.stats.contingency import margins, expected_freq, chi2_contingency
|
||||
|
||||
|
||||
def test_margins():
    """Check `margins` on 1-D, 2-D and 3-D tables with known sums."""
    # 1-D: a single margin that equals the input itself.
    single = margins(np.array([1]))
    assert_equal(len(single), 1)
    assert_array_equal(single[0], np.array([1]))

    # 1x1 table: both margins are the lone cell, each keeping 2-D shape.
    row_margin, col_margin = margins(np.array([[1]]))
    assert_array_equal(row_margin, np.array([[1]]))
    assert_array_equal(col_margin, np.array([[1]]))

    # 2x6 table.
    row_margin, col_margin = margins(np.arange(12).reshape(2, 6))
    assert_array_equal(row_margin, np.array([[15], [51]]))
    assert_array_equal(col_margin, np.array([[6, 8, 10, 12, 14, 16]]))

    # 2x3x4 table: one margin per axis.
    axis0, axis1, axis2 = margins(np.arange(24).reshape(2, 3, 4))
    assert_array_equal(axis0, np.array([[[66]], [[210]]]))
    assert_array_equal(axis1, np.array([[[60], [92], [124]]]))
    assert_array_equal(axis2, np.array([[[60, 66, 72, 78]]]))
|
||||
|
||||
|
||||
def test_expected_freq():
    """Check `expected_freq` against hand-computed expected tables."""
    # A single observation is its own expectation.
    assert_array_equal(expected_freq([1]), np.array([1.0]))

    # 3x2x2 table whose expected frequencies are all ones.
    balanced = np.array([[[2, 0], [0, 2]],
                         [[0, 2], [2, 0]],
                         [[1, 1], [1, 1]]])
    assert_array_equal(expected_freq(balanced), np.ones_like(balanced))

    # 2x3 table with a hand-computed expectation.
    table = np.array([[10, 10, 20], [20, 20, 20]])
    wanted = np.array([[12., 12., 16.], [18., 18., 24.]])
    assert_array_almost_equal(expected_freq(table), wanted)
|
||||
|
||||
|
||||
def test_chi2_contingency_trivial():
    """Some very simple tests for chi2_contingency."""
    trivial_cases = (
        # A trivial case
        (np.array([[1, 2], [1, 2]]), 1),
        # A *really* trivial case: 1-D data.
        (np.array([1, 2, 3]), 0),
    )
    for obs, wanted_dof in trivial_cases:
        chi2, p, dof, expected = chi2_contingency(obs, correction=False)
        assert_equal(chi2, 0.0)
        assert_equal(p, 1.0)
        assert_equal(dof, wanted_dof)
        assert_array_equal(obs, expected)
|
||||
|
||||
|
||||
def test_chi2_contingency_R():
    """Compare chi2_contingency with results computed independently in R.

    The R sessions that produced the reference numbers are kept as
    comments next to each case. They were previously stored in local
    string variables that were never read.
    """
    # ---- 3-way table ---------------------------------------------------
    # R code:
    #
    #   # Data vector.
    #   data <- c(
    #     12, 34, 23,     4,  47,  11,
    #     35, 31, 11,    34,  10,  18,
    #     12, 32,  9,    18,  13,  19,
    #     12, 12, 14,     9,  33,  25
    #     )
    #
    #   # Create factor tags:r=rows, c=columns, t=tiers
    #   r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
    #   c <- factor(gl(3, 1,   2*3*4, labels=c("c1", "c2", "c3")))
    #   t <- factor(gl(2, 3,   2*3*4, labels=c("t1", "t2")))
    #
    #   # 3-way Chi squared test of independence
    #   s = summary(xtabs(data~r+c+t))
    #   print(s)
    #
    # R output:
    #
    #   Call: xtabs(formula = data ~ r + c + t)
    #   Number of cases in table: 478
    #   Number of factors: 3
    #   Test for independence of all factors:
    #           Chisq = 102.17, df = 17, p-value = 3.514e-14
    obs = np.array(
        [[[12, 34, 23],
          [35, 31, 11],
          [12, 32, 9],
          [12, 12, 14]],
         [[4, 47, 11],
          [34, 10, 18],
          [18, 13, 19],
          [9, 33, 25]]])
    chi2, p, dof, expected = chi2_contingency(obs)
    assert_approx_equal(chi2, 102.17, significant=5)
    assert_approx_equal(p, 3.514e-14, significant=4)
    assert_equal(dof, 17)

    # ---- 4-way table ---------------------------------------------------
    # R code:
    #
    #   # Data vector.
    #   data <- c(
    #       #
    #       12, 17,
    #       11, 16,
    #       #
    #       11, 12,
    #       15, 16,
    #       #
    #       23, 15,
    #       30, 22,
    #       #
    #       14, 17,
    #       15, 16
    #       )
    #
    #   # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
    #   r <- factor(gl(2, 2,  2*2*2*2, labels=c("r1", "r2")))
    #   c <- factor(gl(2, 1,  2*2*2*2, labels=c("c1", "c2")))
    #   d <- factor(gl(2, 4,  2*2*2*2, labels=c("d1", "d2")))
    #   t <- factor(gl(2, 8,  2*2*2*2, labels=c("t1", "t2")))
    #
    #   # 4-way Chi squared test of independence
    #   s = summary(xtabs(data~r+c+d+t))
    #   print(s)
    #
    # R output:
    #
    #   Call: xtabs(formula = data ~ r + c + d + t)
    #   Number of cases in table: 262
    #   Number of factors: 4
    #   Test for independence of all factors:
    #           Chisq = 8.758, df = 11, p-value = 0.6442
    obs = np.array(
        [[[[12, 17],
           [11, 16]],
          [[11, 12],
           [15, 16]]],
         [[[23, 15],
           [30, 22]],
          [[14, 17],
           [15, 16]]]])
    chi2, p, dof, expected = chi2_contingency(obs)
    assert_approx_equal(chi2, 8.758, significant=4)
    assert_approx_equal(p, 0.6442, significant=4)
    assert_equal(dof, 11)
|
||||
|
||||
|
||||
def test_chi2_contingency_g():
    """Check the log-likelihood (G-test) statistic of chi2_contingency."""

    def g_reference(observed, expected):
        # G = 2 * sum(O * log(O / E)), evaluated with xlogy
        return 2*xlogy(observed, observed/expected).sum()

    table = np.array([[15, 60], [15, 90]])

    # Without continuity correction.
    g, p, dof, e = chi2_contingency(table, lambda_='log-likelihood',
                                    correction=False)
    assert_allclose(g, g_reference(table, e))

    # With correction, the reference uses the table shifted by 0.5 per cell.
    g, p, dof, e = chi2_contingency(table, lambda_='log-likelihood',
                                    correction=True)
    shifted = table + np.array([[-0.5, 0.5], [0.5, -0.5]])
    assert_allclose(g, g_reference(shifted, e))

    # 2x3 table with default arguments.
    table = np.array([[10, 12, 10], [12, 10, 10]])
    g, p, dof, e = chi2_contingency(table, lambda_='log-likelihood')
    assert_allclose(g, g_reference(table, e))
|
||||
|
||||
|
||||
def test_chi2_contingency_bad_args():
    """Test that "bad" inputs raise a ValueError."""
    bad_tables = (
        # Negative value in the array of observed frequencies.
        np.array([[-1, 10], [1, 2]]),
        # The zeros in this will result in zeros in the array
        # of expected frequencies.
        np.array([[0, 1], [0, 1]]),
        # A degenerate case: `observed` has size 0.
        np.empty((0, 8)),
    )
    for obs in bad_tables:
        assert_raises(ValueError, chi2_contingency, obs)
|
||||
|
||||
@@ -1,420 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
import numpy.testing as npt
|
||||
import pytest
|
||||
from pytest import raises as assert_raises
|
||||
from scipy._lib._numpy_compat import suppress_warnings
|
||||
from scipy.integrate import IntegrationWarning
|
||||
|
||||
from scipy import stats
|
||||
from scipy.special import betainc
|
||||
from. common_tests import (check_normalization, check_moment, check_mean_expect,
|
||||
check_var_expect, check_skew_expect,
|
||||
check_kurt_expect, check_entropy,
|
||||
check_private_entropy, check_entropy_vect_scale,
|
||||
check_edge_support, check_named_args,
|
||||
check_random_state_property,
|
||||
check_meth_dtype, check_ppf_dtype, check_cmplx_deriv,
|
||||
check_pickling, check_rvs_broadcast)
|
||||
from scipy.stats._distr_params import distcont
|
||||
|
||||
"""
|
||||
Test all continuous distributions.
|
||||
|
||||
Parameters were chosen for those distributions that pass the
|
||||
Kolmogorov-Smirnov test. This provides safe parameters for each
|
||||
distribution so that we can perform further testing of class methods.
|
||||
|
||||
These tests currently check only/mostly for serious errors and exceptions,
|
||||
not for numerically exact results.
|
||||
"""
|
||||
|
||||
# Note that you need to add new distributions you want tested
# to _distr_params

# Precision (decimal places) of the tests; increased from 0 to 5.
DECIMAL = 5

# Last four of these fail all around. Need to be checked
distcont_extra = [
    ['betaprime', (100, 86)],
    ['fatiguelife', (5,)],
    ['mielke', (4.6420495492121487, 0.59707419545516938)],
    ['invweibull', (0.58847112119264788,)],
    # burr: sample mean test fails still for c<1
    ['burr', (0.94839838075366045, 4.3820284068855795)],
    # genextreme: sample mean test, sf-logsf test fail
    ['genextreme', (3.3184017469423535,)],
]

# Distributions marked as slow, sorted by speed (very slow to slow).
distslow = [
    'kappa4', 'rdist', 'gausshyper', 'recipinvgauss', 'ksone', 'genexpon',
    'vonmises', 'vonmises_line', 'mielke', 'semicircular', 'cosine',
    'invweibull', 'powerlognorm', 'johnsonsu', 'kstwobign',
]

# These distributions fail the complex derivative test below.
# Here 'fail' means producing wrong results and/or raising exceptions,
# depending on the implementation details of the corresponding special
# functions.
# cf https://github.com/scipy/scipy/pull/4979 for a discussion.
fails_cmplx = {
    'beta', 'betaprime', 'chi', 'chi2', 'dgamma', 'dweibull',
    'erlang', 'f', 'gamma', 'gausshyper', 'gengamma',
    'gennorm', 'genpareto', 'halfgennorm', 'invgamma',
    'ksone', 'kstwobign', 'levy_l', 'loggamma', 'logistic',
    'maxwell', 'nakagami', 'ncf', 'nct', 'ncx2', 'norminvgauss',
    'pearson3', 'rice', 't', 'skewnorm', 'tukeylambda',
    'vonmises', 'vonmises_line', 'rv_histogram_instance',
}

# Histogram-backed distribution instance, tested alongside the named ones.
_h = np.histogram(
    [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8,
     9],
    bins=8)
histogram_test_instance = stats.rv_histogram(_h)
|
||||
|
||||
|
||||
def cases_test_cont_basic():
    """Yield the ``(distname, arg)`` cases for `test_cont_basic`.

    The cases are every entry of `distcont` plus the histogram instance.
    'levy_stable' is left out, and entries listed in `distslow` are
    wrapped in a ``pytest.param`` carrying the ``slow`` mark.
    """
    candidates = distcont[:] + [(histogram_test_instance, tuple())]
    for distname, arg in candidates:
        if distname == 'levy_stable':
            continue
        if distname in distslow:
            yield pytest.param(distname, arg, marks=pytest.mark.slow)
            continue
        yield distname, arg
|
||||
|
||||
|
||||
@pytest.mark.parametrize('distname,arg', cases_test_cont_basic())
def test_cont_basic(distname, arg):
    """Run the basic consistency checks on one continuous distribution.

    `distname` is either the name of a distribution in `scipy.stats` or a
    distribution instance (the histogram test instance); `arg` holds its
    shape parameters.
    """
    # this test skips slow distributions

    if distname == 'truncnorm':
        pytest.xfail(reason=distname)

    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # `distname` was not a string: it is the distribution object itself.
        distfn, distname = distname, 'rv_histogram_instance'

    np.random.seed(765456)
    n_draws = 500
    with suppress_warnings() as sup:
        # frechet_l and frechet_r are deprecated, so all their
        # methods generate DeprecationWarnings.
        sup.filter(category=DeprecationWarning, message=".*frechet_")
        sample = distfn.rvs(size=n_draws, *arg)
        sample_mean = sample.mean()
        sample_var = sample.var()
        m, v = distfn.stats(*arg)

        check_sample_meanvar_(distfn, arg, m, v, sample_mean, sample_var,
                              n_draws, distname + 'sample mean test')
        check_cdf_ppf(distfn, arg, distname)
        check_sf_isf(distfn, arg, distname)
        check_pdf(distfn, arg, distname)
        check_pdf_logpdf(distfn, arg, distname)
        check_cdf_logcdf(distfn, arg, distname)
        check_sf_logsf(distfn, arg, distname)

        alpha = 0.01
        # The histogram instance cannot be referred to by name, so its cdf
        # is passed instead.
        if distname == 'rv_histogram_instance':
            rvs_reference = distfn.cdf
        else:
            rvs_reference = distname
        check_distribution_rvs(rvs_reference, arg, alpha, sample)

        locscale_defaults = (0, 1)
        meths = [distfn.pdf, distfn.logpdf, distfn.cdf, distfn.logcdf,
                 distfn.logsf]
        # make sure arguments are within support
        spec_x = {'frechet_l': -0.5, 'weibull_max': -0.5, 'levy_l': -0.5,
                  'pareto': 1.5, 'tukeylambda': 0.3,
                  'rv_histogram_instance': 5.0}
        x = spec_x.get(distname, 0.5)
        # These two distributions use different shape arguments for the
        # remaining checks.
        arg = {'invweibull': (1,), 'ksone': (3,)}.get(distname, arg)
        check_named_args(distfn, x, arg, locscale_defaults, meths)
        check_random_state_property(distfn, arg)
        check_pickling(distfn, arg)

        # Entropy
        if distname not in ['ksone', 'kstwobign']:
            check_entropy(distfn, arg, distname)

        if distfn.numargs == 0:
            check_vecentropy(distfn, arg)

        overrides_entropy = (distfn.__class__._entropy
                             != stats.rv_continuous._entropy)
        if overrides_entropy and distname != 'vonmises':
            check_private_entropy(distfn, arg, stats.rv_continuous)

        with suppress_warnings() as sup:
            sup.filter(IntegrationWarning, "The occurrence of roundoff error")
            sup.filter(IntegrationWarning, "Extremely bad integrand")
            sup.filter(RuntimeWarning, "invalid value")
            check_entropy_vect_scale(distfn, arg)

        check_edge_support(distfn, arg)

        check_meth_dtype(distfn, arg, meths)
        check_ppf_dtype(distfn, arg)

        if distname not in fails_cmplx:
            check_cmplx_deriv(distfn, arg)

        if distname != 'truncnorm':
            check_ppf_private(distfn, arg, distname)
|
||||
|
||||
|
||||
def test_levy_stable_random_state_property():
    """Apply `check_random_state_property` to levy_stable."""
    # levy_stable only implements rvs(), so it is skipped in the
    # main loop in test_cont_basic(). Here we apply just the test
    # check_random_state_property to levy_stable.
    shape_args = (0.5, 0.1)
    check_random_state_property(stats.levy_stable, shape_args)
|
||||
|
||||
|
||||
def cases_test_moments():
    """Yield the parameter tuples for `test_moments`.

    Each tuple is ``(distname, arg, normalization_ok, higher_ok,
    is_xfailing)``. Distributions with known problems are yielded twice:
    once with the problematic parts switched off, and once with
    everything switched on but marked as an expected failure.
    """
    fail_normalization = {'vonmises', 'ksone'}
    fail_higher = {'vonmises', 'ksone', 'ncf'}

    candidates = distcont[:] + [(histogram_test_instance, tuple())]
    for distname, arg in candidates:
        if distname == 'levy_stable':
            continue

        normalization_ok = distname not in fail_normalization
        higher_ok = distname not in fail_higher

        yield distname, arg, normalization_ok, higher_ok, False

        if not (normalization_ok and higher_ok):
            # Run the distributions that have issues twice, once skipping the
            # not_ok parts, once with the not_ok parts but marked as knownfail
            yield pytest.param(distname, arg, True, True, True,
                               marks=pytest.mark.xfail)
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.parametrize('distname,arg,normalization_ok,higher_ok,is_xfailing',
                         cases_test_moments())
def test_moments(distname, arg, normalization_ok, higher_ok, is_xfailing):
    """Check normalization and moments of one distribution.

    `normalization_ok` and `higher_ok` switch the normalization check and
    the mean/variance/skew/kurtosis expectation checks on or off.
    `is_xfailing` marks the expected-failure rerun, for which every
    IntegrationWarning is silenced.
    """
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # `distname` was not a string: it is the distribution object itself.
        distfn, distname = distname, 'rv_histogram_instance'

    with suppress_warnings() as sup:
        sup.filter(IntegrationWarning,
                   "The integral is probably divergent, or slowly convergent.")
        sup.filter(category=DeprecationWarning, message=".*frechet_")
        if is_xfailing:
            sup.filter(IntegrationWarning)

        mean, var, skew, kurt = distfn.stats(*arg, moments='mvsk')

        if normalization_ok:
            check_normalization(distfn, arg, distname)

        if higher_ok:
            check_mean_expect(distfn, arg, mean, distname)
            check_skew_expect(distfn, arg, mean, var, skew, distname)
            check_var_expect(distfn, arg, mean, var, distname)
            check_kurt_expect(distfn, arg, mean, var, kurt, distname)

        check_loc_scale(distfn, arg, mean, var, distname)
        check_moment(distfn, arg, mean, var, distname)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dist,shape_args', distcont)
def test_rvs_broadcast(dist, shape_args):
    """Check that rvs() broadcasts the shape parameters, loc and scale."""
    if dist in ['gausshyper', 'genexpon']:
        pytest.skip("too slow")

    # If shape_only is True, it means the _rvs method of the
    # distribution uses more than one random number to generate a random
    # variate. That means the result of using rvs with broadcasting or
    # with a nontrivial size will not necessarily be the same as using the
    # numpy.vectorize'd version of rvs(), so we can only compare the shapes
    # of the results, not the values.
    # Whether or not a distribution is in the following list is an
    # implementation detail of the distribution, not a requirement. If
    # the implementation of the rvs() method of a distribution changes,
    # this test might also have to be changed.
    multi_draw_dists = ['betaprime', 'dgamma', 'exponnorm', 'norminvgauss',
                        'nct', 'dweibull', 'rice', 'levy_stable', 'skewnorm']
    shape_only = dist in multi_draw_dists

    distfunc = getattr(stats, dist)
    nargs = distfunc.numargs

    # Generate shape parameter arguments: the k-th one has length k + 4
    # along its own leading axis, followed by k + 2 axes of length 1.
    allargs = [shape_args[k]*np.ones((k + 4,) + (1,)*(k + 2))
               for k in range(nargs)]
    allargs.extend([np.zeros(2), np.ones((3, 1))])  # loc, scale

    # bshape holds the expected shape when loc, scale, and the shape
    # parameters are all broadcast together.
    bshape = list(range(nargs + 3, 3, -1)) + [3, 2]

    check_rvs_broadcast(distfunc, dist, allargs, bshape, shape_only, 'd')
|
||||
|
||||
|
||||
def test_rvs_gh2069_regression():
    """Regression tests for gh-2069 (rvs with array-valued loc/scale)."""
    # In scipy 0.17 and earlier, these tests would fail.
    #
    # A typical example of the broken behavior:
    # >>> norm.rvs(loc=np.zeros(5), scale=np.ones(5))
    # array([-2.49613705, -2.49613705, -2.49613705, -2.49613705, -2.49613705])
    np.random.seed(123)
    all_equal_msg = "All the values are equal, but they shouldn't be!"
    broadcast_cases = [
        dict(loc=np.zeros(5), scale=1),
        dict(loc=0, scale=np.ones(5)),
        dict(loc=np.zeros(5), scale=np.ones(5)),
        dict(loc=np.array([[0], [0]]), scale=np.ones(5)),
    ]
    for kwargs in broadcast_cases:
        vals = stats.norm.rvs(**kwargs)
        # ravel() leaves the 1-D results unchanged and flattens the 2-D one
        steps = np.diff(vals.ravel())
        npt.assert_(np.all(steps != 0), all_equal_msg)

    # Parameter shapes that cannot be broadcast to `size` must be rejected.
    assert_raises(ValueError, stats.norm.rvs, [[0, 0], [0, 0]],
                  [[1, 1], [1, 1]], 1)
    assert_raises(ValueError, stats.gamma.rvs, [2, 3, 4, 5], 0, 1, (2, 2))
    assert_raises(ValueError, stats.gamma.rvs, [1, 1, 1, 1], [0, 0, 0, 0],
                  [[1], [2]], (4,))
|
||||
|
||||
|
||||
def check_sample_meanvar_(distfn, arg, m, v, sm, sv, sn, msg):
    """Test sample mean `sm` and sample variance `sv` against `m` and `v`.

    A check is skipped when the corresponding population value is not
    finite.  `distfn`, `arg` and `msg` are accepted but not used here.
    """
    # this did not work, skipped silently by nose
    checks = (
        (m, check_sample_mean, (sm, sv, sn, m)),
        (v, check_sample_var, (sv, sn, v)),
    )
    for population_value, checker, checker_args in checks:
        if np.isfinite(population_value):
            checker(*checker_args)
|
||||
|
||||
|
||||
def check_sample_mean(sm, v, n, popmean):
    """One-sample t-test of the sample mean `sm` against `popmean`.

    Adapted from stats.stats.ttest_1samp(a, popmean).  `v` is the sample
    variance and `n` the sample size.  Asserts that the two-tailed
    probability exceeds 0.01.
    """
    dof = n - 1
    sample_var = (dof*v) / float(dof)    # looks redundant
    std_err = np.sqrt(sample_var*(1.0/n))
    t_stat = (sm-popmean) / std_err
    # two-tailed probability of the t statistic via the incomplete beta
    two_tailed = betainc(0.5*dof, 0.5, dof/(dof + t_stat*t_stat))

    failure = ('mean fail, t,prob = %f, %f, m, sm=%f,%f' %
               (t_stat, two_tailed, popmean, sm))
    npt.assert_(two_tailed > 0.01, failure)
|
||||
|
||||
|
||||
def check_sample_var(sv, n, popvar):
    """Chi-square test of the sample variance `sv` against `popvar`.

    Parameters
    ----------
    sv : float
        Sample variance.
    n : int
        Sample size; the test uses ``n - 1`` degrees of freedom.
    popvar : float
        Hypothesized (population) variance.

    Raises
    ------
    AssertionError
        (via ``npt.assert_``) if the p-value is not above 0.01.
    """
    df = n - 1
    # The statistic must use the *sample* variance.  The previous
    # ``(n-1)*popvar/float(popvar)`` always equalled n-1, so `sv` was
    # never actually tested.
    chi2 = df*sv/float(popvar)
    # NOTE(review): doubling the upper tail is kept from the original; it
    # can exceed 1 and only rejects a sample variance that is too large.
    pval = stats.distributions.chi2.sf(chi2, df) * 2
    npt.assert_(pval > 0.01, 'var fail, t, pval = %f, %f, v, sv=%f, %f' %
                (chi2, pval, popvar, sv))
|
||||
|
||||
|
||||
def check_cdf_ppf(distfn, arg, msg):
    """Assert that ``cdf(ppf(q))`` reproduces `q` at three probabilities."""
    values = [0.001, 0.5, 0.999]
    quantiles = distfn.ppf(values, *arg)
    roundtrip = distfn.cdf(quantiles, *arg)
    npt.assert_almost_equal(roundtrip, values, decimal=DECIMAL,
                            err_msg=msg + ' - cdf-ppf roundtrip')
|
||||
|
||||
|
||||
def check_sf_isf(distfn, arg, msg):
    """Assert the sf/isf round trip and the ``cdf == 1 - sf`` identity."""
    # sf(isf(q)) must give back q
    probabilities = [0.1, 0.5, 0.9]
    recovered = distfn.sf(distfn.isf(probabilities, *arg), *arg)
    npt.assert_almost_equal(recovered, [0.1, 0.5, 0.9], decimal=DECIMAL,
                            err_msg=msg + ' - sf-isf roundtrip')

    # cdf and sf must be complementary at the same points
    points = [0.1, 0.9]
    cdf_values = distfn.cdf(points, *arg)
    complement = 1.0 - distfn.sf([0.1, 0.9], *arg)
    npt.assert_almost_equal(cdf_values, complement, decimal=DECIMAL,
                            err_msg=msg + ' - cdf-sf relationship')
|
||||
|
||||
|
||||
def check_pdf(distfn, arg, msg):
    """Compare the pdf at the median with a central difference of the cdf."""
    eps = 1e-6
    point = distfn.ppf(0.5, *arg)
    density = distfn.pdf(point, *arg)
    if (density < 1e-4) or (density > 1e4):
        # avoid checking a case where pdf is close to zero or
        # huge (singularity): move slightly off the median
        point = point + 0.1
        density = distfn.pdf(point, *arg)

    cdf_above = distfn.cdf(point + eps, *arg)
    cdf_below = distfn.cdf(point - eps, *arg)
    numerical_density = (cdf_above - cdf_below)/eps/2.0
    # replace with better diff and better test (more points),
    # actually, this works pretty well
    msg += ' - cdf-pdf relationship'
    npt.assert_almost_equal(density, numerical_density, decimal=DECIMAL,
                            err_msg=msg)
|
||||
|
||||
|
||||
def check_pdf_logpdf(distfn, args, msg):
    """Assert ``log(pdf(x)) == logpdf(x)`` at several quantiles."""
    probabilities = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
    x = distfn.ppf(probabilities, *args)
    density = distfn.pdf(x, *args)
    log_density = distfn.logpdf(x, *args)

    # Drop zero densities and non-finite logs before comparing.
    nonzero_density = density[density != 0]
    finite_log_density = log_density[np.isfinite(log_density)]

    npt.assert_almost_equal(np.log(nonzero_density), finite_log_density,
                            decimal=7,
                            err_msg=msg + " - logpdf-log(pdf) relationship")
|
||||
|
||||
|
||||
def check_sf_logsf(distfn, args, msg):
    """Assert ``log(sf(x)) == logsf(x)`` at several quantiles."""
    probabilities = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
    x = distfn.ppf(probabilities, *args)
    survival = distfn.sf(x, *args)
    log_survival = distfn.logsf(x, *args)

    # Drop zero survival values and non-finite logs before comparing.
    nonzero_survival = survival[survival != 0]
    finite_log_survival = log_survival[np.isfinite(log_survival)]

    npt.assert_almost_equal(np.log(nonzero_survival), finite_log_survival,
                            decimal=7,
                            err_msg=msg + " - logsf-log(sf) relationship")
|
||||
|
||||
|
||||
def check_cdf_logcdf(distfn, args, msg):
    """Assert ``log(cdf(x)) == logcdf(x)`` at several quantiles."""
    probabilities = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
    x = distfn.ppf(probabilities, *args)
    cumulative = distfn.cdf(x, *args)
    log_cumulative = distfn.logcdf(x, *args)

    # Drop zero cdf values and non-finite logs before comparing.
    nonzero_cumulative = cumulative[cumulative != 0]
    finite_log_cumulative = log_cumulative[np.isfinite(log_cumulative)]

    npt.assert_almost_equal(np.log(nonzero_cumulative), finite_log_cumulative,
                            decimal=7,
                            err_msg=msg + " - logcdf-log(cdf) relationship")
|
||||
|
||||
|
||||
def check_distribution_rvs(dist, args, alpha, rvs):
    """Kolmogorov-Smirnov check of `rvs` against the distribution `dist`.

    Test from scipy.stats.tests; this version reuses existing random
    variables.  If the supplied sample has a p-value below `alpha`,
    ``kstest`` is called again with the distribution name in place of the
    sample, and only that second result is asserted.
    """
    D, pval = stats.kstest(rvs, dist, args=args, N=1000)
    if not (pval < alpha):
        return

    D, pval = stats.kstest(dist, '', args=args, N=1000)
    report = "".join(["D = ", str(D), "; pval = ", str(pval),
                      "; alpha = ", str(alpha), "\nargs = ", str(args)])
    npt.assert_(pval > alpha, report)
|
||||
|
||||
|
||||
def check_vecentropy(distfn, args):
    """Assert that ``vecentropy`` and ``_entropy`` agree for `args`."""
    vectorized = distfn.vecentropy(*args)
    private = distfn._entropy(*args)
    npt.assert_equal(vectorized, private)
|
||||
|
||||
|
||||
def check_loc_scale(distfn, arg, m, v, msg):
    """Assert that mean and variance transform correctly under loc/scale.

    `m` and `v` are the mean and variance for the default location and
    scale.  `msg` is accepted but not used here.
    """
    shift = 10.0
    stretch = 10.0
    shifted_mean, shifted_var = distfn.stats(loc=shift, scale=stretch, *arg)
    npt.assert_allclose(m*stretch + shift, shifted_mean)
    npt.assert_allclose(v*stretch*stretch, shifted_var)
|
||||
|
||||
|
||||
def check_ppf_private(distfn, arg, msg):
    """Assert that the private ``_ppf`` returns no nan at 0.1, 0.5, 0.9."""
    # fails by design for truncnorm self.nb not defined
    probabilities = np.array([0.1, 0.5, 0.9])
    quantiles = distfn._ppf(probabilities, *arg)
    has_nan = np.any(np.isnan(quantiles))
    npt.assert_(not has_nan, msg + 'ppf private is nan')
|
||||
|
||||
@@ -1,234 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy.testing as npt
|
||||
import numpy as np
|
||||
from scipy._lib.six import xrange
|
||||
import pytest
|
||||
|
||||
from scipy import stats
|
||||
from .common_tests import (check_normalization, check_moment, check_mean_expect,
|
||||
check_var_expect, check_skew_expect,
|
||||
check_kurt_expect, check_entropy,
|
||||
check_private_entropy, check_edge_support,
|
||||
check_named_args, check_random_state_property,
|
||||
check_pickling, check_rvs_broadcast)
|
||||
from scipy.stats._distr_params import distdiscrete
|
||||
|
||||
# Add one distribution built from explicit (support, probabilities) values
# to the list of discrete test cases; it takes no shape parameters.
vals = ([1, 2, 3, 4], [0.1, 0.2, 0.3, 0.4])
distdiscrete.append([stats.rv_discrete(values=vals), ()])
|
||||
|
||||
|
||||
def cases_test_discrete_basic():
    """Yield ``(distname, arg, first_case)`` for every discrete test case.

    ``first_case`` is True only the first time a given `distname` occurs.
    """
    already_yielded = set()
    for entry in distdiscrete:
        distname, arg = entry
        is_first = distname not in already_yielded
        already_yielded.add(distname)
        yield distname, arg, is_first
|
||||
|
||||
|
||||
@pytest.mark.parametrize('distname,arg,first_case', cases_test_discrete_basic())
def test_discrete_basic(distname, arg, first_case):
    """Run the basic consistency checks on one discrete distribution.

    `distname` is either the name of a distribution in `stats` or a
    distribution instance.  The extra checks at the end run only when
    `first_case` is true.
    """
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # `distname` is not a string: use it as the distribution itself.
        distfn, distname = distname, 'sample distribution'

    np.random.seed(9765456)
    rvs = distfn.rvs(size=2000, *arg)
    supp = np.unique(rvs)
    _mean, _var = distfn.stats(*arg)

    check_cdf_ppf(distfn, arg, supp, distname + ' cdf_ppf')
    check_pmf_cdf(distfn, arg, distname)
    check_oth(distfn, arg, supp, distname + ' oth')
    check_edge_support(distfn, arg)

    alpha = 0.01
    check_discrete_chisquare(distfn, arg, rvs, alpha,
                             distname + ' chisquare')

    if not first_case:
        return

    locscale_defaults = (0,)
    meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
             distfn.logsf]
    # make sure arguments are within support
    spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, }
    k = spec_k.get(distname, 1)
    check_named_args(distfn, k, arg, locscale_defaults, meths)
    if distname != 'sample distribution':
        check_scale_docstring(distfn)
    check_random_state_property(distfn, arg)
    check_pickling(distfn, arg)

    # Entropy
    check_entropy(distfn, arg, distname)
    overrides_entropy = (distfn.__class__._entropy !=
                         stats.rv_discrete._entropy)
    if overrides_entropy:
        check_private_entropy(distfn, arg, stats.rv_discrete)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('distname,arg', distdiscrete)
def test_moments(distname, arg):
    """Check moments of one discrete distribution against each other."""
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # `distname` is not a string: use it as the distribution itself.
        distfn, distname = distname, 'sample distribution'

    mean, var, skew, kurt = distfn.stats(*arg, moments='mvsk')
    check_normalization(distfn, arg, distname)

    # compare `stats` and `moment` methods
    check_moment(distfn, arg, mean, var, distname)
    check_mean_expect(distfn, arg, mean, distname)
    check_var_expect(distfn, arg, mean, var, distname)
    check_skew_expect(distfn, arg, mean, var, skew, distname)
    if distname not in ('zipf', 'yulesimon'):
        check_kurt_expect(distfn, arg, mean, var, kurt, distname)

    # frozen distr moments: first and second raw moments
    check_moment_frozen(distfn, arg, mean, 1)
    second_raw_moment = var+mean*mean
    check_moment_frozen(distfn, arg, second_raw_moment, 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dist,shape_args', distdiscrete)
def test_rvs_broadcast(dist, shape_args):
    """Check that ``rvs`` broadcasts the shape parameters and ``loc``."""
    # If shape_only is True, it means the _rvs method of the
    # distribution uses more than one random number to generate a random
    # variate.  That means the result of using rvs with broadcasting or
    # with a nontrivial size will not necessarily be the same as using the
    # numpy.vectorize'd version of rvs(), so we can only compare the shapes
    # of the results, not the values.
    # Whether or not a distribution is in the following list is an
    # implementation detail of the distribution, not a requirement.  If
    # the implementation of the rvs() method of a distribution changes,
    # this test might also have to be changed.
    shape_only = dist in ('skellam', 'yulesimon')

    try:
        distfunc = getattr(stats, dist)
    except TypeError:
        # `dist` is a distribution instance; build a readable label for it.
        distfunc = dist
        dist = 'rv_discrete(values=(%r, %r))' % (dist.xk, dist.pk)

    loc = np.zeros(2)
    nargs = distfunc.numargs

    # Shape parameter k gets shape (k + 3, 1, ..., 1) with k + 1 trailing
    # ones, so every parameter occupies its own leading axis.
    lead_dims = [k + 3 for k in range(nargs)]
    allargs = [
        shape_args[k]*np.ones((lead_dims[k],) + (1,)*(k + 1),
                              dtype=np.array(shape_args[k]).dtype)
        for k in range(nargs)
    ]
    allargs.append(loc)

    # bshape holds the expected shape when loc and the shape parameters
    # are all broadcast together.
    bshape = lead_dims[::-1] + [loc.size]
    check_rvs_broadcast(distfunc, dist, allargs, bshape, shape_only, [np.int_])
|
||||
|
||||
|
||||
def check_cdf_ppf(distfn, arg, supp, msg):
    """Assert the ppf/cdf round trip on the sampled support `supp`."""
    # cdf is a step function, and ppf(q) = min{k : cdf(k) >= q, k integer}
    for offset in (0, 1e-8):
        # -1e-8 could cause an error if pmf < 1e-8
        q = distfn.cdf(supp, *arg)
        if offset:
            q = q - offset
        npt.assert_array_equal(distfn.ppf(q, *arg), supp, msg + '-roundtrip')

    if hasattr(distfn, 'xk'):
        return

    # Just above cdf(k), ppf must move on to the next support point.
    below_upper = supp[supp < distfn.b]
    next_points = distfn.ppf(distfn.cdf(below_upper, *arg) + 1e-8, *arg)
    npt.assert_array_equal(next_points, below_upper + distfn.inc,
                           msg + ' ppf-cdf-next')
|
||||
|
||||
|
||||
def check_pmf_cdf(distfn, arg, distname):
    """Assert that cdf increments equal the cumulative sum of the pmf."""
    if hasattr(distfn, 'xk'):
        index = distfn.xk
    else:
        # ten consecutive integers starting just below the 1% quantile
        first = int(distfn.ppf(0.01, *arg) - 1)
        index = list(range(first, first + 10))

    cdfs = distfn.cdf(index, *arg)
    pmfs_cum = distfn.pmf(index, *arg).cumsum()

    # skellam gets a looser tolerance (ncx2 accuracy)
    tolerance = 1e-5 if distname == 'skellam' else 1e-10
    npt.assert_allclose(cdfs - cdfs[0], pmfs_cum - pmfs_cum[0],
                        atol=tolerance, rtol=tolerance)
|
||||
|
||||
|
||||
def check_moment_frozen(distfn, arg, m, k):
    """Assert that the frozen distribution's moment of order `k` is `m`."""
    frozen = distfn(*arg)
    npt.assert_allclose(frozen.moment(k), m, atol=1e-10, rtol=1e-10)
|
||||
|
||||
|
||||
def check_oth(distfn, arg, supp, msg):
    """Check the sf and isf methods against cdf and ppf.

    `msg` is accepted but not used here.
    """
    tolerances = dict(atol=1e-10, rtol=1e-10)

    # sf must be the complement of cdf on the support
    survival = distfn.sf(supp, *arg)
    npt.assert_allclose(survival, 1. - distfn.cdf(supp, *arg), **tolerances)

    # isf(q) must agree with ppf(1 - q)
    q = np.linspace(0.01, 0.99, 20)
    npt.assert_allclose(distfn.isf(q, *arg), distfn.ppf(1. - q, *arg),
                        **tolerances)

    # the median obtained from isf must be bracketed correctly
    median_sf = distfn.isf(0.5, *arg)
    npt.assert_(distfn.sf(median_sf - 1, *arg) > 0.5)
    npt.assert_(distfn.cdf(median_sf + 1, *arg) > 0.5)
|
||||
|
||||
|
||||
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : distribution object
        discrete distribution; its ``a``, ``b`` and ``cdf`` are used
    arg : sequence
        parameters of distribution
    rvs : array_like
        random sample to test
    alpha : float
        significance level, threshold for p-value
    msg : string
        label included in the assertion message

    Raises
    ------
    AssertionError
        (via ``npt.assert_``) if the p-value does not exceed `alpha`

    """
    min_mass = 0.05

    # construct intervals with minimum mass `min_mass`.
    # intervals are left-half-open as in a cdf difference
    first = int(max(distfn.a, -1000))
    candidates = xrange(first, int(min(distfn.b, 1000)) + 1)
    cum_prev = 0
    edges = [first]
    masses = []
    for point in candidates:
        cum_now = distfn.cdf(point, *arg)
        if cum_now - cum_prev >= min_mass - 1e-14:
            edges.append(point)
            masses.append(cum_now - cum_prev)
            cum_prev = cum_now
            if cum_now > (1 - min_mass):
                break
    if edges[-1] < distfn.b:
        # put whatever mass is left into one final interval
        edges.append(distfn.b)
        masses.append(1 - cum_prev)
    edges = np.array(edges)
    masses = np.array(masses)

    # convert intervals to right-half-open as required by histogram
    bin_edges = edges + 1e-8
    bin_edges[0] = distfn.a

    # find sample frequencies and perform chisquare test
    observed, _ = np.histogram(rvs, bin_edges)
    expected = len(rvs)*masses
    _, pval = stats.chisquare(np.array(observed), expected)

    npt.assert_(pval > alpha,
                'chisquare - test for %s at arg = %s with pval = %s' %
                (msg, str(arg), str(pval)))
|
||||
|
||||
|
||||
def check_scale_docstring(distfn):
    """Assert that the docstring of `distfn` does not mention 'scale'."""
    doc = distfn.__doc__
    if doc is None:
        # Docstrings can be stripped if interpreter is run with -OO
        return
    npt.assert_('scale' not in doc)
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from scipy.stats import hypergeom, bernoulli, boltzmann
|
||||
import numpy as np
|
||||
from numpy.testing import assert_almost_equal, assert_equal, assert_allclose
|
||||
|
||||
|
||||
def test_hypergeom_logpmf():
    """Check hypergeom.logpmf symmetries and its Bernoulli special case."""
    # symmetries test
    # f(k,N,K,n) = f(n-k,N,N-K,n) = f(K-k,N,K,N-n) = f(k,N,n,K)
    k, N, K, n = 5, 50, 10, 5
    reference = hypergeom.logpmf(k, N, K, n)
    equivalent_args = [
        (n - k, N, N - K, n),
        (K - k, N, K, N - n),
        (k, N, n, K),
    ]
    for args in equivalent_args:
        assert_almost_equal(reference, hypergeom.logpmf(*args), decimal=12)

    # test related distribution
    # Bernoulli distribution if n = 1
    k, N, K, n = 1, 10, 7, 1
    from_hypergeom = hypergeom.logpmf(k, N, K, n)
    from_bernoulli = bernoulli.logpmf(k, K/N)
    assert_almost_equal(from_hypergeom, from_bernoulli, decimal=12)
|
||||
|
||||
|
||||
def test_boltzmann_upper_bound():
    """Check boltzmann pmf/cdf on and outside the support for small N."""
    k = np.arange(-3, 5)

    # N = 1: all the mass sits at k == 0
    single_state = boltzmann.pmf(k, 0.123, 1)
    assert_equal(single_state, k == 0)

    # N = 3 with lambda = log(2)
    lam = np.log(2)
    N = 3
    expected_pmf = [0, 0, 0, 4/7, 2/7, 1/7, 0, 0]
    assert_allclose(boltzmann.pmf(k, lam, N), expected_pmf, rtol=1e-13)

    expected_cdf = [0, 0, 0, 4/7, 6/7, 1, 1, 1]
    assert_allclose(boltzmann.cdf(k, lam, N), expected_cdf, rtol=1e-13)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,122 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy._lib._numpy_compat import suppress_warnings
|
||||
import pytest
|
||||
from scipy import stats
|
||||
|
||||
from .test_continuous_basic import distcont
|
||||
|
||||
# NOTE: this is not a proper statistical test for convergence; it only
# verifies that the estimated and true values don't differ by too much.

fit_sizes = [1000, 5000]  # sample sizes to try, in this order

thresh_percent = 0.25  # fail cut-off as a fraction of the true parameter
thresh_min = 0.75  # minimum difference estimate - true to fail test

# Distributions whose fit is known not to pass.
failing_fits = [
    'burr', 'chi2', 'gausshyper', 'genexpon', 'gengamma', 'kappa4',
    'ksone', 'mielke', 'ncf', 'ncx2', 'pearson3', 'powerlognorm',
    'truncexpon', 'tukeylambda', 'vonmises', 'wrapcauchy',
    'levy_stable', 'trapz',
]

# Don't run the fit test on these:
skip_fit = [
    'erlang',  # Subclass of gamma, generates a warning.
]
|
||||
|
||||
|
||||
def cases_test_cont_fit():
    """Yield ``(distname, arg)`` for every distribution not in `skip_fit`.

    These cases test the closeness of the estimated parameters to the
    true parameters with the fit method of continuous distributions.
    Note: is slow, some distributions don't converge with sample
    size <= 10000.
    """
    for distname, arg in distcont:
        if distname in skip_fit:
            continue
        yield distname, arg
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.parametrize('distname,arg', cases_test_cont_fit())
def test_cont_fit(distname, arg):
    """Fit a distribution to its own samples and compare the parameters.

    Each size in `fit_sizes` is tried in turn; the test passes as soon as
    one fit is within the thresholds and fails if none is.
    """
    if distname in failing_fits:
        # Skip failing fits unless overridden
        try:
            run_anyway = bool(int(os.environ['SCIPY_XFAIL']))
        except Exception:
            run_anyway = False
        if not run_anyway:
            pytest.xfail(
                "Fitting %s doesn't work reliably yet" % distname +
                " [Set environment variable SCIPY_XFAIL=1 to run this test nevertheless.]")

    distfn = getattr(stats, distname)

    # True parameters: the shape arguments followed by loc=0 and scale=1.
    truearg = np.hstack([arg, [0.0, 1.0]])
    relative_tol = truearg*thresh_percent
    absolute_tol = np.ones(distfn.numargs+2)*thresh_min
    diffthreshold = np.max(np.vstack([relative_tol, absolute_tol]), 0)

    for fit_size in fit_sizes:
        # Note that if a fit succeeds, the other fit_sizes are skipped
        np.random.seed(1234)

        with np.errstate(all='ignore'), suppress_warnings() as sup:
            sup.filter(category=DeprecationWarning, message=".*frechet_")
            rvs = distfn.rvs(size=fit_size, *arg)
            est = distfn.fit(rvs)  # start with default values

        diff = est - truearg

        # threshold for location
        diffthreshold[-2] = np.max([np.abs(rvs.mean())*thresh_percent,
                                    thresh_min])

        if np.any(np.isnan(est)):
            raise AssertionError('nan returned in fit')
        if np.all(np.abs(diff) <= diffthreshold):
            break
    else:
        # No sample size produced an acceptable fit.
        txt = ''.join(['parameter: %s\n' % str(truearg),
                       'estimated: %s\n' % str(est),
                       'diff : %s\n' % str(diff)])
        raise AssertionError('fit not very good in %s\n' % distfn.name + txt)
|
||||
|
||||
|
||||
def _check_loc_scale_mle_fit(name, data, desired, atol=None):
    """Assert that fitting `data` recovers the `desired` (loc, scale).

    Parameters
    ----------
    name : str
        Name of the distribution in `stats`.
    data : array_like
        Sample passed to the distribution's ``fit`` method.
    desired : sequence
        Expected values of the last two fitted parameters.
    atol : float, optional
        Absolute tolerance for the comparison.  ``None`` (the default)
        means no absolute tolerance.

    Raises
    ------
    AssertionError
        If the fitted (loc, scale) differ from `desired`.
    """
    d = getattr(stats, name)
    actual = d.fit(data)[-2:]
    # assert_allclose cannot handle atol=None, so the advertised default
    # would fail inside the comparison; use its own default of 0 instead.
    if atol is None:
        atol = 0
    assert_allclose(actual, desired, atol=atol,
                    err_msg='poor mle fit of (loc, scale) in %s' % name)
|
||||
|
||||
|
||||
def test_non_default_loc_scale_mle_fit():
    """Check the fitted (loc, scale) of `uniform` and `expon` on one sample."""
    data = np.array([1.01, 1.78, 1.78, 1.78, 1.88, 1.88, 1.88, 2.00])
    expectations = (
        ('uniform', [1.01, 0.99]),
        ('expon', [1.01, 0.73875]),
    )
    for name, desired in expectations:
        _check_loc_scale_mle_fit(name, data, desired, 1e-3)
|
||||
|
||||
|
||||
def test_expon_fit():
    """gh-6167"""
    sample = [0, 0, 0, 0, 2, 2, 2, 2]
    fitted = stats.expon.fit(sample, floc=0)
    # loc is fixed at 0; the expected scale is 1.0
    assert_allclose(fitted, [0, 1.0], atol=1e-3)
|
||||
|
||||
@@ -1,368 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
from scipy import stats
|
||||
import numpy as np
|
||||
from numpy.testing import (assert_almost_equal, assert_,
|
||||
assert_array_almost_equal, assert_array_almost_equal_nulp)
|
||||
import pytest
|
||||
from pytest import raises as assert_raises
|
||||
|
||||
|
||||
def test_kde_1d():
    """Some basic tests comparing a 1-D KDE to the normal distribution."""
    np.random.seed(8765678)
    sample = np.random.randn(500)
    mu = sample.mean()
    sigma = sample.std(ddof=1)

    # get kde for original sample
    kde = stats.gaussian_kde(sample)

    # evaluate the density function for the kde for some points
    grid = np.linspace(-7, 7, 501)
    step = grid[1] - grid[0]
    kde_density = kde.evaluate(grid)
    normal_density = stats.norm.pdf(grid, loc=mu, scale=sigma)

    squared_error = np.sum((kde_density - normal_density)**2)*step
    assert_(squared_error < 0.01)

    # about half of the mass lies on each side of the sample mean
    upper_half = kde.integrate_box_1d(mu, np.inf)
    lower_half = kde.integrate_box_1d(-np.inf, mu)
    for half in (upper_half, lower_half):
        assert_almost_equal(half, 0.5, decimal=1)
    assert_almost_equal(kde.integrate_box(mu, np.inf), upper_half, decimal=13)
    assert_almost_equal(kde.integrate_box(-np.inf, mu), lower_half, decimal=13)

    assert_almost_equal(kde.integrate_kde(kde),
                        (kde_density**2).sum()*step, decimal=2)
    assert_almost_equal(kde.integrate_gaussian(mu, sigma**2),
                        (kde_density*normal_density).sum()*step, decimal=2)
|
||||
|
||||
|
||||
def test_kde_1d_weighted():
    """Some basic tests comparing a weighted 1-D KDE to the normal pdf."""
    np.random.seed(8765678)
    n_points = 500
    sample = np.random.randn(n_points)
    weights = np.random.rand(n_points)
    mu = np.average(sample, weights=weights)
    sigma = np.sqrt(np.average((sample-mu)**2, weights=weights))

    # get kde for original sample
    kde = stats.gaussian_kde(sample, weights=weights)

    # evaluate the density function for the kde for some points
    grid = np.linspace(-7, 7, 501)
    step = grid[1] - grid[0]
    kde_density = kde.evaluate(grid)
    normal_density = stats.norm.pdf(grid, loc=mu, scale=sigma)

    squared_error = np.sum((kde_density - normal_density)**2)*step
    assert_(squared_error < 0.01)

    # about half of the mass lies on each side of the weighted mean
    upper_half = kde.integrate_box_1d(mu, np.inf)
    lower_half = kde.integrate_box_1d(-np.inf, mu)
    for half in (upper_half, lower_half):
        assert_almost_equal(half, 0.5, decimal=1)
    assert_almost_equal(kde.integrate_box(mu, np.inf), upper_half, decimal=13)
    assert_almost_equal(kde.integrate_box(-np.inf, mu), lower_half, decimal=13)

    assert_almost_equal(kde.integrate_kde(kde),
                        (kde_density**2).sum()*step, decimal=2)
    assert_almost_equal(kde.integrate_gaussian(mu, sigma**2),
                        (kde_density*normal_density).sum()*step, decimal=2)
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_kde_2d():
    """Some basic tests comparing a 2-D KDE to the bivariate normal pdf."""
    np.random.seed(8765678)
    n_points = 500

    mean = np.array([1.0, 3.0])
    covariance = np.array([[1.0, 2.0], [2.0, 6.0]])

    # Need transpose (shape (2, 500)) for kde
    sample = np.random.multivariate_normal(mean, covariance,
                                           size=n_points).T

    # get kde for original sample
    kde = stats.gaussian_kde(sample)

    # evaluate the density function for the kde for some points
    x, y = np.mgrid[-7:7:500j, -7:7:500j]
    flat_y = y.ravel()
    grid_coords = np.vstack([x.ravel(), flat_y])
    kde_density = kde.evaluate(grid_coords).reshape(500, 500)

    normal_density = stats.multivariate_normal.pdf(np.dstack([x, y]),
                                                   mean=mean, cov=covariance)
    step = flat_y[1] - flat_y[0]

    assert_(np.sum((kde_density - normal_density)**2) * (step**2) < 0.01)

    # integrate over the half-planes above and below mean[1]
    small = -1e100
    large = 1e100
    upper_half = kde.integrate_box([small, mean[1]], [large, large])
    lower_half = kde.integrate_box([small, small], [large, mean[1]])

    for half in (upper_half, lower_half):
        assert_almost_equal(half, 0.5, decimal=1)
    assert_almost_equal(kde.integrate_kde(kde),
                        (kde_density**2).sum()*(step**2), decimal=2)
    assert_almost_equal(kde.integrate_gaussian(mean, covariance),
                        (kde_density*normal_density).sum()*(step**2),
                        decimal=2)
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_kde_2d_weighted():
    """Some basic tests comparing a weighted 2-D KDE to the normal pdf."""
    np.random.seed(8765678)
    n_points = 500

    mean = np.array([1.0, 3.0])
    covariance = np.array([[1.0, 2.0], [2.0, 6.0]])

    # Need transpose (shape (2, 500)) for kde
    sample = np.random.multivariate_normal(mean, covariance,
                                           size=n_points).T
    weights = np.random.rand(n_points)

    # get kde for original sample
    kde = stats.gaussian_kde(sample, weights=weights)

    # evaluate the density function for the kde for some points
    x, y = np.mgrid[-7:7:500j, -7:7:500j]
    flat_y = y.ravel()
    grid_coords = np.vstack([x.ravel(), flat_y])
    kde_density = kde.evaluate(grid_coords).reshape(500, 500)

    normal_density = stats.multivariate_normal.pdf(np.dstack([x, y]),
                                                   mean=mean, cov=covariance)
    step = flat_y[1] - flat_y[0]

    assert_(np.sum((kde_density - normal_density)**2) * (step**2) < 0.01)

    # integrate over the half-planes above and below mean[1]
    small = -1e100
    large = 1e100
    upper_half = kde.integrate_box([small, mean[1]], [large, large])
    lower_half = kde.integrate_box([small, small], [large, mean[1]])

    for half in (upper_half, lower_half):
        assert_almost_equal(half, 0.5, decimal=1)
    assert_almost_equal(kde.integrate_kde(kde),
                        (kde_density**2).sum()*(step**2), decimal=2)
    assert_almost_equal(kde.integrate_gaussian(mean, covariance),
                        (kde_density*normal_density).sum()*(step**2),
                        decimal=2)
|
||||
|
||||
|
||||
def test_kde_bandwidth_method():
    """Check that a callable and a scalar `bw_method` match the default."""
    def scotts_factor(kde_obj):
        """Same as default, just check that it works."""
        return np.power(kde_obj.n, -1./(kde_obj.d+4))

    np.random.seed(8765678)
    sample = np.random.randn(50)

    # Default
    default_kde = stats.gaussian_kde(sample)
    # Supply a callable
    callable_kde = stats.gaussian_kde(sample, bw_method=scotts_factor)
    # Supply a scalar
    scalar_kde = stats.gaussian_kde(sample, bw_method=default_kde.factor)

    grid = np.linspace(-7, 7, 51)
    reference = default_kde.evaluate(grid)
    for alternative in (callable_kde, scalar_kde):
        assert_almost_equal(reference, alternative.evaluate(grid))

    assert_raises(ValueError, stats.gaussian_kde, sample,
                  bw_method='wrongstring')
|
||||
|
||||
|
||||
def test_kde_bandwidth_method_weighted():
    """Like test_kde_bandwidth_method, with a callable based on ``neff``."""
    def scotts_factor(kde_obj):
        """Same as default, just check that it works."""
        return np.power(kde_obj.neff, -1./(kde_obj.d+4))

    np.random.seed(8765678)
    sample = np.random.randn(50)

    # Default
    default_kde = stats.gaussian_kde(sample)
    # Supply a callable
    callable_kde = stats.gaussian_kde(sample, bw_method=scotts_factor)
    # Supply a scalar
    scalar_kde = stats.gaussian_kde(sample, bw_method=default_kde.factor)

    grid = np.linspace(-7, 7, 51)
    reference = default_kde.evaluate(grid)
    for alternative in (callable_kde, scalar_kde):
        assert_almost_equal(reference, alternative.evaluate(grid))

    assert_raises(ValueError, stats.gaussian_kde, sample,
                  bw_method='wrongstring')
|
||||
|
||||
|
||||
# Subclasses that should stay working (extracted from various sources).
|
||||
# Unfortunately the earlier design of gaussian_kde made it necessary for users
|
||||
# to create these kinds of subclasses, or call _compute_covariance() directly.
|
||||
|
||||
class _kde_subclass1(stats.gaussian_kde):
    """Subclass that sets up its state itself, bypassing the base __init__."""

    def __init__(self, dataset):
        data = np.atleast_2d(dataset)
        self.dataset = data
        self.d, self.n = data.shape
        self.covariance_factor = self.scotts_factor
        self._compute_covariance()
|
||||
|
||||
|
||||
class _kde_subclass2(stats.gaussian_kde):
    """Subclass that sets `covariance_factor` before the base __init__."""

    def __init__(self, dataset):
        # Must be assigned before the base initializer runs.
        self.covariance_factor = self.scotts_factor
        super(_kde_subclass2, self).__init__(dataset)
|
||||
|
||||
|
||||
class _kde_subclass3(stats.gaussian_kde):
|
||||
def __init__(self, dataset, covariance):
|
||||
self.covariance = covariance
|
||||
stats.gaussian_kde.__init__(self, dataset)
|
||||
|
||||
def _compute_covariance(self):
|
||||
self.inv_cov = np.linalg.inv(self.covariance)
|
||||
self._norm_factor = np.sqrt(np.linalg.det(2*np.pi * self.covariance)) \
|
||||
* self.n
|
||||
|
||||
|
||||
class _kde_subclass4(stats.gaussian_kde):
    """Subclass that overrides `covariance_factor` as a method."""

    def covariance_factor(self):
        """Return half of the Silverman factor."""
        silverman = self.silverman_factor()
        return 0.5 * silverman
|
||||
|
||||
|
||||
def test_gaussian_kde_subclassing():
    """Check that the legacy subclassing patterns still evaluate correctly."""
    x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
    xs = np.linspace(-10, 10, num=50)

    # gaussian_kde itself
    kde = stats.gaussian_kde(x1)
    ys = kde(xs)

    # subclasses 1 to 3 must reproduce the plain gaussian_kde result
    factories = (
        lambda: _kde_subclass1(x1),
        lambda: _kde_subclass2(x1),
        lambda: _kde_subclass3(x1, kde.covariance),
    )
    for build in factories:
        assert_array_almost_equal_nulp(ys, build()(xs), nulp=10)

    # subclass 4
    y4 = _kde_subclass4(x1)(x1)
    y_expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017]
    assert_array_almost_equal(y_expected, y4, decimal=6)

    # Not a subclass, but check for use of _compute_covariance()
    kde5 = kde
    kde5.covariance_factor = lambda: kde.factor
    kde5._compute_covariance()
    assert_array_almost_equal_nulp(ys, kde5(xs), nulp=10)
|
||||
|
||||
|
||||
def test_gaussian_kde_covariance_caching():
    """Check that resetting the bandwidth to 'scott' restores the default."""
    sample = np.array([-7, -5, 1, 4, 5], dtype=float)
    grid = np.linspace(-10, 10, num=5)
    # These expected values are from scipy 0.10, before some changes to
    # gaussian_kde. They were not compared with any external reference.
    y_expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475]

    # Set the bandwidth, then reset it to the default.
    kde = stats.gaussian_kde(sample)
    for bandwidth in (0.5, 'scott'):
        kde.set_bandwidth(bw_method=bandwidth)

    assert_array_almost_equal(y_expected, kde(grid), decimal=7)
|
||||
|
||||
|
||||
def test_gaussian_kde_monkeypatch():
    """Ugly, but people may rely on this.  See scipy pull request 123,
    specifically the linked ML thread "Width of the Gaussian in stats.kde".
    If it is necessary to break this later on, that is to be discussed on ML.
    """
    sample = np.array([-7, -5, 1, 4, 5], dtype=float)
    grid = np.linspace(-10, 10, num=50)

    # The old monkeypatched version to get at Silverman's Rule.
    patched = stats.gaussian_kde(sample)
    patched.covariance_factor = patched.silverman_factor
    patched._compute_covariance()
    patched_density = patched(grid)

    # The new saner version.
    silverman = stats.gaussian_kde(sample, bw_method='silverman')
    silverman_density = silverman(grid)

    assert_array_almost_equal_nulp(patched_density, silverman_density, nulp=10)
|
||||
|
||||
|
||||
def test_kde_integer_input():
    """Regression test for #1181."""
    points = np.arange(5)
    density = stats.gaussian_kde(points)(points)
    y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
    assert_array_almost_equal(density, y_expected, decimal=6)
|
||||
|
||||
|
||||
def test_pdf_logpdf():
    """Check that `pdf` matches `evaluate` and `logpdf` matches log(pdf)."""
    np.random.seed(1)
    sample = np.random.randn(50)
    grid = np.linspace(-15, 12, 25)

    # Default
    kde = stats.gaussian_kde(sample)
    density = kde.evaluate(grid)
    assert_almost_equal(density, kde.pdf(grid), decimal=12)
    assert_almost_equal(np.log(density), kde.logpdf(grid), decimal=12)

    # There are more points than data
    swapped = stats.gaussian_kde(grid)
    log_density = np.log(swapped.evaluate(sample))
    assert_almost_equal(log_density, swapped.logpdf(sample), decimal=12)
|
||||
|
||||
|
||||
def test_pdf_logpdf_weighted():
    """Weighted KDE: `pdf` matches `evaluate`, `logpdf` matches its log.

    Both halves of the test build the estimator with explicit weights, so
    the weighted code path is exercised when there are fewer evaluation
    points than data and when there are more.
    """
    np.random.seed(1)
    n_basesample = 50
    xn = np.random.randn(n_basesample)
    wn = np.random.rand(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn, weights=wn)

    xs = np.linspace(-15, 12, 25)
    pdf = gkde.evaluate(xs)
    pdf2 = gkde.pdf(xs)
    assert_almost_equal(pdf, pdf2, decimal=12)

    logpdf = np.log(pdf)
    logpdf2 = gkde.logpdf(xs)
    assert_almost_equal(logpdf, logpdf2, decimal=12)

    # There are more points than data.  The estimator must be weighted here
    # as well; without weights this half would only repeat the unweighted
    # test.
    gkde = stats.gaussian_kde(xs, weights=np.random.rand(len(xs)))
    pdf = np.log(gkde.evaluate(xn))
    pdf2 = gkde.logpdf(xn)
    assert_almost_equal(pdf, pdf2, decimal=12)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,136 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
import numpy.ma as ma
|
||||
import scipy.stats.mstats as ms
|
||||
|
||||
from numpy.testing import (assert_equal, assert_almost_equal, assert_,
|
||||
assert_allclose)
|
||||
|
||||
|
||||
def test_compare_medians_ms():
    """Check `compare_medians_ms` against two reference values."""
    base = np.arange(7)
    scenarios = [
        # (second sample, expected result)
        (base + 10, 0),
        (np.linspace(0, 1, num=10), 0.017116406778),
    ]
    for other, reference in scenarios:
        assert_almost_equal(ms.compare_medians_ms(base, other), reference)
|
||||
|
||||
|
||||
def test_hdmedian():
    """Check `hdmedian` on 1-D and 2-D masked input, with and without `var`."""
    # 1-D input: first fully unmasked, then with only the leading seven
    # entries left unmasked.
    vec = ma.arange(11)
    assert_allclose(ms.hdmedian(vec), 5, rtol=1e-14)
    vec.mask = ma.make_mask(vec)
    vec.mask[:7] = False
    assert_allclose(ms.hdmedian(vec), 3, rtol=1e-14)

    # The `var` keyword must add a second value to the result.
    # TODO: check whether the returned value is actually correct.
    with_variance = ms.hdmedian(vec, var=True)
    assert_(with_variance.size == 2)

    # 2-D input, reduced along the first axis, same masking scheme.
    mat = ma.arange(22).reshape((11, 2))
    assert_allclose(ms.hdmedian(mat, axis=0), [10, 11])
    mat.mask = ma.make_mask(mat)
    mat.mask[:7, :] = False
    assert_allclose(ms.hdmedian(mat, axis=0), [6, 7])
|
||||
|
||||
|
||||
def test_rsh():
    """Smoke test for `rsh`: only the shape/size of the output is checked."""
    np.random.seed(132345)
    sample = np.random.randn(100)

    # Without `points` the result has the same shape as the input.
    # TODO: check that implementation is correct.
    estimate = ms.rsh(sample)
    assert_(estimate.shape == sample.shape)

    # With `points` there is one value per requested point.
    at_points = ms.rsh(sample, points=[0, 1.])
    assert_(at_points.size == 2)
|
||||
|
||||
|
||||
def test_mjci():
    """Check the Maritz-Jarrett estimator `mjci` against reference values."""
    observations = ma.array([
        77, 87, 88, 114, 151, 210, 219, 246, 253, 262,
        296, 299, 306, 376, 428, 515, 666, 1310, 2611,
    ])
    reference = [55.76819, 45.84028, 198.87875]
    assert_almost_equal(ms.mjci(observations), reference, 5)
|
||||
|
||||
|
||||
def test_trimmed_mean_ci():
    """Check the trimmed mean and its confidence interval on a fixed sample."""
    observations = ma.array([
        545, 555, 558, 572, 575, 576, 578, 580,
        594, 605, 635, 651, 653, 661, 666,
    ])

    trimmed = ms.trimmed_mean(observations, 0.2)
    assert_almost_equal(trimmed, 596.2, 1)

    interval = ms.trimmed_mean_ci(observations, (0.2, 0.2))
    assert_equal(np.round(interval, 1), [561.8, 630.6])
|
||||
|
||||
|
||||
def test_idealfourths():
    """Check `idealfourths` on 1-D, 2-D (both axes) and degenerate input."""
    lower, upper = 24.416667, 74.583333
    ramp = np.arange(100)

    # 1-D input.
    assert_almost_equal(np.asarray(ms.idealfourths(ramp)), [lower, upper], 6)

    # 2-D input made of three identical columns.
    stacked = ramp.repeat(3).reshape(-1, 3)
    assert_almost_equal(ms.idealfourths(stacked, axis=0),
                        [[lower] * 3, [upper] * 3], 6)
    # Along axis 1 every row holds one repeated value.
    assert_almost_equal(ms.idealfourths(stacked, axis=1),
                        ramp.repeat(2).reshape(-1, 2))

    # Two-element input is expected to yield only NaN.
    degenerate = ms.idealfourths([0, 0])
    assert_(np.isnan(degenerate).all())
|
||||
|
||||
|
||||
class TestQuantiles(object):
    """Tests for the Harrell-Davis / Maritz-Jarrett quantile helpers."""

    # Fixed sample of 100 values shared by all tests in this class.
    data = [
        0.706560797, 0.727229578, 0.990399276, 0.927065621, 0.158953014,
        0.887764025, 0.239407086, 0.349638551, 0.972791145, 0.149789972,
        0.936947700, 0.132359948, 0.046041972, 0.641675031, 0.945530547,
        0.224218684, 0.771450991, 0.820257774, 0.336458052, 0.589113496,
        0.509736129, 0.696838829, 0.491323573, 0.622767425, 0.775189248,
        0.641461450, 0.118455200, 0.773029450, 0.319280007, 0.752229111,
        0.047841438, 0.466295911, 0.583850781, 0.840581845, 0.550086491,
        0.466470062, 0.504765074, 0.226855960, 0.362641207, 0.891620942,
        0.127898691, 0.490094097, 0.044882048, 0.041441695, 0.317976349,
        0.504135618, 0.567353033, 0.434617473, 0.636243375, 0.231803616,
        0.230154113, 0.160011327, 0.819464108, 0.854706985, 0.438809221,
        0.487427267, 0.786907310, 0.408367937, 0.405534192, 0.250444460,
        0.995309248, 0.144389588, 0.739947527, 0.953543606, 0.680051621,
        0.388382017, 0.863530727, 0.006514031, 0.118007779, 0.924024803,
        0.384236354, 0.893687694, 0.626534881, 0.473051932, 0.750134705,
        0.241843555, 0.432947602, 0.689538104, 0.136934797, 0.150206859,
        0.474335206, 0.907775349, 0.525869295, 0.189184225, 0.854284286,
        0.831089744, 0.251637345, 0.587038213, 0.254475554, 0.237781276,
        0.827928620, 0.480283781, 0.594514455, 0.213641488, 0.024194386,
        0.536668589, 0.699497811, 0.892804071, 0.093835427, 0.731107772,
    ]

    def test_hdquantiles(self):
        """Compare `hdquantiles` with reference values and across axes."""
        sample = self.data
        quartiles = [0.25, 0.5, 0.75]

        # Flat input: extremes, quartiles and their standard errors.
        assert_almost_equal(ms.hdquantiles(sample, [0., 1.]),
                            [0.006514031, 0.995309248])
        assert_almost_equal(ms.hdquantiles(sample, quartiles),
                            [0.253210762, 0.512847491, 0.762232442])
        assert_almost_equal(ms.hdquantiles_sd(sample, quartiles),
                            [0.03786954, 0.03805389, 0.03800152], 4)

        # 10x10 input: the axis=0 result must agree column by column with
        # the 1-D computation, both without and with the variance.
        grid = np.array(sample).reshape(10, 10)
        by_column = ms.hdquantiles(grid, quartiles, axis=0)
        for col in (0, -1):
            assert_almost_equal(by_column[:, col],
                                ms.hdquantiles(grid[:, col], quartiles))

        by_column_var = ms.hdquantiles(grid, quartiles, axis=0, var=True)
        for col in (0, -1):
            assert_almost_equal(
                by_column_var[..., col],
                ms.hdquantiles(grid[:, col], quartiles, var=True))

    def test_hdquantiles_sd(self):
        """Smoke test: the default call returns three values."""
        # Only test that code runs, implementation not checked for correctness
        spread = ms.hdquantiles_sd(self.data)
        assert_(spread.size == 3)

    def test_mquantiles_cimj(self):
        """Smoke test: both confidence bounds hold three values."""
        # Only test that code runs, implementation not checked for correctness
        lower, upper = ms.mquantiles_cimj(self.data)
        assert_(lower.size == upper.size == 3)
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,218 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_equal, assert_array_equal
|
||||
|
||||
from scipy.stats import rankdata, tiecorrect
|
||||
|
||||
|
||||
class TestTieCorrect(object):
    """Tests for `scipy.stats.tiecorrect`."""

    def test_empty(self):
        """An empty array requires no correction, should return 1.0."""
        no_ranks = np.array([], dtype=np.float64)
        assert_equal(tiecorrect(no_ranks), 1.0)

    def test_one(self):
        """A single element requires no correction, should return 1.0."""
        single = np.array([1.0], dtype=np.float64)
        assert_equal(tiecorrect(single), 1.0)

    def test_no_correction(self):
        """Arrays with no ties require no correction."""
        for stop in (2.0, 3.0):
            assert_equal(tiecorrect(np.arange(stop)), 1.0)

    def test_basic(self):
        """Check a few basic examples of the tie correction factor."""
        scenarios = [
            # (ranks, sizes of the tied groups)
            ([1.0, 2.5, 2.5], [2.0]),                  # one tie of two
            ([1.5, 1.5, 3.0], [2.0]),                  # same, tie not at end
            ([1.0, 3.0, 3.0, 3.0], [3.0]),             # one tie of three
            ([1.5, 1.5, 4.0, 4.0, 4.0], [2.0, 3.0]),   # ties of 2 and 3
        ]
        for rank_values, tie_sizes in scenarios:
            ranks = np.array(rank_values)
            count = ranks.size
            tie_term = sum(t**3 - t for t in tie_sizes)
            reference = 1.0 - tie_term / (count**3 - count)
            assert_equal(tiecorrect(ranks), reference)

    def test_overflow(self):
        """Five ties of 2000 elements each must give the exact factor."""
        tie_len, groups = 2000, 5
        values = np.repeat(np.arange(groups), tie_len)
        total = values.size  # tie_len * groups
        factor = tiecorrect(rankdata(values))
        reference = 1.0 - groups * (tie_len**3 - tie_len) / float(total**3 - total)
        assert_equal(factor, reference)
|
||||
|
||||
|
||||
class TestRankData(object):
    """Tests for `scipy.stats.rankdata`."""

    def test_empty(self):
        """stats.rankdata([]) should return an empty array."""
        a = np.array([], dtype=int)
        r = rankdata(a)
        assert_array_equal(r, np.array([], dtype=np.float64))
        r = rankdata([])
        assert_array_equal(r, np.array([], dtype=np.float64))

    def test_one(self):
        """Check stats.rankdata with an array of length 1."""
        data = [100]
        a = np.array(data, dtype=int)
        r = rankdata(a)
        assert_array_equal(r, np.array([1.0], dtype=np.float64))
        r = rankdata(data)
        assert_array_equal(r, np.array([1.0], dtype=np.float64))

    def test_basic(self):
        """Basic tests of stats.rankdata."""
        data = [100, 10, 50]
        expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
        a = np.array(data, dtype=int)
        r = rankdata(a)
        assert_array_equal(r, expected)
        r = rankdata(data)
        assert_array_equal(r, expected)

        data = [40, 10, 30, 10, 50]
        expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
        a = np.array(data, dtype=int)
        r = rankdata(a)
        assert_array_equal(r, expected)
        r = rankdata(data)
        assert_array_equal(r, expected)

        data = [20, 20, 20, 10, 10, 10]
        expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
        a = np.array(data, dtype=int)
        r = rankdata(a)
        assert_array_equal(r, expected)
        r = rankdata(data)
        assert_array_equal(r, expected)
        # The docstring states explicitly that the argument is flattened.
        a2d = a.reshape(2, 3)
        r = rankdata(a2d)
        assert_array_equal(r, expected)

    def test_rankdata_object_string(self):
        """Compare every ranking method with a pure-Python reference.

        The inputs are random draws of strings and of mixed int/float
        objects.  A local, seeded `RandomState` is used so that the draws are
        reproducible and the global numpy RNG state is left untouched.
        """
        def min_rank(a):
            return [1 + sum(i < j for i in a) for j in a]

        def max_rank(a):
            return [sum(i <= j for i in a) for j in a]

        def ordinal_rank(a):
            # Pair each value with its position so that ties are broken by
            # order of appearance.
            return min_rank([(x, i) for i, x in enumerate(a)])

        def average_rank(a):
            return [(i + j) / 2.0 for i, j in zip(min_rank(a), max_rank(a))]

        def dense_rank(a):
            b = np.unique(a)
            return [1 + sum(i < j for i in b) for j in a]

        rankf = dict(min=min_rank, max=max_rank, ordinal=ordinal_rank,
                     average=average_rank, dense=dense_rank)

        def check_ranks(a):
            for method in 'min', 'max', 'dense', 'ordinal', 'average':
                out = rankdata(a, method=method)
                assert_array_equal(out, rankf[method](a))

        # Fixed seed: a failing draw can be reproduced exactly.
        rng = np.random.RandomState(1234)

        val = ['foo', 'bar', 'qux', 'xyz', 'abc', 'efg', 'ace', 'qwe', 'qaz']
        check_ranks(rng.choice(val, 200))
        check_ranks(rng.choice(val, 200).astype('object'))

        val = np.array([0, 1, 2, 2.718, 3, 3.141], dtype='object')
        check_ranks(rng.choice(val, 200).astype('object'))

    def test_large_int(self):
        """Integers near 2**60 must not be collapsed by a float conversion."""
        data = np.array([2**60, 2**60+1], dtype=np.uint64)
        r = rankdata(data)
        assert_array_equal(r, [1.0, 2.0])

        data = np.array([2**60, 2**60+1], dtype=np.int64)
        r = rankdata(data)
        assert_array_equal(r, [1.0, 2.0])

        data = np.array([2**60, -2**60+1], dtype=np.int64)
        r = rankdata(data)
        assert_array_equal(r, [2.0, 1.0])

    def test_big_tie(self):
        """A single tie of n elements gets the average rank (n + 1) / 2."""
        for n in [10000, 100000, 1000000]:
            data = np.ones(n, dtype=int)
            r = rankdata(data)
            expected_rank = 0.5 * (n + 1)
            assert_array_equal(r, expected_rank * data,
                               "test failed with n=%d" % n)
|
||||
|
||||
|
||||
# Table of `rankdata` scenarios consumed by `test_cases`.  Each entry is
# (values, method, expected ranks); every input is listed once per method.
_cases = (
    # values, method, expected
    # Empty input.
    ([], 'average', []),
    ([], 'min', []),
    ([], 'max', []),
    ([], 'dense', []),
    ([], 'ordinal', []),
    # Single element.
    ([100], 'average', [1.0]),
    ([100], 'min', [1.0]),
    ([100], 'max', [1.0]),
    ([100], 'dense', [1.0]),
    ([100], 'ordinal', [1.0]),
    # All elements tied.
    ([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
    ([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
    ([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
    ([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
    ([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
    # Distinct values: every method gives the same ranks.
    ([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
    ([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
    ([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
    ([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
    ([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
    # One tied pair.
    ([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
    ([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
    ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
    ([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
    ([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
    # Two tied pairs.
    ([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
    ([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
    ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
    ([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
    ([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
    # Thirty equal values ranked ordinally: ranks follow input order.
    ([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
)
|
||||
|
||||
|
||||
def test_cases():
    """Run `rankdata` over every (values, method, expected) row of `_cases`."""
    for case in _cases:
        values, method, expected = case
        assert_array_equal(rankdata(values, method=method), expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,88 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose, assert_equal
|
||||
|
||||
from scipy.stats._tukeylambda_stats import (tukeylambda_variance,
|
||||
tukeylambda_kurtosis)
|
||||
|
||||
|
||||
def test_tukeylambda_stats_known_exact():
    """Compare results with some known exact formulas."""
    # Exact values of the Tukey Lambda variance and kurtosis:
    #   lambda = 0    logistic distribution
    #   lambda = 1    uniform distribution on (-1, 1)
    #   lambda = 2    uniform distribution on (-1/2, 1/2)
    exact = [
        # (lambda, variance, kurtosis)
        (0, np.pi**2 / 3, 1.2),
        (0.5, 4 - np.pi, (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3),
        (1, 1.0 / 3, -1.2),
        (2, 1.0 / 12, -1.2),
    ]
    for lam, variance, kurtosis in exact:
        assert_allclose(tukeylambda_variance(lam), variance, atol=1e-12)
        assert_allclose(tukeylambda_kurtosis(lam), kurtosis, atol=1e-10)
|
||||
|
||||
|
||||
def test_tukeylambda_stats_mpmath():
    """Compare results with some values that were computed using mpmath."""
    # Purely absolute tolerances: 1e-12 for the variance, 1e-10 for the
    # kurtosis.
    var_tol = {'atol': 1e-12, 'rtol': 0}
    kurt_tol = {'atol': 1e-10, 'rtol': 0}
    reference = [
        # lambda        variance              kurtosis
        [-0.1, 4.78050217874253547, 3.78559520346454510],
        [-0.0649, 4.16428023599895777, 2.52019675947435718],
        [-0.05, 3.93672267890775277, 2.13129793057777277],
        [-0.001, 3.30128380390964882, 1.21452460083542988],
        [0.001, 3.27850775649572176, 1.18560634779287585],
        [0.03125, 2.95927803254615800, 0.804487555161819980],
        [0.05, 2.78281053405464501, 0.611604043886644327],
        [0.0649, 2.65282386754100551, 0.476834119532774540],
        [1.2, 0.242153920578588346, -1.23428047169049726],
        [10.0, 0.00095237579757703597, 2.37810697355144933],
        [20.0, 0.00012195121951131043, 7.37654321002709531],
    ]

    # One scalar lambda at a time.
    for row in reference:
        lam, variance, kurtosis = row
        assert_allclose(tukeylambda_variance(lam), variance, **var_tol)
        assert_allclose(tukeylambda_kurtosis(lam), kurtosis, **kurt_tol)

    # Test with vector arguments (most of the other tests are for single
    # values).
    lams, variances, kurtoses = zip(*reference)
    assert_allclose(tukeylambda_variance(lams), variances, **var_tol)
    assert_allclose(tukeylambda_kurtosis(lams), kurtoses, **kurt_tol)
|
||||
|
||||
|
||||
def test_tukeylambda_stats_invalid():
    """Test values of lambda outside the domains of the functions."""
    nan_then_inf = np.array([np.nan, np.inf])

    variance = tukeylambda_variance([-1.0, -0.5])
    assert_equal(variance, nan_then_inf)

    kurtosis = tukeylambda_kurtosis([-1.0, -0.25])
    assert_equal(kurtosis, nan_then_inf)
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
from __future__ import division, print_function, absolute_import
|
||||
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
from scipy.special import i0
|
||||
|
||||
|
||||
def von_mises_cdf_series(k,x,p):
    """Evaluate the von Mises CDF at scalar `x` with a `p`-term series.

    Parameters
    ----------
    k : float
        Concentration parameter.  ``k == 0`` is handled explicitly.
    x : scalar
        Angle at which to evaluate; converted with ``float(x)``.
    p : int
        Number of series terms; the loop runs for ``n = p-1, ..., 1``.

    Returns
    -------
    float
        ``0.5 + x/(2*pi) + V/pi``, where ``V`` is the accumulated series.
    """
    x = float(x)
    if k == 0:
        # R = 1/(2*n/k + R) tends to 0 as k -> 0, so the whole series term V
        # vanishes and only the linear part remains.  Returning it directly
        # avoids the ZeroDivisionError that ``2*n/k`` would raise.
        return 0.5+x/(2*np.pi)

    s = np.sin(x)
    c = np.cos(x)
    sn = np.sin(p*x)
    cn = np.cos(p*x)
    R = 0
    V = 0
    for n in range(p-1,0,-1):
        # Angle-difference identities: step (sn, cn) from the sine/cosine of
        # (n+1)*x down to those of n*x without new trig calls.
        sn, cn = sn*c - cn*s, cn*c + sn*s
        # Downward recurrence for the series coefficient and running sum.
        R = 1./(2*n/k + R)
        V = R*(sn/n+V)

    return 0.5+x/(2*np.pi) + V/np.pi
|
||||
|
||||
|
||||
def von_mises_cdf_normalapprox(k, x):
    """Normal approximation to the von Mises CDF.

    Computes ``norm.cdf(b*sin(x/2))`` with
    ``b = sqrt(2/pi)*exp(k)/i0(k)``.

    Parameters
    ----------
    k : float
        Concentration parameter.
    x : float or ndarray
        Angle(s) at which to evaluate.

    Returns
    -------
    float or ndarray
        Approximate CDF value(s).
    """
    # Local import: the module only imports `i0` at file level.
    from scipy.special import i0e

    # i0e(k) == exp(-abs(k))*i0(k), hence
    #   exp(k)/i0(k) == exp(k - abs(k))/i0e(k).
    # Forming the ratio this way avoids the inf/inf = nan that the direct
    # expression produces once exp(k) overflows (k above roughly 709), and
    # gives the same value as before for every k, negative k included.
    b = np.sqrt(2/np.pi)*np.exp(k - np.abs(k))/i0e(k)
    z = b*np.sin(x/2.)
    return scipy.stats.norm.cdf(z)
|
||||
|
||||
|
||||
def von_mises_cdf(k,x):
    """Evaluate the von Mises CDF for concentration `k` at `x`.

    `x` is first reduced by the nearest multiple of 2*pi.  For ``k < 50``
    the reduced angle goes through `von_mises_cdf_series` (result clipped to
    [0, 1]); otherwise `von_mises_cdf_normalapprox` is used.  The subtracted
    multiple of 2*pi is added back to the result.
    """
    offset = 2*np.pi*np.round(x/(2*np.pi))
    reduced = x-offset
    k = float(k)

    # These values should give 12 decimal digits
    normal_threshold = 50
    c0, c1, c2, c3 = 28., 0.5, 100., 5.0

    if k < normal_threshold:
        # Number of series terms grows with k.
        n_terms = int(np.ceil(c0+c1*k-c2/(k+c3)))
        series_value = von_mises_cdf_series(k,reduced,n_terms)
        return np.clip(series_value,0,1)+offset

    return von_mises_cdf_normalapprox(k, reduced)+offset
|
||||
Reference in New Issue
Block a user