# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# pyre-strict
from __future__ import annotations
import logging
import warnings
from typing import Any, Literal, NamedTuple, overload, TypedDict
import numpy as np
import numpy.typing as npt
import pandas as pd
logger: logging.Logger = logging.getLogger(__package__)
[docs]
class PropAboveBelowResult(TypedDict):
    """Dictionary-shaped result of ``prop_above_and_below``.

    This is the structure built by ``prop_above_and_below`` when it is
    called with ``return_as_series=False``: the two groups of proportions
    are kept apart under separate keys instead of being concatenated into a
    single Series.
    """

    # Proportions for the "below" thresholds; ``None`` when the caller passed
    # ``below=None``.
    below: pd.Series | None
    # Proportions for the "above" thresholds; ``None`` when the caller passed
    # ``above=None``.
    above: pd.Series | None
##########################################
# Weights diagnostics - functions for analyzing weights
# These functions provide statistical measures for evaluating
# the quality and distribution of survey weights
##########################################
def _weights_to_series(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
) -> pd.Series:
    """Coerce any supported weights container into a pandas Series.

    A Series is handed back untouched (same object, no copy). A DataFrame
    contributes only its first column, which is the historical behavior of
    this module. Anything else is passed to the ``pd.Series`` constructor.

    Args:
        w (list[Any] | pd.Series | npt.NDArray | pd.DataFrame):
            The weights in whichever container the caller holds them.

    Returns:
        pd.Series: The weights as a Series.

    Raises:
        TypeError: If ``w`` is a DataFrame that has no columns.
    """
    # Fast path: already the target type.
    if isinstance(w, pd.Series):
        return w

    # Lists, arrays and other sequences go straight to the constructor.
    if not isinstance(w, pd.DataFrame):
        return pd.Series(w)

    # DataFrame: the first column is treated as the weights column.
    _, n_columns = w.shape
    if n_columns == 0:
        raise TypeError("weights (w) DataFrame must include at least one column.")
    return w.iloc[:, 0]
def _check_weights_series_are_valid(
    w: pd.Series,
    *,
    require_positive: bool = False,
) -> None:
    """Run the sanity checks on a weights Series.

    The checks run in this order: non-empty, numeric dtype, no negative
    entries, and finally "has at least one strictly positive entry". The
    last check either raises or only warns, depending on
    ``require_positive``.

    Args:
        w (pd.Series): Weights, already normalized to a pandas Series.
        require_positive (bool, optional): When True, a Series without any
            strictly positive entry raises ``ValueError``. When False
            (default), the same situation only emits a ``UserWarning``, so
            that the ``NaN`` / ``inf`` results of downstream weighted
            statistics (``sum(w*x)/sum(w) = 0/0``) do not go unnoticed.

    Raises:
        ValueError: If ``w`` has length zero.
        TypeError: If the dtype of ``w`` is not numeric.
        ValueError: If any entry of ``w`` is negative.
        ValueError: If ``require_positive`` is True and no entry is
            strictly positive.

    Warns:
        UserWarning: If ``require_positive`` is False and no entry is
            strictly positive. The text starts with
            ``"All N weight entries are NaN"`` when every entry is missing,
            and with ``"All weights are zero (no positive entries)"``
            otherwise (all-zero, or a mix of zeros and NaN).
    """
    # Reject empty input before looking at the dtype: the default dtype of
    # an empty Series differs across pandas versions (``object`` on older
    # releases, ``float64`` on newer ones), so without this guard an empty
    # input would raise TypeError on some versions and merely warn on
    # others.
    if not len(w):
        raise ValueError("weights (w) must be non-empty.")

    if not pd.api.types.is_numeric_dtype(w):
        raise TypeError(
            f"weights (w) must be a number but instead they are of type: {w.dtype}."
        )

    # ``Series.any()`` is used on purpose instead of the built-in ``any()``:
    # with nullable dtypes (``Float64`` / ``Int64``) a comparison yields
    # ``pd.NA`` for missing entries and ``bool(pd.NA)`` raises TypeError.
    # The pandas reduction skips NA by default.
    has_negative = (w < 0).any()
    if has_negative:
        raise ValueError("weights (w) must all be non-negative values.")

    # At least one strictly positive entry: nothing more to check.
    if (w > 0).any():
        return

    if require_positive:
        raise ValueError("weights (w) must include at least one positive value.")

    # No positive entry can mean "all NaN" or "all zero (possibly with some
    # NaN)"; pick the wording that matches so the warning is not misleading.
    total = len(w)
    missing = int(w.isna().sum())
    if missing == total:
        message = (
            f"All {total} weight entries are NaN; weighted statistics "
            "downstream will yield NaN. This usually indicates a bug in "
            "the upstream weighting / filtering pipeline (e.g. an empty "
            "join key or an all-missing column)."
        )
    else:
        message = (
            "All weights are zero (no positive entries); weighted "
            "statistics downstream will yield NaN or inf "
            "(``sum(w*x)/sum(w) = 0/0``). This usually indicates a bug "
            "in the upstream weighting / filtering pipeline."
        )
    warnings.warn(message, UserWarning, stacklevel=3)
def _check_weights_are_valid(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame | None,
    *,
    require_positive: bool = False,
) -> None:
    """Validate weights given in any supported container.

    ``None`` is accepted and skips all checks, since several functions
    treat ``None`` as a legitimate "no weights" input. Any other input is
    first normalized with ``_weights_to_series`` (for a DataFrame only the
    first column is checked) and then validated with
    ``_check_weights_series_are_valid``.

    Args:
        w (list[Any] | pd.Series | npt.NDArray | pd.DataFrame | None):
            The weights to check, or ``None``.
        require_positive (bool, optional): If True, at least one weight must
            be strictly positive. Defaults to False.

    Raises:
        TypeError: If the weights are not numeric, or if ``w`` is a
            DataFrame without columns.
        ValueError: If the weights are empty or include a negative value.
        ValueError: If ``require_positive`` is True and no weight is
            strictly positive.

    Returns:
        None
    """
    if w is not None:
        _check_weights_series_are_valid(
            _weights_to_series(w),
            require_positive=require_positive,
        )
[docs]
def design_effect(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
) -> np.float64:
    """
    Kish's design effect measure.

    The design effect indicates how well a sample may represent a larger
    population for a specific measure of interest (such as the mean).
    Kish's design effect gives the increase in the variance of the weighted
    mean that is due to "haphazard" weights. Its inverse is the effective
    sample size ratio.

    In general a design effect can be lower than 1 (e.g. for stratified
    sampling), but Kish's design effect for weights is always 1 or larger.

    For details, see: Tal Galili (5 May 2024). "Design effect".
    WikiJournal of Science 7 (1): 4. doi:10.15347/wjs/2024.004. Wikidata Q116768211.
    ISSN 2470-6345. https://en.wikipedia.org/wiki/Design_effect

    Args:
        w (list[Any] | pd.Series | npt.NDArray | pd.DataFrame):
            Weights container with non-negative numeric values. If ``w`` is a
            DataFrame, only the first column is used.

    Returns:
        np.float64: An estimator of how much the variance of the mean is
            expected to increase, compared to a random sample mean, due to
            application of the weights. Computed as
            ``mean(w^2) / mean(w)^2``.

    Raises:
        TypeError: If ``w`` is not numeric, or is a DataFrame without columns.
        ValueError: If ``w`` is empty, contains negative entries, or contains
            no strictly positive entry.

    Examples:
        ::

            from balance.stats_and_plots.weights_stats import design_effect
            import pandas as pd

            design_effect(pd.Series((0, 1, 2, 3)))
                # output:
                # 1.5555555555555556
            design_effect(pd.Series((1, 1, 1000)))
                # 2.9880418803112336
                # As expected. With a single dominating weight - the Deff is almost equal to the sample size.
    """
    weights = _weights_to_series(w)
    _check_weights_series_are_valid(weights, require_positive=True)

    # Imported at call time rather than at module level.
    from balance.util import _safe_divide_with_zero_handling

    mean_of_squares = (weights**2).mean()
    squared_mean = weights.mean() ** 2
    # Avoid divide by zero warning
    return _safe_divide_with_zero_handling(mean_of_squares, squared_mean)
[docs]
class KishStats(NamedTuple):
    """The three Kish diagnostics, bundled together.

    All members are derived from one ``design_effect`` computation and are
    tied together by these identities:

    * ``deff = E[w^2] / E[w]^2``  (>= 1 for non-degenerate weights)
    * ``ess  = n / deff``         (effective sample size)
    * ``essp = 1 / deff``         (effective sample proportion in ``[0, 1]``)

    Attributes:
        deff: Kish's design effect.
        ess: Kish's effective sample size.
        essp: Kish's effective sample proportion.
    """

    # Kish's design effect.
    deff: float
    # Effective sample size, ``n / deff``.
    ess: float
    # Effective sample proportion, ``1 / deff``.
    essp: float
[docs]
def kish_deff_stats(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
) -> KishStats:
    """Return Kish's design effect, effective sample size and ESS proportion.

    ``design_effect`` is evaluated a single time; ``ess`` and ``essp`` are
    then derived from that value. This avoids three separate Deff
    computations when all three quantities are wanted. Reach for
    ``kish_ess`` / ``kish_essp`` only when exactly one number is needed.

    Args:
        w (list[Any] | pd.Series | npt.NDArray | pd.DataFrame):
            Weights container with non-negative numeric values. If ``w`` is a
            DataFrame, only the first column is used.

    Returns:
        KishStats: Namedtuple with ``deff``, ``ess``, and ``essp`` fields.

    Raises:
        TypeError: If ``w`` is not numeric.
        ValueError: If ``w`` is empty, contains negative entries, or contains
            no positive entries (validated through ``design_effect`` with
            ``require_positive=True``).

    Examples:
        ::

            from balance.stats_and_plots.weights_stats import kish_deff_stats
            import pandas as pd

            stats = kish_deff_stats(pd.Series((1, 1, 1, 1)))
            stats.deff  # 1.0
            stats.ess   # 4.0
            stats.essp  # 1.0
    """
    weights = _weights_to_series(w)
    deff = float(design_effect(weights))

    # ``n`` is the length of the normalized Series (for a DataFrame input:
    # the number of rows of its first column).
    sample_size = len(weights)
    ess = float(sample_size / deff)
    essp = 1.0 / deff
    return KishStats(deff=deff, ess=ess, essp=essp)
[docs]
def kish_ess(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
) -> float:
    """Kish's effective sample size: ``n / Deff = sum(w)^2 / sum(w^2)``.

    Thin single-value wrapper around ``design_effect``. When ``ess`` is
    needed together with ``deff`` or ``essp``, prefer ``kish_deff_stats``,
    which computes ``design_effect`` once and derives all three.

    Args:
        w (list[Any] | pd.Series | npt.NDArray | pd.DataFrame):
            Weights container with non-negative numeric values. If ``w`` is a
            DataFrame, only the first column is used.

    Returns:
        float: ESS in the same units as ``len(w)``. Equals ``len(w)`` when
            all weights are equal.

    Raises:
        TypeError: If ``w`` is not numeric.
        ValueError: If ``w`` is empty, contains negative entries, or contains
            no positive entries.

    Examples:
        ::

            from balance.stats_and_plots.weights_stats import kish_ess
            import pandas as pd

            kish_ess(pd.Series((1, 1, 1, 1)))  # 4.0
    """
    weights = _weights_to_series(w)
    n_obs = len(weights)
    return float(n_obs / float(design_effect(weights)))
[docs]
def kish_essp(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
) -> float:
    """Kish's effective sample proportion: ``1 / Deff`` (always in ``[0, 1]``).

    Thin single-value wrapper around ``design_effect``. When ``essp`` is
    needed together with ``deff`` or ``ess``, prefer ``kish_deff_stats``.

    Args:
        w (list[Any] | pd.Series | npt.NDArray | pd.DataFrame):
            Weights container with non-negative numeric values. If ``w`` is a
            DataFrame, only the first column is used.

    Returns:
        float: A value in ``[0, 1]``. Equals ``1.0`` when all weights are
            equal.

    Raises:
        TypeError: If ``w`` is not numeric.
        ValueError: If ``w`` is empty, contains negative entries, or contains
            no positive entries.

    Examples:
        ::

            from balance.stats_and_plots.weights_stats import kish_essp
            import pandas as pd

            kish_essp(pd.Series((1, 1, 1, 1)))  # 1.0
    """
    # ``design_effect`` normalizes and validates ``w`` itself.
    deff = float(design_effect(w))
    return 1.0 / deff
[docs]
def nonparametric_skew(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
) -> float:
    """
    The nonparametric skew is the difference between the mean and the median, divided by the standard deviation.

    See:
    - https://en.wikipedia.org/wiki/Nonparametric_skew

    Args:
        w (list[Any] | pd.Series | npt.NDArray | pd.DataFrame):
            Weights container with non-negative numeric values. If ``w`` is a
            DataFrame, only the first column is used.

    Returns:
        float: A value of skew, between -1 and 1; for weights it is often
            positive (i.e.: a right-tailed distribution).
            The value returned is 0 if the input has length 1, or if the
            standard deviation is 0 (i.e.: all values are identical).

    Raises:
        TypeError: If ``w`` is not numeric, or is a DataFrame without columns.
        ValueError: If ``w`` is empty, contains negative entries, or contains
            no strictly positive entry.

    Examples:
        ::

            from balance.stats_and_plots.weights_stats import nonparametric_skew
            import pandas as pd

            nonparametric_skew(pd.Series((1, 1, 1, 1)))  # 0.0
            nonparametric_skew(pd.Series((1,)))          # 0.0
            nonparametric_skew(pd.Series((1, 2, 3, 4)))  # 0.0
            nonparametric_skew(pd.Series((1, 1, 1, 2)))  # 0.5
            nonparametric_skew(pd.Series((1, 2, 2, 2)))  # -0.5
            # Negative weights are rejected:
            # nonparametric_skew(pd.Series((-1, 1, 1, 1)))  # raises ValueError
    """
    # TODO (p2): consider adding other skew measures (https://en.wikipedia.org/wiki/Skewness)
    #            look more in the literature (are there references for using this vs another, or none at all?)
    #            update the doc with insights, once done:
    #            what is more accepted in the literature in the field and what are the advantages of each.
    #            Any reference to literature where this is used to analyze weights of survey?
    #            Add reference to some interpretation of these values?
    w = _weights_to_series(w)
    _check_weights_series_are_valid(w, require_positive=True)

    # A single observation has no spread to compare against.
    if len(w) == 1:
        return 0.0

    # Compute the standard deviation once; it is both the degenerate-case
    # test and the denominator.
    spread = w.std()
    if spread == 0:
        return 0.0

    # Wrap in ``float`` so both branches return the annotated type.
    return float((w.mean() - w.median()) / spread)
# Typing-only signature: with ``return_as_series`` left at (or set to) True,
# the result is a single Series, or None.
@overload
def prop_above_and_below(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
    below: tuple[float, ...] | list[float] | None = (1 / 10, 1 / 5, 1 / 3, 1 / 2, 1),
    above: tuple[float, ...] | list[float] | None = (1, 2, 3, 5, 10),
    return_as_series: Literal[True] = True,
) -> pd.Series | None:
    ...  # pragma: no cover


# Typing-only signature: with ``return_as_series=False`` (keyword-only in
# this overload), the result is a ``PropAboveBelowResult``, or None.
@overload
def prop_above_and_below(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
    below: tuple[float, ...] | list[float] | None = (1 / 10, 1 / 5, 1 / 3, 1 / 2, 1),
    above: tuple[float, ...] | list[float] | None = (1, 2, 3, 5, 10),
    *,
    return_as_series: Literal[False],
) -> PropAboveBelowResult | None:
    ...  # pragma: no cover
[docs]
def prop_above_and_below(
    w: list[Any] | pd.Series | npt.NDArray | pd.DataFrame,
    below: tuple[float, ...] | list[float] | None = (1 / 10, 1 / 5, 1 / 3, 1 / 2, 1),
    above: tuple[float, ...] | list[float] | None = (1, 2, 3, 5, 10),
    return_as_series: bool = True,
) -> pd.Series | PropAboveBelowResult | None:
    """
    The proportion of weights, normalized to sample size, that are above and below some numbers (E.g. 1,2,3,5,10 and their inverse: 1, 1/2, 1/3, etc.).
    This is similar to returning percentiles of the (normalized) weighted distribution. But instead of focusing on the 25th percentile, the median, etc,
    we focus instead on more easily interpretable weights values.

    For example, saying that some proportion of users had a weight of above 1 gives us an indication of how many users
    we got that we don't "lose" their value after using the weights. Saying which proportion of users had a weight below 1/10 tells us how many users
    had basically almost no contribution to the final analysis (after applying the weights).

    Note that below and above can overlap, be unordered, etc. The user is responsible for the order.

    Args:
        w (list[Any] | pd.Series | npt.NDArray | pd.DataFrame):
            Weights container with non-negative numeric values. If ``w`` is a
            DataFrame, only the first column is used.
        below (tuple[float, ...] | list[float] | None, optional):
            values to check which proportion of normalized weights are *below* them.
            Using None omits below-threshold calculations.
            Defaults to (1/10, 1/5, 1/3, 1/2, 1).
        above (tuple[float, ...] | list[float] | None, optional):
            values to check which proportion of normalized weights are *above* (or equal) to them.
            Using None omits above-threshold calculations.
            Defaults to (1, 2, 3, 5, 10).
        return_as_series (bool, optional): If True returns one pd.Series of values.
            If False returns ``PropAboveBelowResult`` with ``below``/``above`` entries
            containing a ``pd.Series`` or ``None`` for omitted groups.
            Defaults to True.

    Returns:
        pd.Series | PropAboveBelowResult | None:
            If return_as_series is True we get pd.Series with proportions of (normalized weights)
            that are below/above some numbers, the index indicates which threshold was checked
            (the values in the index are rounded to 3 decimal places for printing purposes).
            If return_as_series is False we get ``PropAboveBelowResult`` with
            ``below`` and ``above`` keys whose values are the relevant pd.Series
            (or ``None`` when a side is omitted). If both ``below`` and ``above``
            are ``None``, the function returns ``None``.

    Raises:
        TypeError: If ``w`` is not numeric, or is a DataFrame without columns.
        ValueError: If ``w`` is empty, contains negative entries, or contains
            no strictly positive entry. The weights are validated even when
            both ``below`` and ``above`` are ``None``.

    Examples:
        ::

            from balance.stats_and_plots.weights_stats import prop_above_and_below
            import pandas as pd

            # normalized weights:
            print(pd.Series((1, 2, 3, 4)) / pd.Series((1, 2, 3, 4)).mean())
                # 0    0.4
                # 1    0.8
                # 2    1.2
                # 3    1.6
                # dtype: float64

            # checking the function:
            prop_above_and_below(pd.Series((1, 2, 3, 4)))
                # prop(w < 0.1)      0.00
                # prop(w < 0.2)      0.00
                # prop(w < 0.333)    0.00
                # prop(w < 0.5)      0.25
                # prop(w < 1)        0.50
                # prop(w >= 1)       0.50
                # prop(w >= 2)       0.00
                # prop(w >= 3)       0.00
                # prop(w >= 5)       0.00
                # prop(w >= 10)      0.00
                # dtype: float64

            prop_above_and_below(pd.Series((1, 2, 3, 4)), below = (0.1, 0.5), above = (2,3))
                # prop(w < 0.1)    0.00
                # prop(w < 0.5)    0.25
                # prop(w >= 2)     0.00
                # prop(w >= 3)     0.00
                # dtype: float64

            prop_above_and_below(pd.Series((1, 2, 3, 4)), return_as_series = False)
                # {'below': prop(w < 0.1)      0.00
                # prop(w < 0.2)      0.00
                # prop(w < 0.333)    0.00
                # prop(w < 0.5)      0.25
                # prop(w < 1)        0.50
                # dtype: float64, 'above': prop(w >= 1)     0.5
                # prop(w >= 2)     0.0
                # prop(w >= 3)     0.0
                # prop(w >= 5)     0.0
                # prop(w >= 10)    0.0
                # dtype: float64}
    """
    # TODO (p2): look more in the literature (are there references for using this vs another, or none at all?)
    #            update the doc with insights, once done.
    w = _weights_to_series(w)
    _check_weights_series_are_valid(w, require_positive=True)

    # Nothing was requested: skip the normalization work entirely.
    if below is None and above is None:
        return None

    # normalize weight to sample size:
    w = w / w.mean()

    # calculate props from below (strict inequality):
    prop_below_series: pd.Series | None = None
    if below is not None:
        prop_below_series = pd.Series(
            [(w < threshold).mean() for threshold in below],
            index=[f"prop(w < {round(threshold, 3)})" for threshold in below],
        )

    # calculate props from above (greater than or equal):
    prop_above_series: pd.Series | None = None
    if above is not None:
        prop_above_series = pd.Series(
            [(w >= threshold).mean() for threshold in above],
            index=[f"prop(w >= {round(threshold, 3)})" for threshold in above],
        )

    # decide if to return one series or a dict
    if return_as_series:
        # At least one side is present here, thanks to the early return above.
        pieces = [s for s in (prop_below_series, prop_above_series) if s is not None]
        return pd.concat(pieces)
    return PropAboveBelowResult(below=prop_below_series, above=prop_above_series)