Source code for balance.balancedf_class

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from __future__ import annotations

import logging
import re
from typing import Any, Callable, get_args, Literal, Protocol, runtime_checkable

import numpy as np
import numpy.typing as npt
import pandas as pd
from balance import util as balance_util
from balance.csv_utils import to_csv_with_defaults
from balance.stats_and_plots import (
    general_stats,
    impact_of_weights_on_outcome,
    love_plot as _love_plot_module,
    weighted_comparisons_plots,
    weighted_comparisons_stats,
    weighted_stats,
    weights_stats,
)
from balance.stats_and_plots.weighted_comparisons_stats import (
    outcome_variance_ratio as _outcome_variance_ratio,
)
from balance.typing import FilePathOrBuffer
from balance.util import find_items_index_in_list, get_items_from_list_via_indices
from balance.utils.input_validation import _assert_type
from IPython.lib.display import FileLink
from plotly.graph_objs import Figure

logger: "logging.Logger" = logging.getLogger(__package__)


@runtime_checkable
class BalanceDFSource(Protocol):
    """Structural interface required of the object that backs a BalanceDF.

    Anything that satisfies this protocol may be handed to the constructors
    of BalanceDF, BalanceDFCovars, BalanceDFWeights and BalanceDFOutcomes.
    ``Sample`` and ``SampleFrame`` both implement it, which is what lets
    BalanceDF work with either backing class without modification.

    The seven members declared here are the full set of attributes and
    methods that BalanceDF and its subclasses reach for on the backing object
    (``self._sample``); the list was obtained by auditing every
    ``self._sample.*`` access in ``balancedf_class.py``.

    Attributes:
        weight_series: The active weight column, as a ``pd.Series``.
        id_series: The row-identifier column, as a ``pd.Series``.
        _links: Mapping from relationship name (e.g. ``"target"``,
            ``"unadjusted"``) to another ``BalanceDFSource``.
        _outcome_columns: The outcome DataFrame, or ``None`` when the source
            has no outcomes.

    Methods:
        _covar_columns: Return the covariate DataFrame.
        set_weights: Replace the active weight column.
        trim: Trim extreme weights.
    """

    @property
    def weight_series(self) -> pd.Series:
        ...

    @property
    def id_series(self) -> pd.Series:
        ...

    @property
    def _links(self) -> dict[str, BalanceDFSource]:
        ...

    def _covar_columns(self) -> pd.DataFrame:
        ...

    @property
    def _outcome_columns(self) -> pd.DataFrame | None:
        ...

    def set_weights(
        self,
        weights: pd.Series | float | None,
        *,
        use_index: bool = False,
    ) -> None:
        ...

    def trim(
        self,
        ratio: float | int | None = None,
        percentile: float | tuple[float, float] | None = None,
        keep_sum_of_weights: bool = True,
        target_sum_weights: float | int | np.floating | None = None,
        *,
        inplace: bool = False,
    ) -> "BalanceDFSource":
        ...
class BalanceDF:
    """Wrapper around a BalanceDFSource that adds balance-specific functionality.

    An instance pairs a pandas DataFrame with the BalanceDFSource-compatible
    object it was taken from (e.g. Sample, SampleFrame) and offers methods
    for statistical analysis, plotting, and data transformation.
    """

    # Cache slot for the default (formula-less) model matrix.
    _model_matrix: Any = None

    def __init__(
        self: "BalanceDF",
        df: pd.DataFrame,
        sample: BalanceDFSource,
        name: Literal["outcomes", "weights", "covars"],
        links: dict[str, "BalanceDFSource"] | None = None,
    ) -> None:
        """Shared initializer for BalanceDFOutcomes, BalanceDFCovars and BalanceDFWeights.

        Args:
            self (BalanceDF): The object being initialized.
            df (pd.DataFrame): A DataFrame taken from the sample object.
            sample (BalanceDFSource): The backing object (e.g. Sample,
                SampleFrame), kept as a reference.
            name (Literal["outcomes", "weights", "covars"]): Which kind of
                BalanceDF is being created.
            links (dict[str, BalanceDFSource] | None): Optional explicit
                links (e.g. ``{"target": ..., "unadjusted": ...}``). When
                given, :func:`_balancedf_child_from_linked_samples` uses it
                instead of ``sample._links``; this lets BalanceDF work with
                sources that do not carry mutable ``_links`` (e.g. SampleFrame).
        """
        # NOTE: the double underscore adds friction so that users do not
        # change these objects. See https://stackoverflow.com/a/1301369/256662
        # TODO: when refactoring the object class model, re-evaluate whether
        #       these attributes should stay protected.
        self.__sample = sample
        self.__df = df
        self.__name = name
        self.__links_override = links

    def __str__(self: "BalanceDF") -> str:
        # object.__repr__ is used on purpose: it yields the plain
        # "<module.Class object at 0x...>" form of the backing object.
        return (
            f"{self.__name} from {object.__repr__(self._sample)}:\n"
            f"{self.df.__repr__()}"
        )

    def __repr__(self: "BalanceDF") -> str:
        cls = self.__class__
        return f"({cls.__module__}.{cls.__qualname__})\n{self.__str__()}"

    # Private API
    @staticmethod
    def _check_if_not_balancedf(
        BalanceDF_class_obj: "BalanceDF", object_name: str = "sample_BalanceDF"
    ) -> None:
        """Raise ValueError unless the given object is a BalanceDF.

        Args:
            BalanceDF_class_obj (BalanceDF): Object to check.
            object_name (str, optional): Name used in the error message.
                Defaults to "sample_BalanceDF".

        Returns:
            None.

        Raises:
            ValueError: if BalanceDF_class_obj is not a BalanceDF instance.
        """
        if isinstance(BalanceDF_class_obj, BalanceDF):
            return
        raise ValueError(
            f"{object_name} must be balancedf_class.BalanceDF, is {type(BalanceDF_class_obj)}"
        )

    @property
    def _sample(self: "BalanceDF") -> BalanceDFSource:
        """The backing object stored at construction time.

        Returns:
            BalanceDFSource: The private ``__sample`` attribute.
        """
        return self.__sample

    @property
    def _weights(
        self: "BalanceDF",
    ) -> pd.Series | None:
        """The ``weight_series`` of the backing object, with its name removed.

        Returns:
            pd.Series | None: The weights (with no column name).
        """
        return self._sample.weight_series.rename(None)

    @property
    def _resolved_links(self: "BalanceDF") -> dict[str, "BalanceDFSource"]:
        """The effective links dict, preferring the explicit override.

        Returns the ``links`` passed at construction time when one was given,
        and ``self._sample._links`` otherwise. Subclasses and sibling methods
        (e.g. ``r_indicator``) should read this property rather than
        ``_sample._links`` so that BalanceFrame-provided overrides are
        respected.
        """
        override = self.__links_override
        return self._sample._links if override is None else override

    # NOTE: only in the case of BalanceDFOutcomes can it result in a None value.
    def _balancedf_child_from_linked_samples(
        self: "BalanceDF",
    ) -> dict[
        str,
        "BalanceDF" | "BalanceDFCovars" | "BalanceDFWeights" | "BalanceDFOutcomes" | None,
    ]:
        """Build the same kind of BalanceDF child for self and every linked sample.

        The first entry is always ``"self"`` mapped to this very object. For
        every entry in :attr:`_resolved_links`, the method named after this
        object's kind (``covars``, ``weights`` or ``outcomes``) is called on
        the linked object, with the keyword arguments returned by
        :func:`_linked_child_kwargs`.

        Args:
            self (BalanceDF): Object (used in practice only with children of
                BalanceDF).

        Returns:
            dict[str, BalanceDFCovars | BalanceDFWeights | BalanceDFOutcomes | None]:
                Maps the link relationship (e.g. 'self', 'target',
                'unadjusted') to the corresponding child object.

        Examples:
            .. code-block:: python

                from balance.sample_class import Sample
                import pandas as pd

                s1 = Sample.from_frame(
                    pd.DataFrame(
                        {
                            "a": (1, 2, 3, 1),
                            "b": (-42, 8, 2, -42),
                            "o": (7, 8, 9, 10),
                            "c": ("x", "y", "z", "v"),
                            "id": (1, 2, 3, 4),
                            "w": (0.5, 2, 1, 1),
                        }
                    ),
                    id_column="id",
                    weight_column="w",
                    outcome_columns="o",
                )
                s2 = Sample.from_frame(
                    pd.DataFrame(
                        {
                            "a": (1, 2, 3),
                            "b": (4, 6, 8),
                            "id": (1, 2, 3),
                            "w": (0.5, 1, 2),
                            "c": ("x", "y", "z"),
                        }
                    ),
                    id_column="id",
                    weight_column="w",
                )
                s3 = s1.set_target(s2)
                s3_null = s3.adjust(method="null")

                # The keys depend on which samples are linked to the object:
                list(s1.covars()._balancedf_child_from_linked_samples().keys())
                    # ['self']
                list(s3.covars()._balancedf_child_from_linked_samples().keys())
                    # ['self', 'target']
                list(s3_null.covars()._balancedf_child_from_linked_samples().keys())
                    # ['self', 'target', 'unadjusted']

                # Outcomes yield None for a linked sample without outcomes:
                s3.outcomes()._balancedf_child_from_linked_samples()
                    # {'self': (balance.balancedf_class.BalanceDFOutcomes) ...,
                    #  'target': None}
        """
        # NOTE: this assumes that .__name equals the creation method name
        # (i.e.: .covars(), .weights(), .outcomes()).
        child_method_name = self.__name
        children: dict[
            str,
            "BalanceDF"
            | "BalanceDFCovars"
            | "BalanceDFWeights"
            | "BalanceDFOutcomes"
            | None,
        ] = {"self": self}
        # Explicit links win; otherwise this falls back to sample._links.
        links = self._resolved_links
        child_kwargs = self._linked_child_kwargs()
        for relation, linked_source in links.items():
            children[relation] = getattr(linked_source, child_method_name)(
                **child_kwargs
            )
        return children

    def _linked_child_kwargs(self: "BalanceDF") -> dict[str, Any]:
        """Keyword arguments used when creating linked BalanceDF children.

        The base implementation returns an empty dict. Subclasses can
        override it to carry construction options across linked samples (for
        example, formula settings for covariates).
        """
        return {}

    def _call_on_linked(
        self: "BalanceDF",
        method: str,
        exclude: tuple[str, ...] = (),
        *args: Any,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Call a method (or read an attribute) on self and all linked children.

        Every non-None child returned by
        :func:`_balancedf_child_from_linked_samples` whose key is not in
        ``exclude`` contributes one DataFrame. Each DataFrame gets a
        ``source`` index holding that key, and the pieces are concatenated.

        Args:
            self (BalanceDF): Object.
            method (str): Name of a method to call (e.g.: "mean", "std"), or
                of an attribute that is a DataFrame (e.g.: 'df'). Callables
                are invoked with ``on_linked_samples=False``.
            exclude (tuple[str], optional): Sources to leave out of the
                output (e.g.: "self", "target"). Defaults to ().
            *args: Positional arguments forwarded to the method.
            **kwargs: Keyword arguments forwarded to the method.

        Returns:
            pd.DataFrame: The stacked results, indexed by ``source`` (e.g.:
                'self', 'target', 'unadjusted'). The columns depend on the
                method called.

        Examples:
            .. code-block:: python

                # With s3 = s1.set_target(s2), as in the example of
                # _balancedf_child_from_linked_samples:
                print(s3.covars()._call_on_linked("mean").round(3))
                    #             a       b   c[v]   c[x]   c[y]   c[z]
                    # source
                    # self    1.889 -10.000  0.222  0.111  0.444  0.222
                    # target  2.429   6.857    NaN  0.143  0.286  0.571

                print(s3.covars()._call_on_linked("df").round(3))
                    #         a   b  c
                    # source
                    # self    1 -42  x
                    # self    2   8  y
                    # self    3   2  z
                    # self    1 -42  v
                    # target  1   4  x
                    # target  2   6  y
                    # target  3   8  z
        """
        pieces = []
        for source, child in self._balancedf_child_from_linked_samples().items():
            if child is None or source in exclude:
                continue
            result = getattr(child, method)
            if callable(result):
                # Each child reports only on itself; the fan-out happens here.
                result = result(*args, on_linked_samples=False, **kwargs)
            pieces.append(result.assign(source=source).set_index("source"))
        # pyrefly: ignore [bad-return]
        return pd.concat(pieces)
def model_matrix(
    self: "BalanceDF", formula: str | list[str] | None = None
) -> pd.DataFrame:
    """Return the model-matrix version of the object's df.

    Delegates to :func:`balance_util.model_matrix` with ``add_na=True`` and
    ``return_type="one"``. This can be used to turn all character columns
    into one-hot encoded columns.

    Args:
        self (BalanceDF): Object
        formula (str | list[str] | None, optional): Optional formula string
            (or list of formula strings) passed on to
            :func:`balance_util.model_matrix`. When provided, the model
            matrix is computed on demand and is not cached on the object.
            Defaults to None, in which case the result is computed once and
            cached in ``self._model_matrix``.

    Returns:
        pd.DataFrame: The "model_matrix" entry of the output of
            :func:`balance_util.model_matrix`.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )

            print(s1.covars().model_matrix())
                #      a     b  c[v]  c[x]  c[y]  c[z]
                # 0  1.0 -42.0   0.0   1.0   0.0   0.0
                # 1  2.0   8.0   0.0   0.0   1.0   0.0
                # 2  3.0   2.0   0.0   0.0   0.0   1.0
                # 3  1.0 -42.0   1.0   0.0   0.0   0.0

            print(s1.covars().model_matrix(formula="a + b"))
                #      a     b
                # 0  1.0 -42.0
                # 1  2.0   8.0
                # 2  3.0   2.0
                # 3  1.0 -42.0
    """
    if formula is not None:
        # Formula-specific matrices are built on demand and never cached.
        on_demand = balance_util.model_matrix(
            self.df, add_na=True, return_type="one", formula=formula
        )
        return _assert_type(on_demand["model_matrix"], pd.DataFrame)

    if getattr(self, "_model_matrix", None) is None:
        default_result = balance_util.model_matrix(
            self.df, add_na=True, return_type="one"
        )
        self._model_matrix = default_result["model_matrix"]
    # pyrefly: ignore [bad-return]
    return self._model_matrix
def _descriptive_stats(
    self: "BalanceDF",
    stat: Literal["mean", "std", "var_of_mean", "ci_of_mean", "..."] = "mean",
    weighted: bool = True,
    numeric_only: bool = False,
    add_na: bool = True,
    **kwargs: Any,
) -> pd.DataFrame:
    """Run :func:`weighted_stats.descriptive_stats` on this object only.

    Takes care of extracting the df and the weights from the BalanceDF
    object before delegating.

    Args:
        self (BalanceDF): An object to run stats on.
        stat (Literal["mean", "std", "var_of_mean", "ci_of_mean", "..."], optional):
            Defaults to "mean".
        weighted (bool, optional): When False, no weights are passed on.
            Defaults to True.
        numeric_only (bool, optional): When True, only the numeric columns
            of the raw df are used; otherwise :func:`model_matrix` is used.
            Defaults to False.
        add_na (bool, optional): Defaults to True.
        **kwargs: extra args to pass to descriptive_stats

    Returns:
        pd.DataFrame: The output of descriptive_stats (based on the stat
            argument), for each of the columns in df.
    """
    if numeric_only:
        frame = self.df.select_dtypes(include=[np.number])
    else:
        frame = self.model_matrix()

    weight_values = None
    if weighted:
        weight_series = self._weights
        if weight_series is not None:
            weight_values = weight_series.values

    return weighted_stats.descriptive_stats(
        frame,
        weight_values,
        stat,
        weighted=weighted,
        # numeric_only=True is passed regardless of the argument above: the
        # frame built here already holds numeric variables only, so this
        # saves descriptive_stats from running model_matrix again.
        numeric_only=True,
        add_na=add_na,
        **kwargs,
    )
def to_download(self: "BalanceDF", tempdir: str | None = None) -> FileLink:
    """Create a downloadable link to the object's DataFrame, ids included.

    The file name starts with tmp_balance_out_, followed by a random name
    (using :func:`uuid.uuid4`).

    Args:
        self (BalanceDF): Object.
        tempdir (Optional[str], optional): Folder to write into. Defaults to
            None (which then uses a temporary folder obtained through
            :func:`tempfile.gettempdir`).

    Returns:
        FileLink: A local file link that can be embedded in an IPython
            session.

    Examples:
        .. code-block:: python

            import pandas as pd
            import tempfile
            from IPython.lib.display import FileLink
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["1", "2"], "x": [0, 1], "weight": [1.0, 2.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            isinstance(sample.covars().to_download(tempdir=tempfile.gettempdir()), FileLink)
            # True
    """
    frame_with_ids = self._df_with_ids()
    return balance_util._to_download(frame_with_ids, tempdir)
# Public API
# All these functions operate on multiple samples
@property
def df(self: "BalanceDF") -> pd.DataFrame:
    """The DataFrame held by this BalanceDF object.

    This is the private ``__df`` attribute that is set during ``__init__``.

    Args:
        self (BalanceDF): The object.

    Returns:
        pd.DataFrame: The df (this is __df, with no weights) from the
            BalanceDF object.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["1", "2"], "x": [0, 1], "weight": [1.0, 2.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            sample.covars().df.columns.tolist()
            # ['x']
    """
    return self.__df
def names(self: "BalanceDF") -> list[str]:
    """Return the column names of the DataFrame (df) inside a BalanceDF object.

    Args:
        self (BalanceDF): The object.

    Returns:
        List: Of column names, in the order they appear in df.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )

            s1.covars().names()
                # ['a', 'b', 'c']
            s1.weights().names()
                # ['w']
            s1.outcomes().names()
                # ['o']
    """
    column_values = self.df.columns.values
    return [*column_values]
def plot(
    self: "BalanceDF",
    on_linked_samples: bool = True,
    **kwargs: Any,
) -> list[Any] | npt.NDArray[Any] | dict[str, Figure] | str | None:
    """Plot the variables in the df of the BalanceDF object.

    See :func:`weighted_comparisons_plots.plot_dist` for details of the
    arguments that can be passed. The default plotting engine is plotly, but
    seaborn can be used for static plots, or "balance" can be used for ASCII
    text output suitable for LLM consumption (only dist_type="hist_ascii" is
    supported with library="balance").

    When using ``library="balance"``, numeric variables are rendered as
    comparative histograms by default (showing excess/deficit vs. a
    baseline). Pass ``comparative=False`` to use grouped-bar histograms
    instead (same style as categorical variables).

    This function is inherited as is when invoking BalanceDFCovars.plot, but
    some modifications are made when preparing the data for
    BalanceDFOutcomes.plot and BalanceDFWeights.plot.

    Args:
        self (BalanceDF): Object (used in the plots as "sample" or "self")
        on_linked_samples (bool, optional): Determines if the linked samples
            should be included in the plot. Defaults to True.
        **kwargs: passed to :func:`weighted_comparisons_plots.plot_dist`.

    Returns:
        list | np.ndarray | dict[str, Figure] | str | None:
            If library="plotly" then returns a dictionary containing plots if
            return_dict_of_figures is True. None otherwise.
            If library="seaborn" then returns None, unless return_axes is
            True. Then either a list or an np.array of matplotlib axis.
            If library="balance" then returns a string with the ASCII text
            output.

    Examples:
        .. code-block:: python

            import numpy as np
            import pandas as pd
            from numpy import random
            from balance.sample_class import Sample

            random.seed(96483)

            df = pd.DataFrame({
                "id": range(100),
                'v1': random.random_integers(11111, 11114, size=100).astype(str),
                'v2': random.normal(size = 100),
                'v3': random.uniform(size = 100),
                "w": pd.Series(np.ones(99).tolist() + [1000]),
            }).sort_values(by=['v2'])

            s1 = Sample.from_frame(df, id_column="id", weight_column="w")
            s2 = Sample.from_frame(
                df.assign(w = pd.Series(np.ones(100))),
                id_column="id",
                weight_column="w",
            )
            s3 = s1.set_target(s2)
            s3_null = s3.adjust(method="null")
            s3_null.set_weights(random.random(size = 100) + 0.5)

            s3_null.covars().plot()
            s3_null.covars().plot(library = "seaborn")

            # Controlling the limits of the y axis using ylim:
            s3_null.covars().plot(ylim = (0,1))
            s3_null.covars().plot(library = "seaborn",ylim = (0,1), dist_type = "hist")

            # Returning plotly qq plots:
            s3_null.covars().plot(dist_type = "qq")

            # ASCII text output (suitable for LLM consumption):
            s3_null.covars().plot(library = "balance", dist_type = "hist_ascii")

            # ASCII with grouped-bar histograms instead of comparative:
            s3_null.covars().plot(library = "balance", comparative = False)
    """
    candidates = (
        self._balancedf_child_from_linked_samples()
        if on_linked_samples
        else {"self": self}
    )

    # Sources without data (value is None) are dropped from both the list of
    # frames and the list of names, so the two stay aligned.
    available = {
        source: child for source, child in candidates.items() if child is not None
    }
    names = list(available)
    dfs = [{"df": child.df, "weight": child._weights} for child in available.values()]

    # Re-order both lists.
    # NOTE: "target", if it exists, goes last so that comparative plotting
    #       functions can use it as the reference distribution.
    ordering = find_items_index_in_list(
        names, ["unadjusted", "self", "adjusted", "target"]
    )
    dfs = get_items_from_list_via_indices(dfs, ordering)
    names = get_items_from_list_via_indices(names, ordering)

    return weighted_comparisons_plots.plot_dist(dfs, names=names, **kwargs)
# NOTE: The following functions use the _call_on_linked method # to return information about the characteristics of linked Samples
def mean(
    self: "BalanceDF", on_linked_samples: bool = True, **kwargs: Any
) -> pd.DataFrame:
    """Calculate a weighted mean on the df of the BalanceDF object.

    Args:
        self (BalanceDF): Object.
        on_linked_samples (bool, optional): Should the calculation be on
            self AND the linked samples objects? Defaults to True.
            If True, then uses :func:`_call_on_linked` with method "mean".
            If False, then uses :func:`_descriptive_stats` with method "mean".
        **kwargs: Forwarded to whichever of the two is used.

    Returns:
        pd.DataFrame: With row per object: self if on_linked_samples=False,
            and self and others (e.g.: target and unadjusted) if True.
            Columns are for each of the columns in the relevant df (after
            applying :func:`model_matrix`)

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )
            s2 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3),
                        "b": (4, 6, 8),
                        "id": (1, 2, 3),
                        "w": (0.5, 1, 2),
                        "c": ("x", "y", "z"),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            s3 = s1.set_target(s2)
            s3_null = s3.adjust(method="null")

            print(s3_null.covars().mean())
                #                    a          b      c[v]      c[x]      c[y]      c[z]
                # source
                # self        1.888889 -10.000000  0.222222  0.111111  0.444444  0.222222
                # target      2.428571   6.857143       NaN  0.142857  0.285714  0.571429
                # unadjusted  1.888889 -10.000000  0.222222  0.111111  0.444444  0.222222
    """
    if not on_linked_samples:
        return self._descriptive_stats("mean", **kwargs)
    return self._call_on_linked("mean", **kwargs)
def std(
    self: "BalanceDF", on_linked_samples: bool = True, **kwargs: Any
) -> pd.DataFrame:
    """Calculate a weighted std on the df of the BalanceDF object.

    Args:
        self (BalanceDF): Object.
        on_linked_samples (bool, optional): Should the calculation be on
            self AND the linked samples objects? Defaults to True.
            If True, then uses :func:`_call_on_linked` with method "std".
            If False, then uses :func:`_descriptive_stats` with method "std".
        **kwargs: Forwarded to whichever of the two is used.

    Returns:
        pd.DataFrame: With row per object: self if on_linked_samples=False,
            and self and others (e.g.: target and unadjusted) if True.
            Columns are for each of the columns in the relevant df (after
            applying :func:`model_matrix`)

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )
            s2 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3),
                        "b": (4, 6, 8),
                        "id": (1, 2, 3),
                        "w": (0.5, 1, 2),
                        "c": ("x", "y", "z"),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            s3 = s1.set_target(s2)
            s3_null = s3.adjust(method="null")

            print(s3_null.covars().std())
                #                    a          b  c[v]      c[x]      c[y]      c[z]
                # source
                # self        0.886405  27.354812   0.5  0.377964  0.597614  0.500000
                # target      0.963624   1.927248   NaN  0.462910  0.597614  0.654654
                # unadjusted  0.886405  27.354812   0.5  0.377964  0.597614  0.500000
    """
    compute = self._call_on_linked if on_linked_samples else self._descriptive_stats
    return compute("std", **kwargs)
def var_of_mean(
    self: "BalanceDF", on_linked_samples: bool = True, **kwargs: Any
) -> pd.DataFrame:
    """Calculate the variance of the weighted mean on the df of the BalanceDF object.

    Args:
        self (BalanceDF): Object.
        on_linked_samples (bool, optional): Should the calculation be on
            self AND the linked samples objects? Defaults to True.
            If True, then uses :func:`_call_on_linked` with method "var_of_mean".
            If False, then uses :func:`_descriptive_stats` with method "var_of_mean".
        **kwargs: Forwarded to whichever of the two is used.

    Returns:
        pd.DataFrame: With row per object: self if on_linked_samples=False,
            and self and others (e.g.: target and unadjusted) if True.
            Columns are for each of the columns in the relevant df (after
            applying :func:`model_matrix`)

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample
            from balance.stats_and_plots.weighted_stats import var_of_weighted_mean

            var_of_weighted_mean(pd.Series((1, 2, 3, 1)), pd.Series((0.5, 2, 1, 1)))
                # 0    0.112178
                # dtype: float64

            # This shows we got the first cell of 'a' as expected.

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )
            s2 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3),
                        "b": (4, 6, 8),
                        "id": (1, 2, 3),
                        "w": (0.5, 1, 2),
                        "c": ("x", "y", "z"),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            s3 = s1.set_target(s2)
            s3_null = s3.adjust(method="null")

            print(s3_null.covars().var_of_mean())
                #                    a           b      c[v]      c[x]      c[y]      c[z]
                # source
                # self        0.112178  134.320988  0.042676  0.013413  0.082914  0.042676
                # target      0.163265    0.653061       NaN  0.023324  0.069971  0.093294
                # unadjusted  0.112178  134.320988  0.042676  0.013413  0.082914  0.042676
    """
    stat_name = "var_of_mean"
    if not on_linked_samples:
        return self._descriptive_stats(stat_name, **kwargs)
    return self._call_on_linked(stat_name, **kwargs)
def ci_of_mean(
    self: "BalanceDF", on_linked_samples: bool = True, **kwargs: Any
) -> pd.DataFrame:
    """Calculate confidence intervals of the weighted mean on the df of the BalanceDF object.

    Args:
        self (BalanceDF): Object.
        on_linked_samples (bool, optional): Should the calculation be on
            self AND the linked samples objects? Defaults to True.
            If True, then uses :func:`_call_on_linked` with method "ci_of_mean".
            If False, then uses :func:`_descriptive_stats` with method "ci_of_mean".
        kwargs: we can pass ci_of_mean arguments. E.g.: conf_level and
            round_ndigits.

    Returns:
        pd.DataFrame: With row per object: self if on_linked_samples=False,
            and self and others (e.g.: target and unadjusted) if True.
            Columns are for each of the columns in the relevant df (after
            applying :func:`model_matrix`)

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample
            from balance.stats_and_plots.weighted_stats import ci_of_weighted_mean

            ci_of_weighted_mean(pd.Series((1, 2, 3, 1)), pd.Series((0.5, 2, 1, 1)), round_ndigits = 3)
                # 0    (1.232, 2.545)
                # dtype: object

            # This shows we got the first cell of 'a' as expected.

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )
            s2 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3),
                        "b": (4, 6, 8),
                        "id": (1, 2, 3),
                        "w": (0.5, 1, 2),
                        "c": ("x", "y", "z"),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            s3 = s1.set_target(s2)
            s3_null = s3.adjust(method="null")

            print(s3_null.covars().ci_of_mean(round_ndigits = 3).T)
                # source               self           target         unadjusted
                # a          (1.232, 2.545)   (1.637, 3.221)     (1.232, 2.545)
                # b       (-32.715, 12.715)   (5.273, 8.441)  (-32.715, 12.715)
                # c[v]      (-0.183, 0.627)              NaN    (-0.183, 0.627)
                # c[x]      (-0.116, 0.338)  (-0.156, 0.442)    (-0.116, 0.338)
                # c[y]       (-0.12, 1.009)  (-0.233, 0.804)     (-0.12, 1.009)
                # c[z]      (-0.183, 0.627)   (-0.027, 1.17)    (-0.183, 0.627)

            s3_2 = s1.set_target(s2)
            s3_null_2 = s3_2.adjust(method="null")
            print(s3_null_2.outcomes().ci_of_mean(round_ndigits = 3))
                #                         o
                # source
                # self        (7.671, 9.44)
                # unadjusted  (7.671, 9.44)
    """
    return (
        self._call_on_linked("ci_of_mean", **kwargs)
        if on_linked_samples
        else self._descriptive_stats("ci_of_mean", **kwargs)
    )
def mean_with_ci(
    self: "BalanceDF", round_ndigits: int = 3, on_linked_samples: bool = True
) -> pd.DataFrame:
    """Return a table of means and confidence intervals (CIs) for the BalanceDF object.

    The means come from :func:`BalanceDF.mean` (rounded to ``round_ndigits``)
    and the CIs from :func:`BalanceDF.ci_of_mean` (which receives
    ``round_ndigits``). Both results are transposed and placed side by side;
    the CI columns get a ``_ci`` suffix appended to their labels.

    Args:
        self (BalanceDF): The BalanceDF object.
        round_ndigits (int, optional): The number of decimal places to round
            the mean and CI to. Defaults to 3.
        on_linked_samples (bool, optional): Whether to include linked
            samples in the calculation. Defaults to True.

    Returns:
        pd.DataFrame: One row per column of the mean / CI tables, and two
            columns per source (e.g. self, target): ``<source>`` with the
            mean and ``<source>_ci`` with the CI.

    Examples:
        .. code-block:: python

            import numpy as np
            import pandas as pd
            from balance.sample_class import Sample

            s_o = Sample.from_frame(
                pd.DataFrame({"o1": (7, 8, 9, 10), "o2": (7, 8, 9, np.nan), "id": (1, 2, 3, 4)}),
                id_column="id",
                outcome_columns=("o1", "o2"),
            )

            t_o = Sample.from_frame(
                pd.DataFrame(
                    {
                        "o1": (7, 8, 9, 10, 11, 12, 13, 14),
                        "o2": (7, 8, 9, np.nan, np.nan, 12, 13, 14),
                        "id": (1, 2, 3, 4, 5, 6, 7, 8),
                    }
                ),
                id_column="id",
                outcome_columns=("o1", "o2"),
            )
            s_o2 = s_o.set_target(t_o)

            print(s_o2.outcomes().mean_with_ci())
                # source            self  target          self_ci        target_ci
                # _is_na_o2[False]  0.75   0.750   (0.326, 1.174)     (0.45, 1.05)
                # _is_na_o2[True]   0.25   0.250  (-0.174, 0.674)    (-0.05, 0.55)
                # o1                8.50  10.500   (7.404, 9.596)  (8.912, 12.088)
                # o2                6.00   7.875   (2.535, 9.465)  (4.351, 11.399)
    """
    mean_table = self.mean(on_linked_samples=on_linked_samples)
    mean_table = mean_table.round(round_ndigits).T

    ci_table = self.ci_of_mean(
        on_linked_samples=on_linked_samples, round_ndigits=round_ndigits
    ).T
    # Suffix the CI labels so they do not collide with the mean columns.
    ci_table.columns = ci_table.columns.astype(str) + "_ci"

    return pd.concat([mean_table, ci_table], axis=1)
# NOTE: summary() may also return a str when it is overridden in child classes.
def summary(
    self: "BalanceDF", on_linked_samples: bool = True
) -> pd.DataFrame | str:
    """Return a summary of the BalanceDF object.

    At present this is exactly the output of :func:`BalanceDF.mean_with_ci`,
    i.e. the mean and confidence interval (CI) of each column. In the
    future, this method may be extended to include additional summary
    statistics.

    Args:
        self (BalanceDF): The BalanceDF object.
        on_linked_samples (bool, optional): Whether to include linked
            samples when calculating the mean and CI. Defaults to True.

    Returns:
        Union[pd.DataFrame, str]: Whatever :func:`BalanceDF.mean_with_ci`
            returns for this object.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["1", "2"], "x": [0, 1], "weight": [1.0, 2.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            sample.covars().summary().columns.tolist()
            # ['self', 'self_ci']
    """
    # TODO model matrix means to include categorical columns, fix model_matrix to accept DataFrame
    # TODO: include min/max/std/etc. show min/mean/max if there's a single column, just means if multiple (covars and outcomes)
    #       Doing so would either require to implement a min/max etc methods in BalanceDF and use them with _call_on_linked.
    #       Or, update _call_on_linked to deal with non functions, get 'df' from it, and apply the needed functions on it.
    # TODO add outcome variance ratio
    means_and_cis = self.mean_with_ci(on_linked_samples=on_linked_samples)
    return means_and_cis
def _get_df_and_weights( self: "BalanceDF", use_model_matrix: bool = True, ) -> tuple[pd.DataFrame, npt.NDArray | None]: """Extract df values and weights from a BalanceDF object. Args: self (BalanceDF): Object use_model_matrix (bool, optional): If True, use :func:`model_matrix`. If False, use the raw df. Defaults to True. Returns: tuple[pd.DataFrame, np.ndarray | None]: A pd.DataFrame output from running :func:`model_matrix` or using the raw df, and A np.ndarray of weights from :func:`_weights`, or just None (if there are no weights). """ if use_model_matrix: df_values = self.model_matrix() else: df_values = self.df.copy() # get weights (like in BalanceDF._descriptive_stats) weights = self._weights.values if (self._weights is not None) else None return df_values, weights @staticmethod def _apply_comparison_stat_to_BalanceDF( comparison_func: Callable[..., pd.Series], sample_BalanceDF: "BalanceDF", target_BalanceDF: "BalanceDF", aggregate_by_main_covar: bool = False, use_model_matrix: bool = True, **kwargs: Any, ) -> pd.Series: """Generic helper to apply a weighted comparison statistic function to two BalanceDF objects. This helper function reduces code duplication across multiple comparison methods (asmd, kld, emd, cvmd, ks) by extracting the common pattern of: 1. Validating inputs are BalanceDF objects 2. Extracting df and weights from both objects 3. Calling the comparison function with the extracted data When ``use_model_matrix`` is False, the raw DataFrames are combined, NA indicator columns are added via :func:`balance.util.add_na_indicator_to_combined`, and the result is split back so that both frames share consistent indicator columns. Args: comparison_func (Callable[..., pd.Series]): The comparison function from weighted_comparisons_stats to apply (e.g., asmd, kld, emd, cvmd, ks). sample_BalanceDF (BalanceDF): Sample object. target_BalanceDF (BalanceDF): Target object. aggregate_by_main_covar (bool, optional): Whether to aggregate by main covariate. 
Defaults to False. Passed to the comparison function. use_model_matrix (bool, optional): If True, use :func:`model_matrix` on the BalanceDF objects. If False, use raw covariates with NA indicators. Defaults to True. **kwargs: Additional keyword arguments to pass to the comparison function (e.g., std_type for asmd). Returns: pd.Series: The result from the comparison function. """ BalanceDF._check_if_not_balancedf(sample_BalanceDF, "sample_BalanceDF") BalanceDF._check_if_not_balancedf(target_BalanceDF, "target_BalanceDF") sample_df_values, sample_weights = sample_BalanceDF._get_df_and_weights( use_model_matrix=use_model_matrix, ) target_df_values, target_weights = target_BalanceDF._get_df_and_weights( use_model_matrix=use_model_matrix, ) if not use_model_matrix: combined = balance_util.add_na_indicator_to_combined( pd.concat([sample_df_values, target_df_values], axis=0) ) sample_n = sample_df_values.shape[0] sample_df_values = combined.iloc[:sample_n].copy() target_df_values = combined.iloc[sample_n:].copy() return comparison_func( sample_df_values, target_df_values, sample_weights, target_weights, aggregate_by_main_covar=aggregate_by_main_covar, **kwargs, ) @staticmethod def _asmd_BalanceDF( sample_BalanceDF: "BalanceDF", target_BalanceDF: "BalanceDF", aggregate_by_main_covar: bool = False, ) -> pd.Series: """Run asmd on two BalanceDF objects Prepares the BalanceDF objects by passing them through :func:`_get_df_and_weights`, and then pass the df and weights from the two objects into :func:`weighted_comparisons_stats.asmd`. Note that this will works on the result of model_matrix (default behavior, no formula supplied), which is different than just the raw covars. E.g.: in case there are nulls (will produce an indicator column of that), as well as if there are categorical variables (transforming them using one hot encoding). 
Args: sample_df (BalanceDF): Object target_df (BalanceDF): Object aggregate_by_main_covar (bool, optional): See :func:`weighted_comparisons_stats.asmd`. Defaults to False. Returns: pd.Series: See :func:`weighted_comparisons_stats.asmd` Examples: .. code-block:: python from balance.balancedf_class import BalanceDF BalanceDF._asmd_BalanceDF( Sample.from_frame( pd.DataFrame( {"id": (1, 2), "a": (1, 2), "b": (-1, 12), "weight": (1, 2)} ) ).covars(), Sample.from_frame( pd.DataFrame( {"id": (1, 2), "a": (3, 4), "b": (0, 42), "weight": (1, 2)} ) ).covars(), ) # a 2.828427 # b 0.684659 # mean(asmd) 1.756543 # dtype: float64 """ return BalanceDF._apply_comparison_stat_to_BalanceDF( weighted_comparisons_stats.asmd, sample_BalanceDF, target_BalanceDF, aggregate_by_main_covar, std_type="target", ) @staticmethod def _kld_BalanceDF( sample_BalanceDF: "BalanceDF", target_BalanceDF: "BalanceDF", aggregate_by_main_covar: bool = False, ) -> pd.Series: """Run KLD on two BalanceDF objects. By default, this prepares the BalanceDF objects by using their raw df (with NA indicators), and then passes the df and weights from the two objects into :func:`weighted_comparisons_stats.kld`. If either BalanceDF provides a formula for KLD comparisons (currently :class:`BalanceDFCovars` with a stored formula), this method builds a *shared* model matrix from the combined sample+target data using that single effective formula and compares those aligned matrices instead of raw covariates. If both objects provide formulas and they differ, a ``ValueError`` is raised to prevent comparing mismatched design matrices. Args: sample_BalanceDF (BalanceDF): Object target_BalanceDF (BalanceDF): Object aggregate_by_main_covar (bool, optional): See :func:`weighted_comparisons_stats.kld`. Defaults to False. Returns: pd.Series: See :func:`weighted_comparisons_stats.kld`. 
""" BalanceDF._check_if_not_balancedf(sample_BalanceDF, "sample_BalanceDF") BalanceDF._check_if_not_balancedf(target_BalanceDF, "target_BalanceDF") sample_formula = sample_BalanceDF._kld_formula() target_formula = target_BalanceDF._kld_formula() if sample_formula is not None and target_formula is not None: normalized_sample_formula = BalanceDF._normalize_formula_for_comparison( sample_formula ) normalized_target_formula = BalanceDF._normalize_formula_for_comparison( target_formula ) if normalized_sample_formula != normalized_target_formula: raise ValueError( "KLD formula mismatch between sample and target. " f"Got sample formula {sample_formula!r} and target formula {target_formula!r}. " "Use a single shared formula for both." ) effective_formula = ( sample_formula if sample_formula is not None else target_formula ) use_model_matrix = effective_formula is not None if use_model_matrix: mm = balance_util.model_matrix( sample_BalanceDF.df, target_BalanceDF.df, add_na=True, return_type="two", formula=effective_formula, ) sample_weights = ( sample_BalanceDF._weights.values if sample_BalanceDF._weights is not None else None ) target_weights = ( target_BalanceDF._weights.values if target_BalanceDF._weights is not None else None ) return weighted_comparisons_stats.kld( _assert_type(mm["sample"], pd.DataFrame), _assert_type(mm["target"], pd.DataFrame), sample_weights, target_weights, aggregate_by_main_covar=aggregate_by_main_covar, ) return BalanceDF._apply_comparison_stat_to_BalanceDF( weighted_comparisons_stats.kld, sample_BalanceDF, target_BalanceDF, aggregate_by_main_covar, use_model_matrix=False, ) def _kld_formula(self: "BalanceDF") -> str | list[str] | None: """Formula to use for KLD comparison matrices, if applicable.""" return None @staticmethod def _normalize_formula_for_comparison( formula: str | list[str], ) -> tuple[str, ...]: """Normalize formulas for robust equality checks. 
Insignificant whitespace is removed so equivalent formulas such as ``\"a*b\"`` and ``\"a * b\"`` compare equal. """ if isinstance(formula, str): formula = [formula] return tuple(re.sub(r"\s+", "", f) for f in formula) @staticmethod def _emd_BalanceDF( sample_BalanceDF: "BalanceDF", target_BalanceDF: "BalanceDF", aggregate_by_main_covar: bool = False, ) -> pd.Series: """Run EMD on two BalanceDF objects. Prepares the BalanceDF objects by using their raw df (with NA indicators), and then passes the df and weights into :func:`weighted_comparisons_stats.emd`. Args: sample_BalanceDF (BalanceDF): Object. target_BalanceDF (BalanceDF): Object. aggregate_by_main_covar (bool, optional): See :func:`weighted_comparisons_stats.emd`. Defaults to False. Returns: pd.Series: See :func:`weighted_comparisons_stats.emd`. """ return BalanceDF._apply_comparison_stat_to_BalanceDF( weighted_comparisons_stats.emd, sample_BalanceDF, target_BalanceDF, aggregate_by_main_covar, use_model_matrix=False, ) @staticmethod def _cvmd_BalanceDF( sample_BalanceDF: "BalanceDF", target_BalanceDF: "BalanceDF", aggregate_by_main_covar: bool = False, ) -> pd.Series: """Run CVMD on two BalanceDF objects. Prepares the BalanceDF objects by using their raw df (with NA indicators), and then passes the df and weights into :func:`weighted_comparisons_stats.cvmd`. Args: sample_BalanceDF (BalanceDF): Object. target_BalanceDF (BalanceDF): Object. aggregate_by_main_covar (bool, optional): See :func:`weighted_comparisons_stats.cvmd`. Defaults to False. Returns: pd.Series: See :func:`weighted_comparisons_stats.cvmd`. """ return BalanceDF._apply_comparison_stat_to_BalanceDF( weighted_comparisons_stats.cvmd, sample_BalanceDF, target_BalanceDF, aggregate_by_main_covar, use_model_matrix=False, ) @staticmethod def _ks_BalanceDF( sample_BalanceDF: "BalanceDF", target_BalanceDF: "BalanceDF", aggregate_by_main_covar: bool = False, ) -> pd.Series: """Run KS on two BalanceDF objects. 
Prepares the BalanceDF objects by using their raw df (with NA indicators), and then passes the df and weights into :func:`weighted_comparisons_stats.ks`. Args: sample_BalanceDF (BalanceDF): Object. target_BalanceDF (BalanceDF): Object. aggregate_by_main_covar (bool, optional): See :func:`weighted_comparisons_stats.ks`. Defaults to False. Returns: pd.Series: See :func:`weighted_comparisons_stats.ks`. """ return BalanceDF._apply_comparison_stat_to_BalanceDF( weighted_comparisons_stats.ks, sample_BalanceDF, target_BalanceDF, aggregate_by_main_covar, use_model_matrix=False, )
def asmd(
    self: "BalanceDF",
    on_linked_samples: bool = True,
    target: "BalanceDF" | None = None,
    aggregate_by_main_covar: bool = False,
    **kwargs: Any,
) -> pd.DataFrame:
    """ASMD is the absolute difference of the means of two groups (say, P and T), divided by some standard deviation (std).

    The std can be that of P, of T, or of P and T. These are all variations
    on the absolute value of cohen's d
    (see: https://en.wikipedia.org/wiki/Effect_size#Cohen's_d).

    We can use asmd to compare multiple Samples (with and without adjustment)
    to a target population.

    Args:
        self (BalanceDF): Object from sample (with/without adjustment, but it needs some target)
        on_linked_samples (bool, optional): If to compare also to linked sample objects (specifically: unadjusted), or not.
            If True, then uses :func:`_call_on_linked` with method "asmd".
            If False, then uses :func:`_asmd_BalanceDF` directly.
            Defaults to True.
        target (Optional["BalanceDF"], optional): A BalanceDF (of the same type as the one used in self) to compare against.
            If None then it looks for a target in the self linked objects. Defaults to None.
        aggregate_by_main_covar (bool, optional): Defaults to False.
            If True, it will make sure to return the asmd DataFrame after averaging
            all the columns from using the one-hot encoding for categorical variables.
            See :func:`_aggregate_statistic_by_main_covar` for more details.
        **kwargs: Forwarded to :func:`_call_on_linked` when ``on_linked_samples`` is True.
            They are not used when ``on_linked_samples`` is False.

    Raises:
        ValueError:
            If self has no target and no target is supplied.

    Returns:
        pd.DataFrame:
            If on_linked_samples is False, then only one row (index name depends on BalanceDF type, e.g.: covars),
            with asmd of self vs the target (depending if it's covars, or something else).
            If on_linked_samples is True, then one row per source (self, unadjusted), each with
            the asmd compared to target, and another row for their difference
            (labelled ``unadjusted - self`` in the example below).

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample
            from copy import deepcopy

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )

            s2 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3),
                        "b": (4, 6, 8),
                        "id": (1, 2, 3),
                        "w": (0.5, 1, 2),
                        "c": ("x", "y", "z"),
                    }
                ),
                id_column="id",
                weight_column="w",
            )

            s3 = s1.set_target(s2)
            s3_null = s3.adjust(method="null")

            s3_null_madeup_weights = deepcopy(s3_null)
            s3_null_madeup_weights.set_weights((1, 2, 3, 1))

            print(s3_null.covars().asmd().round(3))
                #                      a      b  c[v]   c[x]   c[y]   c[z]  mean(asmd)
                # source
                # self               0.56  8.747   NaN  0.069  0.266  0.533       3.175
                # unadjusted         0.56  8.747   NaN  0.069  0.266  0.533       3.175
                # unadjusted - self  0.00  0.000   NaN  0.000  0.000  0.000       0.000

            # show that on_linked_samples = False works:
            print(s3_null.covars().asmd(on_linked_samples = False).round(3))
                #            a      b  c[v]   c[x]   c[y]   c[z]  mean(asmd)
                # index
                # covars  0.56  8.747   NaN  0.069  0.266  0.533       3.175

            # verify this also works when we have some weights
            print(s3_null_madeup_weights.covars().asmd())
                #                           a         b  c[v]  ...      c[y]      c[z]  mean(asmd)
                # source                                       ...
                # self               0.296500  8.153742   NaN  ...  0.000000  0.218218    2.834932
                # unadjusted         0.560055  8.746742   NaN  ...  0.265606  0.533422    3.174566
                # unadjusted - self  0.263555  0.592999   NaN  ...  0.265606  0.315204     0.33963
    """
    if target is None:
        # Only build the linked children when no explicit target was given;
        # with an explicit target the lookup result would be thrown away.
        target = self._balancedf_child_from_linked_samples().get("target")

    if target is None:
        raise ValueError(
            f"Sample {object.__str__(self._sample)} has no target set, or target has no {self.__name} to compare against."
        )

    if on_linked_samples:
        return balance_util.row_pairwise_diffs(
            self._call_on_linked(
                "asmd",
                exclude=("target",),
                target=target,
                aggregate_by_main_covar=aggregate_by_main_covar,
                **kwargs,
            )
        )

    # Direct comparison: turn the Series into a one-row frame indexed by
    # this BalanceDF's name (e.g. "covars").
    return (
        pd.DataFrame(self._asmd_BalanceDF(self, target, aggregate_by_main_covar))
        .transpose()
        .assign(index=(self.__name,))
        .set_index("index")
    )
def kld(
    self: "BalanceDF",
    on_linked_samples: bool = True,
    target: "BalanceDF" | None = None,
    aggregate_by_main_covar: bool = False,
    **kwargs: Any,
) -> pd.DataFrame:
    """Calculate KL divergence (KLD) to measure distributional differences between samples and target.

    KLD is the Kullback-Leibler divergence, an asymmetric measure of how one probability
    distribution differs from a reference distribution. Unlike ASMD which measures
    standardized mean differences, KLD captures differences in the entire distribution shape.

    We can use kld to compare multiple Samples (with and without adjustment) to a target population.

    Args:
        self (BalanceDF): Object from sample (with/without adjustment, but it needs some target)
        on_linked_samples (bool, optional): If to compare also to linked sample objects (specifically: unadjusted), or not.
            Defaults to True.
            If True, then uses :func:`_call_on_linked` with method "kld".
            If False, then uses :func:`_kld_BalanceDF` directly.
        target (Optional["BalanceDF"], optional): A BalanceDF (of the same type as the one used in self) to compare against.
            If None then it looks for a target in the self linked objects. Defaults to None.
        aggregate_by_main_covar (bool, optional): Defaults to False.
            If True, it will return the KLD results after aggregating columns that share a
            main covariate name into a single metric. This applies to columns created from
            the same underlying covariate via formula transformations or interactions
            (e.g., ``"age"``, ``"log(age)"``, ``"age:squared"``), not specifically to
            categorical one-hot encoding.
            See :func:`_aggregate_statistic_by_main_covar` for more details.
        **kwargs: Forwarded to :func:`_call_on_linked` when ``on_linked_samples`` is True.
            They are not used when ``on_linked_samples`` is False.

    Raises:
        ValueError:
            If self has no target and no target is supplied.

    Returns:
        pd.DataFrame:
            If on_linked_samples is False, then only one row (index name depends on BalanceDF type, e.g.: covars),
            with kld of self vs the target.
            If on_linked_samples is True, then the per-source results of
            :func:`_call_on_linked` (excluding the target) after
            :func:`balance_util.row_pairwise_diffs` has been applied to them.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            # Create a simple example with numeric and categorical variables
            sample = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "a": (1, 2, 3, 4),
                        "c": ("x", "x", "y", "z"),
                        "w": (1, 1, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
            )

            target = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "a": (1, 2, 2, 3),
                        "c": ("x", "y", "y", "z"),
                        "w": (1, 1, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
            )

            sample_with_target = sample.set_target(target)

            # Compare only self to target without linked samples
            print(sample_with_target.covars().kld(on_linked_samples=False).round(6))
            #           a         c  mean(kld)
            # index
            # covars  0.0  0.173287   0.086643
    """
    if target is None:
        # Only build the linked children when no explicit target was given;
        # with an explicit target the lookup result would be thrown away.
        target = self._balancedf_child_from_linked_samples().get("target")

    if target is None:
        raise ValueError(
            f"Sample {object.__str__(self._sample)} has no target set, or target has no {self.__name} to compare against."
        )

    if on_linked_samples:
        return balance_util.row_pairwise_diffs(
            self._call_on_linked(
                "kld",
                exclude=("target",),
                target=target,
                aggregate_by_main_covar=aggregate_by_main_covar,
                **kwargs,
            )
        )

    # Direct comparison: turn the Series into a one-row frame indexed by
    # this BalanceDF's name (e.g. "covars").
    return (
        pd.DataFrame(self._kld_BalanceDF(self, target, aggregate_by_main_covar))
        .transpose()
        .assign(index=(self.__name,))
        .set_index("index")
    )
def emd(
    self: "BalanceDF",
    on_linked_samples: bool = True,
    target: "BalanceDF" | None = None,
    aggregate_by_main_covar: bool = False,
    **kwargs: Any,
) -> pd.DataFrame:
    """Calculate Earth Mover's Distance (EMD) to compare distributions to a target.

    See: https://en.wikipedia.org/wiki/Earth_mover%27s_distance

    Args:
        self (BalanceDF): Object from sample (with/without adjustment, but it needs some target).
        on_linked_samples (bool, optional): If to compare also to linked sample objects
            (specifically: unadjusted). If True, then uses :func:`_call_on_linked` with
            method "emd". If False, then uses :func:`_emd_BalanceDF` directly.
            Defaults to True.
        target (Optional["BalanceDF"], optional): A BalanceDF (of the same type as the one used in self)
            to compare against. If None then it looks for a target in the self linked objects.
            Defaults to None.
        aggregate_by_main_covar (bool, optional): Defaults to False.
            If True, it will make sure to return the emd DataFrame after averaging
            all the columns from using the one-hot encoding for categorical variables.
            See :func:`_aggregate_statistic_by_main_covar` for more details.
        **kwargs: Forwarded to :func:`_call_on_linked` when ``on_linked_samples`` is True.
            They are not used when ``on_linked_samples`` is False.

    Raises:
        ValueError: If self has no target and no target is supplied.

    Returns:
        pd.DataFrame:
            If on_linked_samples is False, then only one row (index name depends on BalanceDF type,
            e.g.: covars), with emd of self vs the target.
            If on_linked_samples is True, then the per-source results of
            :func:`_call_on_linked` (excluding the target) after
            :func:`balance_util.row_pairwise_diffs` has been applied to them.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "a": (1, 2, 3, 4),
                        "w": (1, 1, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            target = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "a": (2, 3, 4, 5),
                        "w": (1, 1, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            sample.set_target(target).covars().emd(on_linked_samples=False)
    """
    if target is None:
        # Only build the linked children when no explicit target was given;
        # with an explicit target the lookup result would be thrown away.
        target = self._balancedf_child_from_linked_samples().get("target")

    if target is None:
        raise ValueError(
            f"Sample {object.__str__(self._sample)} has no target set, or target has no {self.__name} to compare against."
        )

    if on_linked_samples:
        return balance_util.row_pairwise_diffs(
            self._call_on_linked(
                "emd",
                exclude=("target",),
                target=target,
                aggregate_by_main_covar=aggregate_by_main_covar,
                **kwargs,
            )
        )

    # Direct comparison: turn the Series into a one-row frame indexed by
    # this BalanceDF's name (e.g. "covars").
    return (
        pd.DataFrame(self._emd_BalanceDF(self, target, aggregate_by_main_covar))
        .transpose()
        .assign(index=(self.__name,))
        .set_index("index")
    )
def cvmd(
    self: "BalanceDF",
    on_linked_samples: bool = True,
    target: "BalanceDF" | None = None,
    aggregate_by_main_covar: bool = False,
    **kwargs: Any,
) -> pd.DataFrame:
    """Calculate Cramér-von Mises distance (CVMD) to compare distributions to a target.

    See: https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion

    Args:
        self (BalanceDF): Object from sample (with/without adjustment, but it needs some target).
        on_linked_samples (bool, optional): If to compare also to linked sample objects
            (specifically: unadjusted). If True, then uses :func:`_call_on_linked` with
            method "cvmd". If False, then uses :func:`_cvmd_BalanceDF` directly.
            Defaults to True.
        target (Optional["BalanceDF"], optional): A BalanceDF (of the same type as the one used in self)
            to compare against. If None then it looks for a target in the self linked objects.
            Defaults to None.
        aggregate_by_main_covar (bool, optional): Defaults to False.
            If True, it will make sure to return the cvmd DataFrame after averaging
            all the columns from using the one-hot encoding for categorical variables.
            See :func:`_aggregate_statistic_by_main_covar` for more details.
        **kwargs: Forwarded to :func:`_call_on_linked` when ``on_linked_samples`` is True.
            They are not used when ``on_linked_samples`` is False.

    Raises:
        ValueError: If self has no target and no target is supplied.

    Returns:
        pd.DataFrame:
            If on_linked_samples is False, then only one row (index name depends on BalanceDF type,
            e.g.: covars), with cvmd of self vs the target.
            If on_linked_samples is True, then the per-source results of
            :func:`_call_on_linked` (excluding the target) after
            :func:`balance_util.row_pairwise_diffs` has been applied to them.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "a": (1, 2, 3, 4),
                        "w": (1, 1, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            target = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "a": (2, 3, 4, 5),
                        "w": (1, 1, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            sample.set_target(target).covars().cvmd(on_linked_samples=False)
    """
    if target is None:
        # Only build the linked children when no explicit target was given;
        # with an explicit target the lookup result would be thrown away.
        target = self._balancedf_child_from_linked_samples().get("target")

    if target is None:
        raise ValueError(
            f"Sample {object.__str__(self._sample)} has no target set, or target has no {self.__name} to compare against."
        )

    if on_linked_samples:
        return balance_util.row_pairwise_diffs(
            self._call_on_linked(
                "cvmd",
                exclude=("target",),
                target=target,
                aggregate_by_main_covar=aggregate_by_main_covar,
                **kwargs,
            )
        )

    # Direct comparison: turn the Series into a one-row frame indexed by
    # this BalanceDF's name (e.g. "covars").
    return (
        pd.DataFrame(self._cvmd_BalanceDF(self, target, aggregate_by_main_covar))
        .transpose()
        .assign(index=(self.__name,))
        .set_index("index")
    )
def ks(
    self: "BalanceDF",
    on_linked_samples: bool = True,
    target: "BalanceDF" | None = None,
    aggregate_by_main_covar: bool = False,
    **kwargs: Any,
) -> pd.DataFrame:
    """Calculate Kolmogorov-Smirnov (KS) statistic to compare distributions to a target.

    See: https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test

    Args:
        self (BalanceDF): Object from sample (with/without adjustment, but it needs some target).
        on_linked_samples (bool, optional): If to compare also to linked sample objects
            (specifically: unadjusted). If True, then uses :func:`_call_on_linked` with
            method "ks". If False, then uses :func:`_ks_BalanceDF` directly.
            Defaults to True.
        target (Optional["BalanceDF"], optional): A BalanceDF (of the same type as the one used in self)
            to compare against. If None then it looks for a target in the self linked objects.
            Defaults to None.
        aggregate_by_main_covar (bool, optional): Defaults to False.
            If True, it will make sure to return the ks DataFrame after averaging
            all the columns from using the one-hot encoding for categorical variables.
            See :func:`_aggregate_statistic_by_main_covar` for more details.
        **kwargs: Forwarded to :func:`_call_on_linked` when ``on_linked_samples`` is True.
            They are not used when ``on_linked_samples`` is False.

    Raises:
        ValueError: If self has no target and no target is supplied.

    Returns:
        pd.DataFrame:
            If on_linked_samples is False, then only one row (index name depends on BalanceDF type,
            e.g.: covars), with ks of self vs the target.
            If on_linked_samples is True, then the per-source results of
            :func:`_call_on_linked` (excluding the target) after
            :func:`balance_util.row_pairwise_diffs` has been applied to them.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "a": (1, 2, 3, 4),
                        "w": (1, 1, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            target = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "a": (2, 3, 4, 5),
                        "w": (1, 1, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
            )
            sample.set_target(target).covars().ks(on_linked_samples=False)
    """
    if target is None:
        # Only build the linked children when no explicit target was given;
        # with an explicit target the lookup result would be thrown away.
        target = self._balancedf_child_from_linked_samples().get("target")

    if target is None:
        raise ValueError(
            f"Sample {object.__str__(self._sample)} has no target set, or target has no {self.__name} to compare against."
        )

    if on_linked_samples:
        return balance_util.row_pairwise_diffs(
            self._call_on_linked(
                "ks",
                exclude=("target",),
                target=target,
                aggregate_by_main_covar=aggregate_by_main_covar,
                **kwargs,
            )
        )

    # Direct comparison: turn the Series into a one-row frame indexed by
    # this BalanceDF's name (e.g. "covars").
    return (
        pd.DataFrame(self._ks_BalanceDF(self, target, aggregate_by_main_covar))
        .transpose()
        .assign(index=(self.__name,))
        .set_index("index")
    )
def asmd_improvement(
    self: "BalanceDF",
    unadjusted: "BalanceDF" | None = None,
    target: "BalanceDF" | None = None,
) -> np.float64:
    """Calculates the improvement in mean(asmd) from before to after applying some weight adjustment.

    See :func:`weighted_comparisons_stats.asmd_improvement` for details.

    Args:
        self (BalanceDF): BalanceDF (e.g.: of self after adjustment)
        unadjusted (Optional["BalanceDF"], optional): BalanceDF (e.g.: of self before adjustment).
            If None, the "unadjusted" linked object of self is used. Defaults to None.
        target (Optional["BalanceDF"], optional): To compare against.
            If None, the "target" linked object of self is used. Defaults to None.

    Raises:
        ValueError: If unadjusted is not linked in self and also not provided to the function.
        ValueError: If target is not linked in self and also not provided to the function.

    Returns:
        np.float64: The improvement is taking the (before_mean_asmd-after_mean_asmd)/before_mean_asmd.
        The asmd is calculated using :func:`asmd`.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample
            from copy import deepcopy

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )

            s2 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3),
                        "b": (4, 6, 8),
                        "id": (1, 2, 3),
                        "w": (0.5, 1, 2),
                        "c": ("x", "y", "z"),
                    }
                ),
                id_column="id",
                weight_column="w",
            )

            s3 = s1.set_target(s2)
            s3_null = s3.adjust(method="null")

            s3_null_madeup_weights = deepcopy(s3_null)
            s3_null_madeup_weights.set_weights((1, 2, 3, 1))

            s3_null.covars().asmd_improvement() # 0. since unadjusted is just a copy of self
            s3_null_madeup_weights.covars().asmd_improvement() # 0.10698596233975825

            asmd_df = s3_null_madeup_weights.covars().asmd()
            print(asmd_df["mean(asmd)"])
                # source
                # self                 2.834932
                # unadjusted           3.174566
                # unadjusted - self    0.339634
                # Name: mean(asmd), dtype: float64
            mean_asmd = asmd_df["mean(asmd)"]
            (mean_asmd.iloc[1] - mean_asmd.iloc[0]) / mean_asmd.iloc[1] # 0.10698596233975825
            # just like asmd_improvement
    """
    # Build the linked children at most once, and only when at least one of
    # the two comparison objects has to come from the linked samples.
    if unadjusted is None or target is None:
        linked_children = self._balancedf_child_from_linked_samples()
    else:
        linked_children = {}

    if unadjusted is None:
        unadjusted = linked_children.get("unadjusted")
        if unadjusted is None:
            raise ValueError(
                f"Sample {object.__repr__(self._sample)} has no unadjusted set or unadjusted has no {self.__name}."
            )

    if target is None:
        target = linked_children.get("target")
        if target is None:
            raise ValueError(
                f"Sample {object.__repr__(self._sample)} has no target set or target has no {self.__name}."
            )

    # All three objects go through the default (model matrix) extraction.
    sample_before_df, sample_before_weights = unadjusted._get_df_and_weights()
    sample_after_df, sample_after_weights = self._get_df_and_weights()
    target_df, target_weights = target._get_df_and_weights()

    return weighted_comparisons_stats.asmd_improvement(
        sample_before=sample_before_df,
        sample_after=sample_after_df,
        target=target_df,
        sample_before_weights=sample_before_weights,
        sample_after_weights=sample_after_weights,
        target_weights=target_weights,
    )
def _df_with_ids(self: "BalanceDF") -> pd.DataFrame:
    """Return this view's data frame with the id column placed first.

    Args:
        self (BalanceDF): Object.

    Returns:
        pd.DataFrame: ``self._sample.id_series`` followed by the columns of
        ``self.df``, concatenated column-wise.
    """
    id_column = self._sample.id_series
    return pd.concat([id_column, self.df], axis=1)
def to_csv(
    self: "BalanceDF",
    path_or_buf: FilePathOrBuffer | None = None,
    *args: Any,
    **kwargs: Any,
) -> str | None:
    """Write the data frame, ids included, as comma-separated values (csv).

    The frame produced by ``_df_with_ids()`` is passed, together with every
    extra positional and keyword argument, to ``to_csv_with_defaults``
    (built on :func:`pd.DataFrame.to_csv`). If an 'index' argument is not
    provided then it defaults to False.

    Args:
        self (BalanceDF): Object.
        path_or_buf (Optional[FilePathOrBuffer], optional): location where to
            save the csv.

    Returns:
        Optional[str]: If path_or_buf is None, returns the resulting csv
        format as a string. Otherwise returns None.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["1", "2"], "x": [0, 1], "weight": [1.0, 2.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            csv_text = sample.covars().to_csv()
            "id" in csv_text
                # True
    """
    frame_with_ids = self._df_with_ids()
    return to_csv_with_defaults(frame_with_ids, path_or_buf, *args, **kwargs)
class BalanceDFOutcomes(BalanceDF):
    def __init__(
        self: "BalanceDFOutcomes",
        sample: BalanceDFSource,
        links: dict[str, BalanceDFSource] | None = None,
    ) -> None:
        """Build the outcomes view of ``sample``.

        This is used through :func:`Sample.outcomes`. The outcome columns of
        ``sample`` are forwarded to :func:`BalanceDF.__init__` under the name
        ``"outcomes"``.

        Args:
            self (BalanceDFOutcomes): Object that is initiated.
            sample (BalanceDFSource): A BalanceDFSource-compatible object
                (e.g. Sample, SampleFrame).
            links (Dict | None): Optional explicit links for BalanceDF.

        Raises:
            ValueError: If ``sample._outcome_columns`` is None, i.e. no
                outcome columns are defined on ``sample``.
        """
        outcomes = sample._outcome_columns
        if outcomes is None:
            raise ValueError(
                f"Cannot create BalanceDFOutcomes: no outcome columns are defined "
                f"on the provided {type(sample).__name__}. Configure the underlying object "
                "with outcome_columns= before constructing or accessing outcomes."
            )
        super().__init__(outcomes, sample, name="outcomes", links=links)

    # TODO: add the `relative_to` argument (with options 'self' and 'target')
    #       this will also require to update _relative_response_rates a bit.
def relative_response_rates(
    self: "BalanceDFOutcomes",
    target: bool | pd.DataFrame = False,
    per_column: bool = False,
) -> pd.DataFrame | None:
    """Produces a summary table of number of responses and proportion of completed responses.

    See :func:`general_stats.relative_response_rates`.

    Args:
        self (BalanceDFOutcomes): Object
        target (Union[bool, pd.DataFrame], optional): Defaults to False.
            Determines what is passed to df_target in
            :func:`general_stats.relative_response_rates`.
            If False: passes None.
            If True: passes the ``.df`` of the ``"target"`` entry returned by
            ``self._balancedf_child_from_linked_samples()``.
            If pd.DataFrame: passes it as is, which gives full control over
            which rows count as the reference.
            Boolean flags are recognised with ``isinstance``, so a
            ``numpy.bool_`` is treated like a Python ``bool`` rather than
            being forwarded as if it were a DataFrame.
        per_column (bool, optional): Default is False.
            See :func:`general_stats.relative_response_rates`.

    Returns:
        Optional[pd.DataFrame]: A column per outcome, and two rows.
        One row with number of non-null observations, and a second row with
        the proportion of non-null observations.
        If 'target' is set to True but there is no target, a warning is
        logged and the function returns None.

    Examples:
        .. code-block:: python

            import numpy as np
            import pandas as pd
            from balance.sample_class import Sample

            s_o = Sample.from_frame(
                pd.DataFrame({"o1": (7, 8, 9, 10), "o2": (7, 8, 9, np.nan), "id": (1, 2, 3, 4)}),
                id_column="id",
                outcome_columns=("o1", "o2"),
            )
            print(s_o.outcomes().relative_response_rates())
                #       o1    o2
                # n    4.0   3.0
                # %  100.0  75.0

            s_o.outcomes().relative_response_rates(target = True)
                # None

            # compared with a larger target
            t_o = Sample.from_frame(
                pd.DataFrame(
                    {
                        "o1": (7, 8, 9, 10, 11, 12, 13, 14),
                        "o2": (7, 8, 9, np.nan, np.nan, 12, 13, 14),
                        "id": (1, 2, 3, 4, 5, 6, 7, 8),
                    }
                ),
                id_column="id",
                outcome_columns=("o1", "o2"),
            )
            s_o2 = s_o.set_target(t_o)
            print(s_o2.outcomes().relative_response_rates(True, per_column = True))
                #      o1    o2
                # n   4.0   3.0
                # %  50.0  50.0

            df_target = pd.DataFrame(
                {
                    "o1": (7, 8, 9, 10, 11, 12, 13, 14),
                    "o2": (7, 8, 9, np.nan, np.nan, 12, 13, 14),
                }
            )
            print(s_o2.outcomes().relative_response_rates(target = df_target, per_column = True))
                #      o1    o2
                # n   4.0   3.0
                # %  50.0  50.0
    """
    # isinstance (rather than an exact ``type(...) is bool`` comparison) so
    # that NumPy boolean scalars are also handled as flags.
    if isinstance(target, (bool, np.bool_)):
        if target:
            # Resolve the reference frame from the linked target view.
            self_target = self._balancedf_child_from_linked_samples().get("target")
            if self_target is None:
                logger.warning("Sample does not have target set")
                return None
            df_target = self_target.df
        else:
            df_target = None
    else:
        # An explicit frame was supplied; use it untouched.
        df_target = target

    return general_stats.relative_response_rates(
        self.df, df_target, per_column=per_column
    )
def target_response_rates(self: "BalanceDFOutcomes") -> pd.DataFrame | None:
    """Compute response rates of the linked target's outcomes.

    Looks up the ``"target"`` entry of the linked views and passes its
    ``.df`` to :func:`general_stats.relative_response_rates`.

    Args:
        self (BalanceDFOutcomes): Object (with/without a target set)

    Returns:
        Optional[pd.DataFrame]: None (after logging a warning) if the object
        doesn't have a target. Otherwise the output of
        :func:`general_stats.relative_response_rates` for the target.

    Examples:
        .. code-block:: python

            import numpy as np
            import pandas as pd
            from balance.sample_class import Sample

            s_o = Sample.from_frame(
                pd.DataFrame({"o1": (7, 8, 9, 10), "o2": (7, 8, 9, np.nan), "id": (1, 2, 3, 4)}),
                id_column="id",
                outcome_columns=("o1", "o2"),
            )
            t_o = Sample.from_frame(
                pd.DataFrame(
                    {
                        "o1": (7, 8, 9, 10, 11, 12, 13, 14),
                        "o2": (7, 8, 9, np.nan, 11, 12, 13, 14),
                        "id": (1, 2, 3, 4, 5, 6, 7, 8),
                    }
                ),
                id_column="id",
                outcome_columns=("o1", "o2"),
            )
            s_o = s_o.set_target(t_o)
            print(s_o.outcomes().target_response_rates())
                #       o1    o2
                # n    8.0   7.0
                # %  100.0  87.5
    """
    target_view = self._balancedf_child_from_linked_samples().get("target")
    if target_view is not None:
        return general_stats.relative_response_rates(target_view.df)
    logger.warning("Sample does not have target set")
    return None
def weights_impact_on_outcome_ss(
    self: "BalanceDFOutcomes",
    method: str = "t_test",
    conf_level: float = 0.95,
    round_ndigits: int | None = 3,
    w0: npt.NDArray | pd.Series | None = None,
    w1: npt.NDArray | pd.Series | None = None,
) -> pd.DataFrame | None:
    """Compare weighted outcomes using paired tests on y*w0 vs y*w1.

    Every column of ``self.model_matrix()`` is evaluated separately with
    :func:`impact_of_weights_on_outcome.weights_impact_on_outcome_ss`, and
    the per-column results are stacked into one table.

    Args:
        self (BalanceDFOutcomes): Object.
        method (str, optional): Statistical test to use. Defaults to "t_test".
        conf_level (float, optional): Confidence level for the mean difference
            interval. Defaults to 0.95.
        round_ndigits (int | None, optional): Number of digits the numeric
            columns are rounded to; None disables rounding. Defaults to 3.
        w0: Baseline weights. Defaults to a vector of ones shaped like ``w1``.
        w1: Alternative weights. Defaults to the BalanceDF weights.

    Returns:
        Optional[pd.DataFrame]: Outcome-by-statistic table (index named
        ``"outcome"``), or None, after a logged warning, if ``w1`` is omitted
        and the object has no weights.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": (1, 2, 3, 4),
                        "weight": (1.0, 2.0, 1.0, 2.0),
                        "outcome": (1.0, 2.0, 3.0, 4.0),
                    }
                ),
                id_column="id",
                weight_column="weight",
                outcome_columns=("outcome",),
                standardize_types=False,
            )
            impact = sample.outcomes().weights_impact_on_outcome_ss(
                method="t_test", round_ndigits=3
            )
            print(impact.to_string())

        .. code-block:: text

                     mean_yw0  mean_yw1  mean_diff  diff_ci_lower  diff_ci_upper  t_stat  p_value    n
            outcome
            outcome       2.5       4.0        1.5         -1.547          4.547   1.567    0.215  4.0
    """
    alternative = w1
    if alternative is None:
        alternative = self._weights
        if alternative is None:
            logger.warning("No weights available for outcome impact analysis.")
            return None

    # A Series is unwrapped as-is; anything else is coerced to a float array.
    if isinstance(alternative, pd.Series):
        w1_values = alternative.to_numpy()
    else:
        w1_values = np.asarray(alternative, dtype=float)

    if w0 is None:
        w0_values = np.ones_like(w1_values, dtype=float)
    elif isinstance(w0, pd.Series):
        w0_values = w0.to_numpy()
    else:
        w0_values = np.asarray(w0, dtype=float)

    design = self.model_matrix()
    per_outcome = {
        name: impact_of_weights_on_outcome.weights_impact_on_outcome_ss(
            y=design[name].to_numpy(),
            w0=w0_values,
            w1=w1_values,
            method=method,
            conf_level=conf_level,
        )
        for name in design.columns
    }

    # One row per outcome column, one column per statistic.
    table = pd.DataFrame(per_outcome).T
    table.index.name = "outcome"
    if round_ndigits is not None:
        numeric = table.select_dtypes(include=["number"]).columns
        table[numeric] = table[numeric].round(round_ndigits)
    return table
# TODO: decide whether summary should produce a printable string or a DataFrame.
#       The BalanceDF.summary method only returns a DataFrame, so it is an open
#       question what the best way to structure this more generally is.
def summary(
    self: "BalanceDFOutcomes",
    on_linked_samples: bool | None = None,
    weights_impact_method: str | None = "t_test",
    weights_impact_conf_level: float = 0.95,
) -> str:
    """Produces summary printable string of a BalanceDFOutcomes object.

    The text is assembled, in order, from: the outcome count and names, the
    table from ``mean_with_ci()``, an optional weights-impact table, the
    response rates within the sample and, only when a target is linked, the
    response rates relative to the target and within the target.

    Args:
        self (BalanceDFOutcomes): Object.
        on_linked_samples (Optional[bool]): Ignored. Only here since summary
            overrides BalanceDF.summary.
        weights_impact_method (Optional[str]): If provided, include a paired
            test of y*w0 vs y*w1 for each outcome (default is "t_test").
            Pass None to omit that section.
        weights_impact_conf_level (float): Confidence level for the mean
            difference interval when weights_impact_method is provided.
            Defaults to 0.95.

    Returns:
        str: A printable string, with mean of outcome variables and response
        rates.

    Examples:
        .. code-block:: python

            import numpy as np
            import pandas as pd
            from balance.sample_class import Sample

            s_o = Sample.from_frame(
                pd.DataFrame({"o1": (7, 8, 9, 10), "o2": (7, 8, 9, np.nan), "id": (1, 2, 3, 4)}),
                id_column="id",
                outcome_columns=("o1", "o2"),
            )
            print(s_o.outcomes().summary())
                # 2 outcomes: ['o1' 'o2']
                # Mean outcomes (with 95% confidence intervals):
                # source            self             self
                # _is_na_o2[False]  0.75   (0.326, 1.174)
                # _is_na_o2[True]   0.25  (-0.174, 0.674)
                # o1                8.50   (7.404, 9.596)
                # o2                6.00   (2.535, 9.465)

                # Weights impact on outcomes (t_test):
                #                   mean_yw0  mean_yw1  mean_diff  diff_ci_lower  diff_ci_upper  t_stat  p_value    n
                # outcome
                # _is_na_o2[False]      0.75      0.75        0.0            0.0            0.0     NaN      NaN  4.0
                # _is_na_o2[True]       0.25      0.25        0.0            0.0            0.0     NaN      NaN  4.0
                # o1                    8.50      8.50        0.0            0.0            0.0     NaN      NaN  4.0
                # o2                    6.00      6.00        0.0            0.0            0.0     NaN      NaN  4.0

                # Response rates (relative to number of respondents in sample):
                #       o1    o2
                # n    4.0   3.0
                # %  100.0  75.0

            # With a target linked (``s_o.set_target(t_o)``) two more sections
            # are appended:
                # Response rates (relative to notnull rows in the target):
                #             o1    o2
                # n   4.000000   3.0
                # %  66.666667  50.0
                # Response rates (in the target):
                #        o1    o2
                # n    8.0   6.0
                # %  100.0  75.0
    """
    mean_outcomes_with_ci = self.mean_with_ci()
    relative_response_rates = self.relative_response_rates()
    target_response_rates = self.target_response_rates()

    # Optional section: stays empty when disabled or when no weights exist.
    weights_impact_clause = ""
    if weights_impact_method is not None:
        weights_impact = self.weights_impact_on_outcome_ss(
            method=weights_impact_method,
            conf_level=weights_impact_conf_level,
        )
        if weights_impact is not None:
            weights_impact_clause = (
                "Weights impact on outcomes "
                f"({weights_impact_method}):\n"
                f"{weights_impact.to_string(max_cols=None)}\n\n"
            )

    # Target-related sections exist only when a target is linked.
    if target_response_rates is None:
        target_clause = ""
        relative_to_target_clause = ""
    else:
        relative_to_target_response_rates = self.relative_response_rates(
            target=True, per_column=False
        )
        relative_to_target_clause = f"Response rates (relative to notnull rows in the target):\n {relative_to_target_response_rates}"
        target_clause = f"Response rates (in the target):\n {target_response_rates}"

    n_outcomes = self.df.shape[1]
    list_outcomes = np.array(self.df.columns, dtype=object)

    return (
        f"{n_outcomes} outcomes: {list_outcomes}\n"
        f"Mean outcomes (with 95% confidence intervals):\n"
        # TODO: in the future consider if to add an argument to transpose (.T) the output, in case there are multiple outcomes.
        f"{mean_outcomes_with_ci.to_string(max_cols=None)}\n\n"
        f"{weights_impact_clause}"
        "Response rates (relative to number of respondents in sample):\n"
        f"{relative_response_rates}\n"
        f"{relative_to_target_clause}\n"
        f"{target_clause}\n"
    )
def outcome_sd_prop(self: "BalanceDFOutcomes") -> pd.Series:
    """Relative change in outcome weighted SD after adjustment.

    Computes (weighted SD of adjusted - weighted SD of unadjusted) /
    weighted SD of unadjusted, reading both rows (``"self"`` and
    ``"unadjusted"``) from the table returned by ``self.std()``.

    Returns:
        pd.Series: Relative difference in outcome weighted standard deviation.

    Raises:
        ValueError: If the ``"unadjusted"`` row is absent or entirely NaN,
            i.e. there are no unadjusted outcomes linked.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": ["1", "2"],
                        "x": [0, 1],
                        "weight": [1.0, 2.0],
                        "y": [0.1, 0.2],
                    }
                ),
                id_column="id",
                weight_column="weight",
                outcome_columns="y",
                standardize_types=False,
            )
            target = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["3", "4"], "x": [0, 1], "weight": [1.0, 1.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            adjusted = sample.set_target(target).adjust(method="null")
            adjusted.outcomes().outcome_sd_prop()
    """
    sd_by_source = self.std()
    sd_adjusted = sd_by_source.loc["self"]

    # reindex yields an all-NaN row when "unadjusted" is missing, so this one
    # check covers both "row absent" and "row present but empty".
    if sd_by_source.reindex(["unadjusted"]).isna().all(axis=None):
        raise ValueError(
            "No unadjusted outcomes available. This requires an adjusted sample."
        )

    sd_unadjusted = sd_by_source.loc["unadjusted"]
    return (sd_adjusted - sd_unadjusted) / sd_unadjusted
def outcome_variance_ratio(self: "BalanceDFOutcomes") -> pd.Series:
    """Ratio of outcome variance (adjusted / unadjusted).

    Passes the outcome frames of ``self`` and of the linked ``"unadjusted"``
    view, plus the raw ``.values`` of their weights (None where a view has
    no weights), to the module-level :func:`outcome_variance_ratio` helper.

    Returns:
        pd.Series: A series of calculated ratio of variances for each outcome.

    Raises:
        ValueError: If there are no unadjusted outcomes linked.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": ["1", "2"],
                        "x": [0, 1],
                        "weight": [1.0, 2.0],
                        "y": [0.1, 0.2],
                    }
                ),
                id_column="id",
                weight_column="weight",
                outcome_columns="y",
                standardize_types=False,
            )
            target = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["3", "4"], "x": [0, 1], "weight": [1.0, 1.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            adjusted = sample.set_target(target).adjust(method="null")
            adjusted.outcomes().outcome_variance_ratio()
    """
    unadjusted_view = self._balancedf_child_from_linked_samples().get("unadjusted")
    if unadjusted_view is None:
        raise ValueError(
            "No unadjusted outcomes available. This requires an adjusted sample."
        )

    own_w = self._weights
    if own_w is None:
        own_values = None
    else:
        own_values = own_w.values

    base_w = unadjusted_view._weights
    if base_w is None:
        base_values = None
    else:
        base_values = base_w.values

    return _outcome_variance_ratio(
        self.df,
        unadjusted_view.df,
        own_values,
        base_values,
    )
class BalanceDFCovars(BalanceDF):
    def __init__(
        self: "BalanceDFCovars",
        sample: BalanceDFSource,
        links: dict[str, BalanceDFSource] | None = None,
        formula: str | list[str] | None = None,
    ) -> None:
        """Build the covariates view of ``sample``.

        This is used through :func:`Sample.covars`. The frame returned by
        ``sample._covar_columns()`` is forwarded to
        :func:`BalanceDF.__init__` under the name ``"covars"``, and the
        formula is stored on the instance afterwards.

        Args:
            self (BalanceDFCovars): Object that is initiated.
            sample (BalanceDFSource): A BalanceDFSource-compatible object
                (e.g. Sample, SampleFrame).
            links (Dict | None): Optional explicit links for BalanceDF.
            formula (str | list[str] | None, optional): Optional formula to
                use as the default when constructing model matrices for this
                object.
        """
        covariates = sample._covar_columns()
        super().__init__(covariates, sample, name="covars", links=links)
        # Remembered so model matrices and linked views can default to it.
        self._formula: str | list[str] | None = formula
def model_matrix(
    self: "BalanceDFCovars", formula: str | list[str] | None = None
) -> pd.DataFrame:
    """Return a model matrix, falling back to the construction-time formula.

    Args:
        self (BalanceDFCovars): Object.
        formula (str | list[str] | None, optional): Formula to use for this
            call. When None, the formula stored on the object (``_formula``)
            is used instead.

    Returns:
        pd.DataFrame: The result of the parent class's ``model_matrix``
        called with the chosen formula.
    """
    chosen_formula = formula if formula is not None else self._formula
    return super().model_matrix(formula=chosen_formula)
def _linked_child_kwargs(self: "BalanceDFCovars") -> dict[str, Any]:
    """Return the keyword arguments that carry the formula to linked views.

    Returns:
        dict[str, Any]: ``{"formula": self._formula}`` when a formula is set,
        otherwise an empty dict.
    """
    return {} if self._formula is None else {"formula": self._formula}


def _kld_formula(self: "BalanceDFCovars") -> str | list[str] | None:
    """Return the formula to use for KLD when comparing covariates.

    Returns:
        str | list[str] | None: The formula stored on this object (may be
        None).
    """
    stored_formula = self._formula
    return stored_formula
@classmethod
def from_frame(
    cls: type["BalanceDFCovars"],
    df: pd.DataFrame,
    weights: pd.Series | None = None,
    formula: str | list[str] | None = None,
) -> "BalanceDFCovars":
    """A factory function to create a BalanceDFCovars from a df.

    Although generally the main way the object is created is through the
    __init__ method, this is useful when a BalanceDFCovars object is needed
    directly from a DataFrame. The frame's index is reset into regular
    column(s), a sequential ``"id"`` column (0..n-1) is appended, and the
    result is wrapped in a ``Sample`` whose ``covars()`` view is returned.

    Args:
        cls (type[BalanceDFCovars]): Class object.
        df (pd.DataFrame): A df.
        weights (Optional[pd.Series], optional): Weights to append as an
            extra column. If the series carries the same index as ``df``, it
            is aligned to the rows positionally; otherwise it is aligned by
            index label against the reset (0..n-1) index. No weight column
            name is passed on explicitly, so the series name presumably has
            to be one that ``Sample.from_frame`` recognises as weights --
            TODO confirm. Defaults to None.
        formula (str | list[str] | None, optional): Optional formula to set
            on the returned ``BalanceDFCovars`` object.

    Returns:
        BalanceDFCovars: Object.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.balancedf_class import BalanceDFCovars

            covars = BalanceDFCovars.from_frame(pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
            covars.df.columns.tolist()
                # ['index', 'a', 'b']
    """
    from balance.sample_class import (  # lazy import to avoid circular dependency
        Sample,
    )

    original_index = df.index
    df = df.reset_index()
    concat_list: list[pd.DataFrame | pd.Series] = [
        df,
        pd.Series(np.arange(0, df.shape[0]), name="id"),
    ]
    if weights is not None:
        # ``pd.concat(axis=1)`` aligns on index labels. After the reset above,
        # weights that still carry df's original (non-default) labels would
        # no longer line up with the rows, so re-number them the same way.
        if weights.index.equals(original_index):
            weights = weights.reset_index(drop=True)
        concat_list.append(weights)
    df = pd.concat(concat_list, axis=1)

    return Sample.from_frame(df, id_column="id").covars(formula=formula)
# Single source of truth for the supported love-plot metrics: the # ``Literal`` type drives the static (Pyre) check, and the runtime # tuple is derived from it via ``typing.get_args`` so the two cannot # drift. LovePlotMetric = Literal["asmd", "kld", "emd", "cvmd", "ks"] _LOVE_PLOT_METRICS: tuple[str, ...] = get_args(LovePlotMetric)
def love_plot(
    self: "BalanceDFCovars",
    *,
    metric: LovePlotMetric = "asmd",
    threshold: float | None = None,
    ax: Any | None = None,
    library: _love_plot_module.LovePlotLibrary = "plotly",
    line: bool = True,
    order_by: _love_plot_module.LovePlotOrderBy = "diff",
    show: bool = False,
    **kwargs: Any,
) -> Any:
    """Side-by-side imbalance scatter of unadjusted vs. adjusted covariates.

    A "Love plot" (after Thomas Love) is the canonical visual for showing
    how much each covariate's imbalance shrinks after applying weights.
    Reference: R's ``cobalt::love.plot``.

    Behaviour mirrors :meth:`asmd` (the chosen ``metric`` in general) rather
    than :meth:`asmd_improvement`: when no unadjusted view is linked
    (pre-adjust diagnostic case) the current weighted metric is plotted as a
    single series; when an unadjusted view is linked (post-adjust) the plot
    shows before-vs-after.

    Args:
        self (BalanceDFCovars): The covariates view to plot. Typically
            obtained via ``sample.covars()`` after ``adjust()``.
        metric (str, optional): Which imbalance metric to plot. One of
            ``"asmd"`` (default), ``"kld"``, ``"emd"``, ``"cvmd"`` or
            ``"ks"``. The method of the same name on this object is looked
            up and called to compute the values.
        threshold (float | None, optional): Vertical reference line at
            ``+threshold``. ``None`` (the default) means "use a per-metric
            default": ``0.1`` for ASMD (the cobalt-convention "balance
            achieved" cutoff) and ``None`` (no line) for the other metrics.
            Pass an explicit float to override.
        ax (Any | None, optional): Optional matplotlib ``Axes`` to draw into
            for ``library="seaborn"``.
        library (str, optional): Plotting backend: ``"plotly"`` (default),
            ``"seaborn"`` or ``"balance"`` (ASCII text).
        line (bool, optional): If ``True`` (the default) and an unadjusted
            view is available, connect each unweighted/weighted covariate
            pair with a horizontal line.
        order_by (str, optional): Sort covariates by ``"diff"`` (default),
            ``"before"``, ``"after"``, ``"alphabetical"``, or ``"none"``.
        show (bool, optional): For ``library="plotly"``, whether to call
            ``fig.show()`` before returning the figure.
        **kwargs: Forwarded to the underlying
            :func:`balance.stats_and_plots.love_plot.love_plot` primitive.

    Returns:
        matplotlib.axes.Axes | plotly.graph_objects.Figure | str: Whatever
        the primitive returns for the chosen ``library``. The ``Any``
        annotation avoids eager plotting imports in this module's type
        signature.

    Raises:
        ValueError: If ``metric`` is not one of the supported names. Errors
            raised by the metric methods (e.g. no target set) or by the
            primitive (e.g. negative ``threshold``) propagate unchanged.
    """
    if metric not in self._LOVE_PLOT_METRICS:
        raise ValueError(
            f"metric must be one of {self._LOVE_PLOT_METRICS}; got "
            f"{metric!r}."
        )

    # Only ASMD has a canonical cutoff (0.1); for every other metric no
    # reference line is drawn unless the caller supplies one.
    threshold_resolved: float | None
    if threshold is not None:
        threshold_resolved = threshold
    elif metric == "asmd":
        threshold_resolved = 0.1
    else:
        threshold_resolved = None

    xlabel: str = metric.upper()

    # The metric methods are named after the metric, so one ``getattr``
    # dispatches to the right one.
    after_series: pd.Series = getattr(self, metric)(on_linked_samples=False).iloc[0]

    linked = self._balancedf_child_from_linked_samples()
    unadjusted = linked.get("unadjusted")

    if unadjusted is None:
        # Pre-adjust diagnostic: no "before" exists, so the current series is
        # passed as ``before`` with ``after=None`` and the primitive's
        # single-series branch handles it.
        before_arg, after_arg = after_series, None
    else:
        # The unadjusted view's own links do not include the target (it is a
        # sibling, not a child), so the target from ``self``'s links is
        # forwarded explicitly.
        before_arg = getattr(unadjusted, metric)(
            on_linked_samples=False, target=linked.get("target")
        ).iloc[0]
        after_arg = after_series

    return _love_plot_module.love_plot(
        before=before_arg,
        after=after_arg,
        xlabel=xlabel,
        threshold=threshold_resolved,
        ax=ax,
        library=library,
        line=line,
        order_by=order_by,
        show=show,
        **kwargs,
    )
def plot(
    self: "BalanceDFCovars", on_linked_samples: bool = True, **kwargs: Any
) -> Any:
    """Plot covariates, including ``dist_type="love_plot"`` diagnostics.

    By default the call is delegated to the parent class's ``plot`` and
    draws covariate distributions. Passing ``dist_type="love_plot"`` (or the
    ``"love"`` alias) dispatches to :meth:`love_plot` instead, so calls such
    as ``adjusted.covars().plot(dist_type="love_plot", library="plotly")``
    return the covariate-balance scatter directly.

    Args:
        self (BalanceDFCovars): The covariates view to plot.
        on_linked_samples (bool, optional): Used by distribution plots. Love
            plots ignore this argument.
        **kwargs: Passed to the parent ``plot`` for distribution plots or to
            :meth:`love_plot` for love plots. On the love-plot path
            ``dist_type`` and ``return_dict_of_figures`` are consumed here,
            and ``plot_it`` is translated to ``show`` for
            ``library="plotly"`` (an explicit ``show`` wins).

    Returns:
        Any: The parent's result for distribution plots. For love plots, the
        object returned by :meth:`love_plot`, wrapped as
        ``{"love_plot": result}`` when ``return_dict_of_figures=True`` and
        the library is plotly.

    Raises:
        TypeError: If ``return_dict_of_figures`` is not a bool on the
            love-plot path.
    """
    if kwargs.get("dist_type") not in ("love", "love_plot"):
        return super().plot(on_linked_samples=on_linked_samples, **kwargs)

    kwargs.pop("dist_type", None)
    wants_dict = kwargs.pop("return_dict_of_figures", False)
    if not isinstance(wants_dict, bool):
        raise TypeError(
            "return_dict_of_figures must be a bool when using "
            "dist_type='love_plot'."
        )

    # ``plot_it`` is a distribution-plot kwarg; for love plots the equivalent
    # is ``show`` (only meaningful for ``library="plotly"``). It is removed
    # here so it is never forwarded on to the love-plot primitive.
    if "plot_it" in kwargs:
        plot_it = kwargs.pop("plot_it")
        library = kwargs.get("library", "plotly")
        if library == "plotly":
            kwargs.setdefault("show", plot_it)
        else:
            logger.warning(
                "plot_it is only meaningful for library='plotly' "
                "love plots; ignoring for library=%r.",
                library,
            )

    figure = self.love_plot(**kwargs)
    if not wants_dict:
        return figure
    if kwargs.get("library", "plotly") == "plotly":
        return {"love_plot": figure}
    logger.warning(
        "return_dict_of_figures=True is only supported for "
        "library='plotly' love plots; returning the plot object directly."
    )
    return figure
class BalanceDFWeights(BalanceDF):
    def __init__(
        self: "BalanceDFWeights",
        sample: BalanceDFSource,
        links: dict[str, BalanceDFSource] | None = None,
    ) -> None:
        """Build the weights view of ``sample``.

        This is used through :func:`Sample.weights`. The sample's weight
        series, converted to a one-column frame, is forwarded to
        :func:`BalanceDF.__init__` under the name ``"weights"``.

        Args:
            self (BalanceDFWeights): Object that is initiated.
            sample (BalanceDFSource): A BalanceDFSource-compatible object
                (e.g. Sample, SampleFrame).
            links (Dict | None): Optional explicit links for BalanceDF.
        """
        weights_frame = sample.weight_series.to_frame()
        super().__init__(weights_frame, sample, name="weights", links=links)

    @property
    def df(self: "BalanceDFWeights") -> pd.DataFrame:
        """Return the current weight column as a DataFrame.

        The frame is rebuilt from ``self._sample.weight_series`` on every
        access rather than taken from the frame passed at construction.

        Args:
            self (BalanceDFWeights): The BalanceDFWeights instance.

        Returns:
            pd.DataFrame: DataFrame containing the current weight column.
        """
        current_weights = self._sample.weight_series
        return current_weights.to_frame()

    # TODO: maybe add better control if there are no weights for unadjusted or target (the current default shows them in the legend, but not in the figure)
def plot(
    self: "BalanceDFWeights", on_linked_samples: bool = True, **kwargs: Any
) -> list[Any] | npt.NDArray[Any] | dict[str, Figure] | str | None:
    """Plot the distribution of the weights (seaborn kde by default).

    Delegates to the parent class's ``plot`` with weight-specific defaults:
    ``weighted=False``, ``library="seaborn"``, ``dist_type="kde"`` and
    ``numeric_n_values_threshold=-1``. Any of these can be overridden through
    ``kwargs``; e.g. ``dist_type`` also accepts "hist", "qq" and "ecdf".
    Look at :func:`plot_dist` for more details.

    Args:
        self (BalanceDFWeights): a BalanceDFWeights object.
        on_linked_samples (bool, optional): Determines if the linked samples
            should be included in the plot. Defaults to True.
        **kwargs: Extra plotting options; they take precedence over the
            defaults listed above.

    Returns:
        list | np.ndarray | dict[str, Figure] | None:
        If library="plotly" then returns a dictionary containing plots if
        return_dict_of_figures is True. None otherwise.
        If library="seaborn" then returns None, unless return_axes is True.
        Then either a list or an np.array of matplotlib axis.

    Examples:
        .. code-block:: python

            import numpy as np
            import pandas as pd
            from numpy import random
            from balance.sample_class import Sample

            random.seed(96483)

            df = pd.DataFrame({
                "id": range(100),
                'v1': random.randint(11111, 11115, size=100).astype(str),
                'v2': random.normal(size = 100),
                'v3': random.uniform(size = 100),
                "w": pd.Series(np.ones(99).tolist() + [1000]),
            }).sort_values(by=['v2'])

            s1 = Sample.from_frame(df,
                id_column="id",
                weight_column="w",
                outcome_columns=["v1", "v2"],
            )

            s2 = Sample.from_frame(
                df.assign(w = pd.Series(np.ones(100))),
                id_column="id",
                weight_column="w",
                outcome_columns=["v1", "v2"],
            )

            s3 = s1.set_target(s2)
            s3_null = s3.adjust(method="null")
            s3_null.set_weights(random.random(size = 100) + 0.5)

            # default: seaborn with dist_type = "kde"
            s3_null.weights().plot()
    """
    # Caller-supplied options come last so they override the defaults.
    plot_kwargs = {
        "weighted": False,
        "library": "seaborn",
        "dist_type": "kde",
        "numeric_n_values_threshold": -1,
        **kwargs,
    }
    return super().plot(on_linked_samples=on_linked_samples, **plot_kwargs)
def design_effect(self: "BalanceDFWeights") -> np.float64:
    """Calculates Kish's design effect (deff) on the BalanceDFWeights weights.

    The first column of ``self.df`` is extracted as a pd.Series of weights
    and passed to :func:`weights_stats.design_effect`; see it for details.

    Args:
        self (BalanceDFWeights): Object.

    Returns:
        np.float64: Deff.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["1", "2"], "weight": [1.0, 2.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            round(sample.weights().design_effect(), 3)
                # 1.111
    """
    weight_column = self.df.iloc[:, 0]
    return weights_stats.design_effect(weight_column)
def design_effect_prop(self: "BalanceDFWeights") -> np.float64:
    """Relative change in design effect: (Deff_adjusted - Deff_unadjusted) / Deff_unadjusted.

    The adjusted design effect comes from ``self`` and the unadjusted one
    from the linked ``"unadjusted"`` weights view.

    Returns:
        np.float64: Relative difference in design effect.

    Raises:
        ValueError: If there are no unadjusted weights linked.
        TypeError: If the linked unadjusted view is not a BalanceDFWeights.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {
                        "id": ["1", "2"],
                        "x": [0, 1],
                        "weight": [1.0, 2.0],
                    }
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            target = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["3", "4"], "x": [0, 1], "weight": [1.0, 1.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            adjusted = sample.set_target(target).adjust(method="null")
            adjusted.weights().design_effect_prop()
    """
    unadjusted_view = self._balancedf_child_from_linked_samples().get("unadjusted")
    if unadjusted_view is None:
        raise ValueError(
            "No unadjusted weights available. This requires an adjusted sample."
        )
    if not isinstance(unadjusted_view, BalanceDFWeights):
        raise TypeError("Expected BalanceDFWeights for unadjusted weights.")

    deff_after = self.design_effect()
    deff_before = unadjusted_view.design_effect()
    return (deff_after - deff_before) / deff_before
def r_indicator(
    self: "BalanceDFWeights",
    target_propensity: float | npt.ArrayLike | None = None,
) -> np.float64:
    """Approximate the R-indicator, treating inverse weights as propensities.

    Thin convenience layer over
    :func:`weighted_comparisons_stats.r_indicator`. The sample's response
    propensities are taken to be the reciprocals of the current weights
    (the first column of ``self.df``). When the largest reciprocal is above
    1, all reciprocals are divided by that maximum so they stay within
    ``[0, 1]``.

    When ``target_propensity`` is omitted, a linked target is required and
    every target unit is given propensity 1.

    Args:
        self (BalanceDFWeights): Object.
        target_propensity (float | npt.ArrayLike | None, optional): Target
            response propensities to compare against. ``None`` means a
            vector of ones with one entry per linked-target row. A scalar
            is broadcast to the linked target's row count. An array-like is
            forwarded unchanged; when a linked target exists, its length
            must equal the linked target's row count.

    Returns:
        np.float64: Approximate R-indicator derived from inverse weights.

    Raises:
        ValueError: If there are no sample weights, if any weight is
            non-finite or not strictly positive, if ``target_propensity``
            is ``None`` or a scalar and no target is linked, or if an
            array-like ``target_propensity`` does not match the linked
            target's row count.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame({"id": [1, 2], "weight": [2.0, 4.0]}),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            target = Sample.from_frame(
                pd.DataFrame({"id": [10, 11, 12], "weight": [1.0, 1.0, 1.0]}),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            round(sample.set_target(target).weights().r_indicator(), 3)
            # 0.293
    """
    weights = self.df.iloc[:, 0]

    # Validate the weights before inverting them.
    if weights.empty:
        raise ValueError(
            "BalanceDFWeights.r_indicator requires at least one sample weight"
        )
    if not np.isfinite(weights).all():
        raise ValueError("BalanceDFWeights.r_indicator requires finite weights")
    if (weights <= 0).any():
        raise ValueError(
            "BalanceDFWeights.r_indicator requires strictly positive weights"
        )

    inverse_weights = np.reciprocal(weights.to_numpy(dtype=float))
    largest_inverse = inverse_weights.max()
    if largest_inverse > 1.0:
        # Weights below 1 give reciprocals above 1; rescale into [0, 1].
        inverse_weights = inverse_weights / largest_inverse

    # Resolve the target through _resolved_links (same mechanism as
    # _balancedf_child_from_linked_samples) so that BalanceFrame's custom
    # links are respected.
    linked_target = self._resolved_links.get("target")

    def _target_row_count(missing_target_message: str) -> int:
        """Return the linked target's row count or raise if none is linked."""
        if linked_target is None:
            raise ValueError(missing_target_message)
        return len(linked_target.weight_series)

    if target_propensity is None:
        target_propensity = np.ones(
            _target_row_count(
                "This Sample does not have a target set. "
                "Use sample.set_target to add target"
            ),
            dtype=float,
        )
    elif np.isscalar(target_propensity):
        n_target_rows = _target_row_count(
            "BalanceDFWeights.r_indicator requires a linked target when "
            "target_propensity is scalar"
        )
        target_propensity = np.full(
            n_target_rows,
            # pyrefly: ignore [bad-argument-type]
            float(target_propensity),
            dtype=float,
        )
    elif linked_target is not None:
        # Only the length is checked here; the caller's original object is
        # what gets forwarded below. 0-d arrays count as length 1.
        as_array = np.atleast_1d(np.asarray(target_propensity))
        if len(as_array) != len(linked_target.weight_series):
            raise ValueError(
                "BalanceDFWeights.r_indicator requires target_propensity length "
                "to match linked target row count"
            )

    return weighted_comparisons_stats.r_indicator(
        inverse_weights, target_propensity
    )
# TODO: revisit whether overriding the attribute like this is the right
#       design; to be reconsidered as part of a larger code refactoring.
@property
def _weights(self: "BalanceDFWeights") -> None:
    """Always ``None``: a BalanceDFWeights carries no separate weights.

    The object's ``df`` already *is* the weights, so there is nothing
    further to expose here.

    Args:
        self (BalanceDFWeights): Object.

    Returns:
        NoneType: None.
    """
    return None
def trim(
    self: "BalanceDFWeights",
    ratio: float | int | None = None,
    percentile: float | None = None,
    keep_sum_of_weights: bool = True,
) -> None:
    """Trim the weights of the backing sample object in place.

    Forwards all arguments to ``self._sample.trim`` and always passes
    ``inplace=True``, so the backing object is modified rather than copied.
    The trimming rules themselves are defined by the backing object's
    ``trim`` method.

    Args:
        ratio: Mean-ratio upper bound. Mutually exclusive with
            *percentile*.
        percentile: Percentile(s) for winsorization. Mutually exclusive
            with *ratio*.
        keep_sum_of_weights: Whether to rescale after trimming to
            preserve the original sum of weights.

    Returns:
        None. Mutates the backing sample's weights in-place.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            sample = Sample.from_frame(
                pd.DataFrame(
                    {"id": ["1", "2"], "weight": [1.0, 100.0]}
                ),
                id_column="id",
                weight_column="weight",
                standardize_types=False,
            )
            sample.weights().trim(percentile=0.5, keep_sum_of_weights=False)
            sample.weights().df["weight"].max() <= 100.0
            # True
    """
    backing_sample = self._sample
    # inplace=True is fixed: this method's contract is to mutate, not copy.
    backing_sample.trim(
        ratio=ratio,
        percentile=percentile,
        keep_sum_of_weights=keep_sum_of_weights,
        inplace=True,
    )
def summary(
    self: "BalanceDFWeights", on_linked_samples: bool | None = None
) -> pd.DataFrame:
    """Build a table of diagnostics describing the weights.

    The diagnostics are computed on the first column of ``self.df`` and
    returned as one row per diagnostic.

    Args:
        self (BalanceDFWeights): The BalanceDFWeights object to be summarized.
        on_linked_samples (Optional[bool], optional): Ignored. It exists only
            because this method overrides BalanceDF.summary. Defaults to None.

    Returns:
        pd.DataFrame: Two columns, ``var`` (diagnostic name) and ``val``
        (its value). Rows, in order: 'design_effect',
        'effective_sample_proportion', 'effective_sample_size', 'sum',
        the ``describe_*`` statistics, the entries returned by
        ``weights_stats.prop_above_and_below``, 'nonparametric_skew', and
        'weighted_median_breakdown_point'.

    Note:
        The design-effect rows and 'sum' use the weights as given. Every
        later row uses the weights divided by their mean, i.e. normalized
        to sum to the sample size, n.

    Examples:
        .. code-block:: python

            import pandas as pd
            from balance.sample_class import Sample

            s1 = Sample.from_frame(
                pd.DataFrame(
                    {
                        "a": (1, 2, 3, 1),
                        "b": (-42, 8, 2, -42),
                        "o": (7, 8, 9, 10),
                        "c": ("x", "y", "z", "v"),
                        "id": (1, 2, 3, 4),
                        "w": (0.5, 2, 1, 1),
                    }
                ),
                id_column="id",
                weight_column="w",
                outcome_columns="o",
            )

            print(s1.weights().summary().round(2))
            #                                 var   val
            # 0                     design_effect  1.23
            # 1       effective_sample_proportion  0.81
            # 2             effective_sample_size  3.24
            # 3                               sum  4.50
            # 4                    describe_count  4.00
            # 5                     describe_mean  1.00
            # 6                      describe_std  0.56
            # 7                      describe_min  0.44
            # 8                      describe_25%  0.78
            # 9                      describe_50%  0.89
            # 10                     describe_75%  1.11
            # 11                     describe_max  1.78
            # 12                    prop(w < 0.1)  0.00
            # 13                    prop(w < 0.2)  0.00
            # 14                  prop(w < 0.333)  0.00
            # 15                    prop(w < 0.5)  0.25
            # 16                      prop(w < 1)  0.75
            # 17                     prop(w >= 1)  0.25
            # 18                     prop(w >= 2)  0.00
            # 19                     prop(w >= 3)  0.00
            # 20                     prop(w >= 5)  0.00
            # 21                    prop(w >= 10)  0.00
            # 22               nonparametric_skew  0.20
            # 23  weighted_median_breakdown_point  0.25
    """
    # Select by position rather than by the name 'weight', so the method
    # still works when the weight column has a different name.
    raw_weights = self.df.iloc[:, 0]

    # Design effect and the quantities derived from it, plus the raw sum.
    deff = weights_stats.design_effect(raw_weights)
    diagnostics: list[tuple[str, Any]] = [
        ("design_effect", deff),
        ("effective_sample_proportion", 1.0 / deff),
        ("effective_sample_size", float(len(raw_weights)) / deff),
        ("sum", raw_weights.sum()),
    ]

    # Everything below works on weights rescaled to have mean 1 (sum n).
    scaled_weights = raw_weights / raw_weights.mean()

    # Basic descriptive statistics, prefixed to mark their origin.
    # TODO: decide if we want more quantiles of the weights.
    described = scaled_weights.describe()
    diagnostics.extend(
        zip(
            ["describe_" + label for label in described.index],
            described.to_list(),
        )
    )

    # Share of weights below / above fixed thresholds.
    # TODO: decide if we want more numbers (e.g.: 2/3 and 3/2)
    proportions = _assert_type(
        weights_stats.prop_above_and_below(scaled_weights), pd.Series
    )
    diagnostics.extend(zip(proportions.index.to_list(), proportions.to_list()))

    diagnostics.append(
        ("nonparametric_skew", weights_stats.nonparametric_skew(scaled_weights))
    )
    diagnostics.append(
        (
            "weighted_median_breakdown_point",
            weights_stats.weighted_median_breakdown_point(scaled_weights),
        )
    )

    return pd.DataFrame(
        {
            "var": [name for name, _ in diagnostics],
            "val": [value for _, value in diagnostics],
        }
    )