from balance import load_data

INFO (2025-08-20 19:03:47,440) [__init__/<module> (line 54)]: Using balance version 0.10.0

# Load the example data. load_data() is unpacked as (target, sample):
# the target frame comes first, the sample frame second.
target_df, sample_df = load_data()

# Show the first rows of each frame to see the available columns.
print("target_df: \n", target_df.head())
print("sample_df: \n", sample_df.head())

target_df: 
        id gender age_group     income  happiness
0  100000   Male       45+  10.183951  61.706333
1  100001   Male       45+   6.036858  79.123670
2  100002   Male     35-44   5.226629  44.206949
3  100003    NaN       45+   5.752147  83.985716
4  100004    NaN     25-34   4.837484  49.339713
sample_df: 
   id  gender age_group     income  happiness
0  0    Male     25-34   6.428659  26.043029
1  1  Female     18-24   9.940280  66.885485
2  2    Male     18-24   2.673623  37.091922
3  3     NaN     18-24  10.550308  49.394050
4  4     NaN     18-24   2.689994  72.304208

# First rows of the target frame, rounded to 2 decimals, as a nested
# {column: {row_index: value}} dict.
target_df.head().round(2).to_dict()
# sample_df.shape

{'id': {0: '100000', 1: '100001', 2: '100002', 3: '100003', 4: '100004'},
 'gender': {0: 'Male', 1: 'Male', 2: 'Male', 3: nan, 4: nan},
 'age_group': {0: '45+', 1: '45+', 2: '35-44', 3: '45+', 4: '25-34'},
 'income': {0: 10.18, 1: 6.04, 2: 5.23, 3: 5.75, 4: 4.84},
 'happiness': {0: 61.71, 1: 79.12, 2: 44.21, 3: 83.99, 4: 49.34}}

from balance import Sample

# Wrap each frame in a balance Sample, declaring "happiness" as the outcome.
# No id or weight column is passed, so from_frame guesses the id column ("id")
# and adds a "weight" column set to 1, logging a warning for each.
sample = Sample.from_frame(sample_df, outcome_columns=["happiness"])
target = Sample.from_frame(target_df, outcome_columns=["happiness"])

WARNING (2025-08-20 19:03:47,693) [util/guess_id_column (line 113)]: Guessed id column name id for the data

WARNING (2025-08-20 19:03:47,702) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1

WARNING (2025-08-20 19:03:47,711) [util/guess_id_column (line 113)]: Guessed id column name id for the data

WARNING (2025-08-20 19:03:47,726) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1

sample.df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         1000 non-null   object 
 1   gender     912 non-null    object 
 2   age_group  1000 non-null   object 
 3   income     1000 non-null   float64
 4   happiness  1000 non-null   float64
 5   weight     1000 non-null   float64
dtypes: float64(3), object(3)
memory usage: 47.0+ KB

sample

(balance.sample_class.Sample)

        balance Sample object
        1000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness

target

(balance.sample_class.Sample)

        balance Sample object
        10000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness

# Attach the target to the sample; the returned object is reported as a
# Sample "with target set" and is what the diagnostics and adjust() use below.
sample_with_target = sample.set_target(target)

sample_with_target

(balance.sample_class.Sample)

        balance Sample object with target set
        1000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness
        
            target:
                 
	        balance Sample object
	        10000 observations x 3 variables: gender,age_group,income
	        id_column: id, weight_column: weight,
	        outcome_columns: happiness
	        
            3 common variables: gender,age_group,income

# Covariate means for the sample ("self") next to the target, one row per
# model-matrix column (categorical covariates appear as indicator columns).
print(sample_with_target.covars().mean().T)

source                     self     target
_is_na_gender[T.True]  0.088000   0.089800
age_group[T.25-34]     0.300000   0.297400
age_group[T.35-44]     0.156000   0.299200
age_group[T.45+]       0.053000   0.206300
gender[Female]         0.268000   0.455100
gender[Male]           0.644000   0.455100
gender[_NA]            0.088000   0.089800
income                 6.297302  12.737608

# ASMD between sample and target for each model-matrix column; the final
# "mean(asmd)" row is the overall summary.
print(sample_with_target.covars().asmd().T)

source                  self
age_group[T.25-34]  0.005688
age_group[T.35-44]  0.312711
age_group[T.45+]    0.378828
gender[Female]      0.375699
gender[Male]        0.379314
gender[_NA]         0.006296
income              0.494217
mean(asmd)          0.326799

# Same ASMD table, but with the indicator columns of each categorical
# covariate collapsed into a single row per original covariate.
print(sample_with_target.covars().asmd(aggregate_by_main_covar = True).T)

source          self
age_group   0.232409
gender      0.253769
income      0.494217
mean(asmd)  0.326799

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_comparisons_stats.py:355: FutureWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

sample_with_target.covars().plot()

# Fit survey weights with ipw: adjust() called without a `method` argument
# runs the ipw function (as its log output shows).
adjusted_ipw = sample_with_target.adjust()

INFO (2025-08-20 19:03:50,261) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-08-20 19:03:50,264) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-08-20 19:03:50,264) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['gender', 'age_group', 'income']

INFO (2025-08-20 19:03:50,280) [adjustment/apply_transformations (line 343)]: Final variables in output: ['gender', 'age_group', 'income']

INFO (2025-08-20 19:03:50,294) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-08-20 19:03:50,394) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-08-20 19:03:50,395) [ipw/ipw (line 458)]: The number of columns in the model matrix: 16

INFO (2025-08-20 19:03:50,396) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-08-20 19:04:07,498) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-08-20 19:04:07,499) [ipw/ipw (line 584)]: max_de: None

# Fit a second set of weights on the same sample/target pair, this time
# selecting CBPS explicitly, so the two methods can be compared below.
adjusted_cbps = sample_with_target.adjust(method = "cbps")

INFO (2025-08-20 19:04:07,514) [cbps/cbps (line 411)]: Starting cbps function

INFO (2025-08-20 19:04:07,517) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-08-20 19:04:07,517) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['gender', 'age_group', 'income']

INFO (2025-08-20 19:04:07,526) [adjustment/apply_transformations (line 343)]: Final variables in output: ['gender', 'age_group', 'income']

INFO (2025-08-20 19:04:07,634) [cbps/cbps (line 461)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-08-20 19:04:07,637) [cbps/cbps (line 473)]: The number of columns in the model matrix: 16

INFO (2025-08-20 19:04:07,637) [cbps/cbps (line 474)]: The number of rows in the model matrix: 11000

INFO (2025-08-20 19:04:07,644) [cbps/cbps (line 543)]: Finding initial estimator for GMM optimization

INFO (2025-08-20 19:04:07,813) [cbps/cbps (line 570)]: Finding initial estimator for GMM optimization that minimizes the balance loss

INFO (2025-08-20 19:04:08,061) [cbps/cbps (line 605)]: Running GMM optimization

print(adjusted_ipw)

        Adjusted balance Sample object with target set using ipw
        1000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness
        
            target:
                 
	        balance Sample object
	        10000 observations x 3 variables: gender,age_group,income
	        id_column: id, weight_column: weight,
	        outcome_columns: happiness
	        
            3 common variables: gender,age_group,income

# The printed object looks the same as for ipw, except that it names cbps as the adjustment method
print(adjusted_cbps)

        Adjusted balance Sample object with target set using cbps
        1000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness
        
            target:
                 
	        balance Sample object
	        10000 observations x 3 variables: gender,age_group,income
	        id_column: id, weight_column: weight,
	        outcome_columns: happiness
	        
            3 common variables: gender,age_group,income

print(adjusted_ipw.summary())

Covar ASMD reduction: 63.4%, design effect: 1.880
Covar ASMD (7 variables): 0.327 -> 0.119
Model performance: Model proportion deviance explained: 0.173

print(adjusted_cbps.summary())

Covar ASMD reduction: 77.4%, design effect: 2.754
Covar ASMD (7 variables): 0.327 -> 0.074

# Per-column ASMD after adjustment for each method. On an adjusted object the
# table also includes the "unadjusted" ASMD and the "unadjusted - self"
# difference, so positive values in the last column mean the ASMD decreased.
print("ipw:")
print(adjusted_ipw.covars().asmd().T)
print("\ncbps:")
print(adjusted_cbps.covars().asmd().T)

ipw:
source                  self  unadjusted  unadjusted - self
age_group[T.25-34]  0.021676    0.005688          -0.015988
age_group[T.35-44]  0.055738    0.312711           0.256973
age_group[T.45+]    0.169759    0.378828           0.209069
gender[Female]      0.097907    0.375699           0.277792
gender[Male]        0.103798    0.379314           0.275516
gender[_NA]         0.010260    0.006296          -0.003965
income              0.205436    0.494217           0.288781
mean(asmd)          0.119494    0.326799           0.207304

cbps:

source                  self  unadjusted  unadjusted - self
age_group[T.25-34]  0.056633    0.005688          -0.050945
age_group[T.35-44]  0.028616    0.312711           0.284096
age_group[T.45+]    0.107171    0.378828           0.271657
gender[Female]      0.034850    0.375699           0.340849
gender[Male]        0.057813    0.379314           0.321501
gender[_NA]         0.039998    0.006296          -0.033702
income              0.113017    0.494217           0.381200
mean(asmd)          0.073792    0.326799           0.253006

adjusted_ipw.covars().plot(library = "seaborn", dist_type = "kde")

adjusted_cbps.covars().plot(library = "seaborn", dist_type = "kde")

# Design effect of the fitted weights for each method (the same figure that
# summary() reports alongside the ASMD reduction).
print("ipw:")
print(adjusted_ipw.weights().design_effect())
print("\ncbps:")
print(adjusted_cbps.weights().design_effect())

ipw:
1.880451176987643

cbps:
2.7543106310582797

# Outcome summary under the ipw weights: weighted ("self"), target and
# unadjusted means with 95% confidence intervals, plus response rates.
print(adjusted_ipw.outcomes().summary())
# Plot the outcome for the ipw-adjusted object.
adjusted_ipw.outcomes().plot()

1 outcomes: ['happiness']
Mean outcomes (with 95% confidence intervals):
source       self  target  unadjusted           self_ci         target_ci     unadjusted_ci
happiness  53.297  56.278      48.559  (52.097, 54.496)  (55.961, 56.595)  (47.669, 49.449)

Response rates (relative to number of respondents in sample):
   happiness
n     1000.0
%      100.0
Response rates (relative to notnull rows in the target):
    happiness
n     1000.0
%       10.0
Response rates (in the target):
    happiness
n    10000.0
%      100.0

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:300: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:301: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:300: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:301: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:300: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:301: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

# Outcome summary under the cbps weights, in the same layout as the ipw one
# so the two weighted means can be compared against the target mean.
print(adjusted_cbps.outcomes().summary())
# Plot the outcome for the cbps-adjusted object.
adjusted_cbps.outcomes().plot()

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:300: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:301: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:300: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:301: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:300: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/balance/stats_and_plots/weighted_stats.py:301: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

1 outcomes: ['happiness']
Mean outcomes (with 95% confidence intervals):
source       self  target  unadjusted          self_ci         target_ci     unadjusted_ci
happiness  54.366  56.278      48.559  (53.003, 55.73)  (55.961, 56.595)  (47.669, 49.449)

Response rates (relative to number of respondents in sample):
   happiness
n     1000.0
%      100.0
Response rates (relative to notnull rows in the target):
    happiness
n     1000.0
%       10.0
Response rates (in the target):
    happiness
n    10000.0
%      100.0

adjusted_cbps.to_download()

# We can prepare the data to be exported as csv - showing the first 500 characters for simplicity:
adjusted_cbps.to_csv()[0:500]

'id,gender,age_group,income,happiness,weight\n0,Male,25-34,6.428659499046228,26.043028759747298,6.0679294218186355\n1,Female,18-24,9.940280228116047,66.88548460632677,7.268404955752517\n2,Male,18-24,2.6736231547518043,37.091921916683006,2.2276955465207298\n3,,18-24,10.550307519418066,49.39405003271002,4.657678938710531\n4,,18-24,2.689993854299385,72.30420755038209,3.26107158669341\n5,,35-44,5.995497722733131,57.28281646341816,17.090821719974272\n6,,18-24,12.63469573898972,31.663293445944596,5.4437024322'

# Session info
import session_info
session_info.show(html=False, dependencies=True)

-----
balance             0.10.0
pandas              2.3.1
session_info        v1.0.1
-----
PIL                         11.3.0
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        25.3.0
attrs                       25.3.0
babel                       2.17.0
certifi                     2025.08.03
charset_normalizer          3.4.3
comm                        0.2.3
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.9.0.post0
debugpy                     1.8.16
decorator                   5.2.1
defusedxml                  0.7.1
exceptiongroup              1.3.0
executing                   2.2.0
fastjsonschema              NA
fqdn                        NA
idna                        3.10
importlib_metadata          NA
importlib_resources         NA
ipfn                        NA
ipykernel                   6.30.1
isoduration                 NA
jedi                        0.19.2
jinja2                      3.1.6
joblib                      1.5.1
json5                       0.12.1
jsonpointer                 3.0.0
jsonschema                  4.25.1
jsonschema_specifications   NA
jupyter_events              0.12.0
jupyter_server              2.16.0
jupyterlab_server           2.27.3
kiwisolver                  1.4.7
lark                        1.2.2
markupsafe                  3.0.2
matplotlib                  3.9.4
matplotlib_inline           0.1.7
mpl_toolkits                NA
narwhals                    2.1.2
nbformat                    5.10.4
numpy                       1.26.4
overrides                   NA
packaging                   25.0
parso                       0.8.4
patsy                       1.0.1
pexpect                     4.9.0
platformdirs                4.3.8
plotly                      6.3.0
prometheus_client           NA
prompt_toolkit              3.0.51
psutil                      7.0.0
ptyprocess                  0.7.0
pure_eval                   0.2.3
pydev_ipython               NA
pydevconsole                NA
pydevd                      3.2.3
pydevd_file_utils           NA
pydevd_plugins              NA
pydevd_tracing              NA
pygments                    2.19.2
pyparsing                   3.2.3
pythonjsonlogger            NA
pytz                        2025.2
referencing                 NA
requests                    2.32.5
rfc3339_validator           0.1.4
rfc3986_validator           0.1.1
rfc3987_syntax              NA
rpds                        NA
scipy                       1.13.1
seaborn                     0.13.2
send2trash                  NA
six                         1.17.0
sklearn                     1.3.2
sniffio                     1.3.1
sphinxcontrib               NA
stack_data                  0.6.3
statsmodels                 0.14.5
threadpoolctl               3.6.0
tornado                     6.5.2
traitlets                   5.14.3
typing_extensions           NA
uri_template                NA
urllib3                     2.5.0
wcwidth                     0.2.13
webcolors                   NA
websocket                   1.8.0
yaml                        6.0.2
zipp                        NA
zmq                         27.0.1
zoneinfo                    NA
-----
IPython             8.18.1
jupyter_client      8.6.3
jupyter_core        5.8.1
jupyterlab          4.4.6
notebook            7.4.5
-----
Python 3.9.23 (main, Jun  4 2025, 04:11:23) [GCC 13.3.0]
Linux-6.11.0-1018-azure-x86_64-with-glibc2.39
-----
Session information updated at 2025-08-20 19:04

/opt/hostedtoolcache/Python/3.9.23/x64/lib/python3.9/site-packages/session_info/main.py:213: UserWarning:

The '__version__' attribute is deprecated and will be removed in MarkupSafe 3.1. Use feature detection, or `importlib.metadata.version("markupsafe")`, instead.

balance Quickstart (CBPS): Analyzing and adjusting the bias on a simulated toy dataset

Analysis

Example dataset

Load data into a Sample object

Pre-Adjustment Diagnostics

Adjusting Sample to Population (ipw and cbps)

Evaluation of the Results (CBPS vs IPW)

Understanding the weights

Outcome analysis

Downloading data