import warnings
# Suppress every Python warning for the rest of the session (presumably to keep
# the tutorial output short). Note this hides all warning categories, not only
# those raised by balance.
warnings.filterwarnings("ignore")

from balance import load_data

INFO (2025-08-20 19:03:18,518) [__init__/<module> (line 54)]: Using balance version 0.10.0

# load_data() is unpacked into two tables: the target (population) data first,
# then the sample data. Both carry the same columns in the previews printed
# below: id, gender, age_group, income, happiness.
target_df, sample_df = load_data()

# Preview the first rows of each table.
print("target_df: \n", target_df.head())
print("sample_df: \n", sample_df.head())

target_df: 
        id gender age_group     income  happiness
0  100000   Male       45+  10.183951  61.706333
1  100001   Male       45+   6.036858  79.123670
2  100002   Male     35-44   5.226629  44.206949
3  100003    NaN       45+   5.752147  83.985716
4  100004    NaN     25-34   4.837484  49.339713
sample_df: 
   id  gender age_group     income  happiness
0  0    Male     25-34   6.428659  26.043029
1  1  Female     18-24   9.940280  66.885485
2  2    Male     18-24   2.673623  37.091922
3  3     NaN     18-24  10.550308  49.394050
4  4     NaN     18-24   2.689994  72.304208

# Show the first target rows as a plain dict, with numeric columns rounded to
# 2 decimals; the displayed result shows that `id` values are strings.
target_df.head().round(2).to_dict()
# Alternative check kept from the original (disabled):
# sample_df.shape

{'id': {0: '100000', 1: '100001', 2: '100002', 3: '100003', 4: '100004'},
 'gender': {0: 'Male', 1: 'Male', 2: 'Male', 3: nan, 4: nan},
 'age_group': {0: '45+', 1: '45+', 2: '35-44', 3: '45+', 4: '25-34'},
 'income': {0: 10.18, 1: 6.04, 2: 5.23, 3: 5.75, 4: 4.84},
 'happiness': {0: 61.71, 1: 79.12, 2: 44.21, 3: 83.99, 4: 49.34}}

from balance import Sample

# Wrap each DataFrame in a balance Sample, marking `happiness` as the outcome.
# No id or weight column is passed, so (per the warnings logged for this call)
# balance guesses `id` as the id column and adds a `weight` column set to 1.
sample = Sample.from_frame(sample_df, outcome_columns=["happiness"])
# Often we don't have the outcome for the target. Here it is included only so
# that we can validate later that the weights indeed help reduce the bias.
target = Sample.from_frame(target_df, outcome_columns=["happiness"])

WARNING (2025-08-20 19:03:18,819) [util/guess_id_column (line 113)]: Guessed id column name id for the data

WARNING (2025-08-20 19:03:18,828) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1

WARNING (2025-08-20 19:03:18,836) [util/guess_id_column (line 113)]: Guessed id column name id for the data

WARNING (2025-08-20 19:03:18,850) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1

# Inspect the DataFrame held by the Sample: the printed schema includes the
# `weight` column that from_frame added, and shows `gender` has missing values.
sample.df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         1000 non-null   object 
 1   gender     912 non-null    object 
 2   age_group  1000 non-null   object 
 3   income     1000 non-null   float64
 4   happiness  1000 non-null   float64
 5   weight     1000 non-null   float64
dtypes: float64(3), object(3)
memory usage: 47.0+ KB

# Bare expression: in a notebook this displays the Sample's repr
# (observation count, covariates, id/weight/outcome columns).
sample

(balance.sample_class.Sample)

        balance Sample object
        1000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness

# Bare expression: displays the target Sample's repr for comparison with the
# sample above it (same covariates, more observations).
target

(balance.sample_class.Sample)

        balance Sample object
        10000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness

# Link the sample to its target population. The returned object's repr reports
# "with target set" and lists the covariates common to both.
sample_with_target = sample.set_target(target)

# Bare expression: displays the repr of the sample together with its target.
sample_with_target

(balance.sample_class.Sample)

        balance Sample object with target set
        1000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness
        
            target:
                 
	        balance Sample object
	        10000 observations x 3 variables: gender,age_group,income
	        id_column: id, weight_column: weight,
	        outcome_columns: happiness
	        
            3 common variables: gender,age_group,income

# Pre-adjustment diagnostics: mean of each (dummy-encoded) covariate for the
# sample (`self`) next to the target; transposed so each covariate is a row.
print(sample_with_target.covars().mean().T)

source                     self     target
_is_na_gender[T.True]  0.088000   0.089800
age_group[T.25-34]     0.300000   0.297400
age_group[T.35-44]     0.156000   0.299200
age_group[T.45+]       0.053000   0.206300
gender[Female]         0.268000   0.455100
gender[Male]           0.644000   0.455100
gender[_NA]            0.088000   0.089800
income                 6.297302  12.737608

# ASMD (absolute standardized mean difference, per the balance docs) between
# sample and target for each encoded covariate, plus the overall mean(asmd).
print(sample_with_target.covars().asmd().T)

source                  self
age_group[T.25-34]  0.005688
age_group[T.35-44]  0.312711
age_group[T.45+]    0.378828
gender[Female]      0.375699
gender[Male]        0.379314
gender[_NA]         0.006296
income              0.494217
mean(asmd)          0.326799

# Same ASMD table, but collapsed to one row per original covariate
# (age_group, gender, income) instead of one row per dummy level.
print(sample_with_target.covars().asmd(aggregate_by_main_covar = True).T)

source          self
age_group   0.232409
gender      0.253769
income      0.494217
mean(asmd)  0.326799

# Plot the covariates of the sample and its target before any adjustment.
sample_with_target.covars().plot()

# Fit survey weights. adjust() is called with no arguments; the log emitted by
# this call shows the ipw routine being run on gender, age_group and income.
adjusted = sample_with_target.adjust()

INFO (2025-08-20 19:03:21,393) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-08-20 19:03:21,396) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-08-20 19:03:21,397) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['gender', 'age_group', 'income']

INFO (2025-08-20 19:03:21,413) [adjustment/apply_transformations (line 343)]: Final variables in output: ['gender', 'age_group', 'income']

INFO (2025-08-20 19:03:21,423) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-08-20 19:03:21,521) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-08-20 19:03:21,522) [ipw/ipw (line 458)]: The number of columns in the model matrix: 16

INFO (2025-08-20 19:03:21,522) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-08-20 19:03:38,880) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-08-20 19:03:38,880) [ipw/ipw (line 584)]: max_de: None

# Print the adjusted object's description; it reports the adjustment method
# used ("using ipw") along with the sample and target details.
print(adjusted)

        Adjusted balance Sample object with target set using ipw
        1000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness
        
            target:
                 
	        balance Sample object
	        10000 observations x 3 variables: gender,age_group,income
	        id_column: id, weight_column: weight,
	        outcome_columns: happiness
	        
            3 common variables: gender,age_group,income

# Short text summary of the adjustment: covariate ASMD before -> after,
# the design effect, and the model's proportion of deviance explained.
print(adjusted.summary())

Covar ASMD reduction: 63.4%, design effect: 1.880
Covar ASMD (7 variables): 0.327 -> 0.119
Model performance: Model proportion deviance explained: 0.173

# Covariate means after adjustment: `self` is the weighted sample, shown next
# to the target and the original (`unadjusted`) sample.
print(adjusted.covars().mean().T)

source                      self     target  unadjusted
_is_na_gender[T.True]   0.086866   0.089800    0.088000
age_group[T.25-34]      0.307309   0.297400    0.300000
age_group[T.35-44]      0.273676   0.299200    0.156000
age_group[T.45+]        0.137604   0.206300    0.053000
gender[Female]          0.406342   0.455100    0.268000
gender[Male]            0.506792   0.455100    0.644000
gender[_NA]             0.086866   0.089800    0.088000
income                 10.060502  12.737608    6.297302

# ASMD per covariate after (`self`) and before (`unadjusted`) adjustment, plus
# their difference; a positive `unadjusted - self` means the ASMD went down.
print(adjusted.covars().asmd().T)

source                  self  unadjusted  unadjusted - self
age_group[T.25-34]  0.021676    0.005688          -0.015988
age_group[T.35-44]  0.055738    0.312711           0.256973
age_group[T.45+]    0.169759    0.378828           0.209069
gender[Female]      0.097907    0.375699           0.277792
gender[Male]        0.103798    0.379314           0.275516
gender[_NA]         0.010260    0.006296          -0.003965
income              0.205436    0.494217           0.288781
mean(asmd)          0.119494    0.326799           0.207304

# Plot the covariates again, now for the adjusted object.
adjusted.covars().plot()  # you could change sizes using something like .plot(width = 1500, height = 700)

# Same covariate plot, drawn with seaborn as kernel density estimates (KDE)
# rather than with the plot() defaults.
adjusted.covars().plot(library = "seaborn", dist_type = "kde")

# Plot the fitted weights.
adjusted.weights().plot()

# Diagnostics for the fitted weights, rounded to 2 decimals: design effect,
# effective sample size, descriptive statistics and weight-range proportions.
# Alternative kept from the original (disabled), presumably the design effect only:
# adjusted.weights().design_effect()
print(adjusted.weights().summary().round(2))

                                var       val
0                     design_effect      1.88
1       effective_sample_proportion      0.53
2             effective_sample_size    531.79
3                               sum  10000.00
4                    describe_count   1000.00
5                     describe_mean      1.00
6                      describe_std      0.94
7                      describe_min      0.30
8                      describe_25%      0.45
9                      describe_50%      0.65
10                     describe_75%      1.17
11                     describe_max     11.36
12                    prop(w < 0.1)      0.00
13                    prop(w < 0.2)      0.00
14                  prop(w < 0.333)      0.11
15                    prop(w < 0.5)      0.32
16                      prop(w < 1)      0.67
17                     prop(w >= 1)      0.33
18                     prop(w >= 2)      0.10
19                     prop(w >= 3)      0.03
20                     prop(w >= 5)      0.01
21                    prop(w >= 10)      0.00
22               nonparametric_skew      0.37
23  weighted_median_breakdown_point      0.21

# In the output recorded for this run, neither 95% CI contains the target mean
# of happiness (56.278): unadjusted is (47.669, 49.449), adjusted is (52.097, 54.496).
# The adjustment still moves the estimate toward the target: the gap shrinks
# from about 7.7 points (48.559 unadjusted) to about 3.0 points (53.297 adjusted).
print(adjusted.outcomes().summary())

1 outcomes: ['happiness']
Mean outcomes (with 95% confidence intervals):
source       self  target  unadjusted           self_ci         target_ci     unadjusted_ci
happiness  53.297  56.278      48.559  (52.097, 54.496)  (55.961, 56.595)  (47.669, 49.449)

Response rates (relative to number of respondents in sample):
   happiness
n     1000.0
%      100.0
Response rates (relative to notnull rows in the target):
    happiness
n     1000.0
%       10.0
Response rates (in the target):
    happiness
n    10000.0
%      100.0

# Plot the outcome (`happiness`) for the adjusted object.
adjusted.outcomes().plot()

# NOTE(review): no output is recorded for this call here. Given its name and
# the "Downloading data" section it belongs to, it presumably offers the
# adjusted data for download - confirm against the balance documentation.
adjusted.to_download()

# We can prepare the data to be exported as CSV; only the first 500 characters
# are shown here for simplicity. The output includes the fitted `weight` column.
adjusted.to_csv()[0:500]

'id,gender,age_group,income,happiness,weight\n0,Male,25-34,6.428659499046228,26.043028759747298,6.52832077256206\n1,Female,18-24,9.940280228116047,66.88548460632677,9.615962486362896\n2,Male,18-24,2.6736231547518043,37.091921916683006,3.5613441674585165\n3,,18-24,10.550307519418066,49.39405003271002,6.958765976140972\n4,,18-24,2.689993854299385,72.30420755038209,5.1335477016020254\n5,,35-44,5.995497722733131,57.28281646341816,16.44496201550232\n6,,18-24,12.63469573898972,31.663293445944596,8.19816512783'

# Session info: print the versions of the loaded packages (and, with
# dependencies=True, of their dependencies) as plain text rather than HTML.
import session_info
session_info.show(html=False, dependencies=True)

-----
balance             0.10.0
pandas              2.3.1
session_info        v1.0.1
-----
PIL                         11.3.0
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        25.3.0
attrs                       25.3.0
babel                       2.17.0
certifi                     2025.08.03
charset_normalizer          3.4.3
comm                        0.2.3
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.9.0.post0
debugpy                     1.8.16
decorator                   5.2.1
defusedxml                  0.7.1
exceptiongroup              1.3.0
executing                   2.2.0
fastjsonschema              NA
fqdn                        NA
idna                        3.10
importlib_metadata          NA
importlib_resources         NA
ipfn                        NA
ipykernel                   6.30.1
isoduration                 NA
jedi                        0.19.2
jinja2                      3.1.6
joblib                      1.5.1
json5                       0.12.1
jsonpointer                 3.0.0
jsonschema                  4.25.1
jsonschema_specifications   NA
jupyter_events              0.12.0
jupyter_server              2.16.0
jupyterlab_server           2.27.3
kiwisolver                  1.4.7
lark                        1.2.2
markupsafe                  3.0.2
matplotlib                  3.9.4
matplotlib_inline           0.1.7
mpl_toolkits                NA
narwhals                    2.1.2
nbformat                    5.10.4
numpy                       1.26.4
overrides                   NA
packaging                   25.0
parso                       0.8.4
patsy                       1.0.1
pexpect                     4.9.0
platformdirs                4.3.8
plotly                      6.3.0
prometheus_client           NA
prompt_toolkit              3.0.51
psutil                      7.0.0
ptyprocess                  0.7.0
pure_eval                   0.2.3
pydev_ipython               NA
pydevconsole                NA
pydevd                      3.2.3
pydevd_file_utils           NA
pydevd_plugins              NA
pydevd_tracing              NA
pygments                    2.19.2
pyparsing                   3.2.3
pythonjsonlogger            NA
pytz                        2025.2
referencing                 NA
requests                    2.32.5
rfc3339_validator           0.1.4
rfc3986_validator           0.1.1
rfc3987_syntax              NA
rpds                        NA
scipy                       1.13.1
seaborn                     0.13.2
send2trash                  NA
six                         1.17.0
sklearn                     1.3.2
sniffio                     1.3.1
sphinxcontrib               NA
stack_data                  0.6.3
statsmodels                 0.14.5
threadpoolctl               3.6.0
tornado                     6.5.2
traitlets                   5.14.3
typing_extensions           NA
uri_template                NA
urllib3                     2.5.0
wcwidth                     0.2.13
webcolors                   NA
websocket                   1.8.0
yaml                        6.0.2
zipp                        NA
zmq                         27.0.1
zoneinfo                    NA
-----
IPython             8.18.1
jupyter_client      8.6.3
jupyter_core        5.8.1
jupyterlab          4.4.6
notebook            7.4.5
-----
Python 3.9.23 (main, Jun  4 2025, 04:11:23) [GCC 13.3.0]
Linux-6.11.0-1018-azure-x86_64-with-glibc2.39
-----
Session information updated at 2025-08-20 19:03

balance Quickstart: Analyzing and adjusting the bias on a simulated toy dataset

Analysis

Example dataset

Load data into a Sample object

Pre-Adjustment Diagnostics

Adjusting Sample to Population

Evaluation of the Results

Understanding the weights

Outcome analysis

Downloading data