balance Quickstart (raking): Analyzing and adjusting the bias on a simulated toy dataset
The raking method is an advanced technique that extends post-stratification. It is well suited for situations where we have the marginal distributions of multiple covariates but do not know their joint distribution. Raking works by applying post-stratification to the data based on the first covariate, using the resulting output weights as the input for adjustment based on the second covariate, and so forth. Once all covariates have been used for adjustment, the process is repeated until a specified level of convergence is attained.
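To make the procedure concrete, below is a minimal, purely illustrative sketch of a single raking cycle over categorical covariates. It is not balance's implementation (as the logs further down show, balance relies on the ipfn package internally); it only demonstrates the idea of repeatedly post-stratifying on one margin at a time, and the function name and `targets` structure are made up for the example.

import pandas as pd

def rake_once(df: pd.DataFrame, weights: pd.Series, targets: dict) -> pd.Series:
    # One raking cycle: post-stratify on each covariate in turn.
    # `targets` maps each covariate name to {level: target proportion}.
    # Purely illustrative -- balance's rake method uses the ipfn package.
    w = weights.copy()
    for covar, target_props in targets.items():
        # Current weighted share of each level of this covariate
        current = w.groupby(df[covar]).sum() / w.sum()
        # Scale each unit's weight by (target share / current share) of its level
        ratio = {lvl: prop / current[lvl] for lvl, prop in target_props.items()}
        w = w * df[covar].map(ratio)
    return w

# Repeating rake_once until the weights stop changing completes the raking procedure.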
One of the main advantages of raking is its ability to work with user-level data while also utilizing marginal distributions that lack user-level granularity. Another benefit is its capacity to fit these distributions closely, depending on the convergence achieved. This is in contrast to techniques such as inverse probability weighting (IPW) and covariate balancing propensity score (CBPS), which may only approximate the target and can fail to fit it even at the marginal level.
This notebook demonstrates how to use the raking method and showcases the high degree of fit it can provide.
Load the data
from balance import load_data
INFO (2024-12-06 18:43:13,107) [__init__/<module> (line 54)]: Using balance version 0.9.1
target_df, sample_df = load_data()
print("target_df: \n", target_df.head())
print("sample_df: \n", sample_df.head())
target_df:
        id gender age_group     income  happiness
0  100000   Male       45+  10.183951  61.706333
1  100001   Male       45+   6.036858  79.123670
2  100002   Male     35-44   5.226629  44.206949
3  100003    NaN       45+   5.752147  83.985716
4  100004    NaN     25-34   4.837484  49.339713
sample_df:
   id  gender age_group     income  happiness
0   0    Male     25-34   6.428659  26.043029
1   1  Female     18-24   9.940280  66.885485
2   2    Male     18-24   2.673623  37.091922
3   3     NaN     18-24  10.550308  49.394050
4   4     NaN     18-24   2.689994  72.304208
from balance import Sample
Raking can also work with numerical variables, since they are automatically bucketed. But for the simplicity of the discussion, we'll focus only on age and gender.
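As a purely illustrative aside (this is not balance's internal code), bucketing a numeric covariate such as income could look roughly like this:

import pandas as pd

# Hypothetical illustration: turn the numeric income column into (up to) 10 quantile
# buckets, which could then be raked on like any other categorical covariate.
income_buckets = pd.qcut(sample_df["income"], q=10, duplicates="drop")
print(income_buckets.value_counts().sort_index())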
sample = Sample.from_frame(sample_df[['id', 'gender', 'age_group', 'happiness']], outcome_columns=["happiness"])
target = Sample.from_frame(target_df[['id', 'gender', 'age_group', 'happiness']], outcome_columns=["happiness"])
sample_with_target = sample.set_target(target)
WARNING (2024-12-06 18:43:13,287) [util/guess_id_column (line 114)]: Guessed id column name id for the data
WARNING (2024-12-06 18:43:13,294) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1
WARNING (2024-12-06 18:43:13,303) [util/guess_id_column (line 114)]: Guessed id column name id for the data
WARNING (2024-12-06 18:43:13,315) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1
Fit models using ipw and rake
Fit an ipw model:
adjusted_ipw = sample_with_target.adjust(method = "ipw")
INFO (2024-12-06 18:43:13,327) [ipw/ipw (line 421)]: Starting ipw function
INFO (2024-12-06 18:43:13,329) [adjustment/apply_transformations (line 306)]: Adding the variables: []
INFO (2024-12-06 18:43:13,330) [adjustment/apply_transformations (line 307)]: Transforming the variables: ['gender', 'age_group']
INFO (2024-12-06 18:43:13,337) [adjustment/apply_transformations (line 347)]: Final variables in output: ['gender', 'age_group']
INFO (2024-12-06 18:43:13,341) [ipw/ipw (line 455)]: Building model matrix
INFO (2024-12-06 18:43:13,394) [ipw/ipw (line 479)]: The formula used to build the model matrix: ['gender + age_group + _is_na_gender']
INFO (2024-12-06 18:43:13,395) [ipw/ipw (line 482)]: The number of columns in the model matrix: 7
INFO (2024-12-06 18:43:13,395) [ipw/ipw (line 483)]: The number of rows in the model matrix: 11000
INFO (2024-12-06 18:43:13,402) [ipw/ipw (line 514)]: Fitting logistic model
INFO (2024-12-06 18:43:14,465) [ipw/ipw (line 555)]: max_de: None
INFO (2024-12-06 18:43:14,468) [ipw/ipw (line 585)]: Chosen lambda for cv: [0.01729704]
INFO (2024-12-06 18:43:14,470) [ipw/ipw (line 593)]: Proportion null deviance explained [0.11516887]
Fit a raking model (using the user-level data as input):
adjusted_rake = sample_with_target.adjust(method = "rake")
INFO (2024-12-06 18:43:14,484) [adjustment/apply_transformations (line 306)]: Adding the variables: []
INFO (2024-12-06 18:43:14,485) [adjustment/apply_transformations (line 307)]: Transforming the variables: ['gender', 'age_group']
INFO (2024-12-06 18:43:14,496) [adjustment/apply_transformations (line 347)]: Final variables in output: ['gender', 'age_group']
INFO (2024-12-06 18:43:14,509) [rake/rake (line 158)]: Final covariates and levels that will be used in raking: {'gender': ['Male', 'Female', '__NaN__'], 'age_group': ['18-24', '25-34', '45+', '35-44']}.
ipfn converged: convergence_rate below threshold
When comparing the results of ipw and rake, we can see that rake has a larger design effect, but it provides a perfect fit. In contrast, ipw gives only a partial fit.

We can see this in the ASMD values as well as in the bar plots.
print(adjusted_ipw.summary())
Covar ASMD reduction: 71.7%, design effect: 1.532
Covar ASMD (6 variables): 0.243 -> 0.069
Model performance: Model proportion deviance explained: 0.115
print(adjusted_rake.summary())
Covar ASMD reduction: 100.0%, design effect: 2.103
Covar ASMD (6 variables): 0.243 -> 0.000
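As a side note, the design effect reported in these summaries quantifies the variance inflation caused by unequal weights. A common definition is Kish's design effect, which can be computed from any weight vector along the following lines (a standalone sketch, not a call into balance's API):

import numpy as np

def kish_design_effect(w) -> float:
    # Kish's design effect: n * sum(w^2) / (sum(w))^2
    w = np.asarray(w, dtype=float)
    return len(w) * np.sum(w ** 2) / np.sum(w) ** 2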
adjusted_ipw.covars().plot()
adjusted_rake.covars().plot()
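If we prefer the ASMD values as a table rather than a plot, they can also be pulled from the covars() accessor; this assumes an asmd() method is available on it (as in recent balance versions):

# The asmd() accessor is assumed to be available in this balance version
print(adjusted_ipw.covars().asmd().T)
print(adjusted_rake.covars().asmd().T)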
Using marginal distributions with rake
The benefit of rake is that we can define a target population from marginal distributions alone, and fit towards it.
The function to use for this purpose is prepare_marginal_dist_for_raking.

In order to demonstrate this point, let us assume we have another target population in mind, with different proportions. Since these proportions are known, we can create a target DataFrame that realizes them from a dict of marginal distributions, using the prepare_marginal_dist_for_raking function.
from balance.weighting_methods.rake import prepare_marginal_dist_for_raking
# import pandas as pd
import numpy as np
a_dict_with_marginal_distributions = {"gender": {"Female": 0.1, "Male": 0.85, np.nan: 0.05}, "age_group": {"18-24": 0.25, "25-34": 0.25, "35-44": 0.25, "45+": 0.25}}
target_df_from_marginals = prepare_marginal_dist_for_raking(a_dict_with_marginal_distributions)
target_df_from_marginals
|    | gender | age_group | id |
|---:|:-------|:----------|---:|
| 0  | Female | 18-24     | 0  |
| 1  | Female | 25-34     | 1  |
| 2  | Male   | 35-44     | 2  |
| 3  | Male   | 45+       | 3  |
| 4  | Male   | 18-24     | 4  |
| 5  | Male   | 25-34     | 5  |
| 6  | Male   | 35-44     | 6  |
| 7  | Male   | 45+       | 7  |
| 8  | Male   | 18-24     | 8  |
| 9  | Male   | 25-34     | 9  |
| 10 | Male   | 35-44     | 10 |
| 11 | Male   | 45+       | 11 |
| 12 | Male   | 18-24     | 12 |
| 13 | Male   | 25-34     | 13 |
| 14 | Male   | 35-44     | 14 |
| 15 | Male   | 45+       | 15 |
| 16 | Male   | 18-24     | 16 |
| 17 | Male   | 25-34     | 17 |
| 18 | Male   | 35-44     | 18 |
| 19 | NaN    | 45+       | 19 |
target_df_from_marginals.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   gender     19 non-null     object
 1   age_group  20 non-null     object
 2   id         20 non-null     int64
dtypes: int64(1), object(2)
memory usage: 608.0+ bytes
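Before using it as a target, we can quickly verify that the realized DataFrame reproduces the requested marginal proportions:

# Compare the realized marginals to a_dict_with_marginal_distributions
print(target_df_from_marginals["gender"].value_counts(normalize=True, dropna=False))
print(target_df_from_marginals["age_group"].value_counts(normalize=True))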
With the new target_df_from_marginals object ready, we can use it as a target. Notice that this makes sense ONLY for the raking method. It should NOT be used with any other method.
target_from_marginals = Sample.from_frame(target_df_from_marginals)
sample_with_target_2 = sample.set_target(target_from_marginals)
WARNING (2024-12-06 18:43:15,433) [util/guess_id_column (line 114)]: Guessed id column name id for the data
WARNING (2024-12-06 18:43:15,434) [sample_class/from_frame (line 190)]: Casting id column to string
WARNING (2024-12-06 18:43:15,440) [util/_warn_of_df_dtypes_change (line 1839)]: The dtypes of sample._df were changed from the original dtypes of the input df, here are the differences -
WARNING (2024-12-06 18:43:15,441) [util/_warn_of_df_dtypes_change (line 1848)]: The (old) dtypes that changed for df (before the change):
WARNING (2024-12-06 18:43:15,442) [util/_warn_of_df_dtypes_change (line 1851)]: id int64 dtype: object
WARNING (2024-12-06 18:43:15,442) [util/_warn_of_df_dtypes_change (line 1852)]: The (new) dtypes saved in df (after the change):
WARNING (2024-12-06 18:43:15,443) [util/_warn_of_df_dtypes_change (line 1853)]: id object dtype: object
WARNING (2024-12-06 18:43:15,444) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1
And fit a raking model:
adjusted_rake_2 = sample_with_target_2.adjust(method = "rake")
INFO (2024-12-06 18:43:15,455) [adjustment/apply_transformations (line 306)]: Adding the variables: []
INFO (2024-12-06 18:43:15,456) [adjustment/apply_transformations (line 307)]: Transforming the variables: ['gender', 'age_group']
INFO (2024-12-06 18:43:15,459) [adjustment/apply_transformations (line 347)]: Final variables in output: ['gender', 'age_group']
INFO (2024-12-06 18:43:15,463) [rake/rake (line 158)]: Final covariates and levels that will be used in raking: {'gender': ['Male', 'Female', '__NaN__'], 'age_group': ['18-24', '25-34', '45+', '35-44']}.
ipfn converged: convergence_rate below threshold
As the following output shows, the weighted sample now has a perfect fit to the marginal distributions defined for age and gender.
print(adjusted_rake_2.summary())
Covar ASMD reduction: 100.0%, design effect: 2.176
Covar ASMD (6 variables): 0.341 -> 0.000
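We can also check this directly by computing the weighted gender shares from the adjusted sample. This is a sketch that assumes the adjusted Sample exposes its data, including the weight column, through the .df property (as in recent balance versions):

# Weighted gender shares after raking; these should match the 0.85 / 0.1 / 0.05 target margins
df = adjusted_rake_2.df
print(df.groupby("gender", dropna=False)["weight"].sum() / df["weight"].sum())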
adjusted_rake_2.covars().plot()