from balance import Sample, load_data

# Fetch the example target and sample data frames via balance's load_data().
target_df, sample_df = load_data()

# Wrap each frame in a Sample object, declaring "happiness" as the outcome column.
sample = Sample.from_frame(sample_df, outcome_columns=["happiness"])
target = Sample.from_frame(target_df, outcome_columns=["happiness"])

# Pair the sample with its target; the bare expression on the last line
# displays the combined object.
sample_with_target = sample.set_target(target)
sample_with_target

INFO (2025-05-23 17:45:10,813) [__init__/<module> (line 54)]: Using balance version 0.10.0

WARNING (2025-05-23 17:45:11,000) [util/guess_id_column (line 113)]: Guessed id column name id for the data

WARNING (2025-05-23 17:45:11,009) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1

WARNING (2025-05-23 17:45:11,017) [util/guess_id_column (line 113)]: Guessed id column name id for the data

WARNING (2025-05-23 17:45:11,030) [sample_class/from_frame (line 261)]: No weights passed. Adding a 'weight' column and setting all values to 1

(balance.sample_class.Sample)

        balance Sample object with target set
        1000 observations x 3 variables: gender,age_group,income
        id_column: id, weight_column: weight,
        outcome_columns: happiness
        
            target:
                 
	        balance Sample object
	        10000 observations x 3 variables: gender,age_group,income
	        id_column: id, weight_column: weight,
	        outcome_columns: happiness
	        
            3 common variables: gender,age_group,income

# Adjust the sample towards the target without passing any argument; the
# commented-out lines list the arguments that later cells experiment with.
adjusted = sample_with_target.adjust(
    # method="ipw", # default method
    # transformations=None,
    # formula=None,
    # penalty_factor=None, # all 1s
    # max_de=None,
)
# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:45:11,048) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:45:11,051) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-05-23 17:45:11,052) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['gender', 'age_group', 'income']

INFO (2025-05-23 17:45:11,062) [adjustment/apply_transformations (line 343)]: Final variables in output: ['gender', 'age_group', 'income']

INFO (2025-05-23 17:45:11,068) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:45:11,163) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-05-23 17:45:11,164) [ipw/ipw (line 458)]: The number of columns in the model matrix: 16

INFO (2025-05-23 17:45:11,165) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:45:27,752) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:45:27,753) [ipw/ipw (line 584)]: max_de: None

# Same call with transformations explicitly disabled. In the recorded output
# no apply_transformations step is logged and income appears as a single
# coefficient rather than as bucketed intervals.
adjusted = sample_with_target.adjust(
    # method="ipw",
    transformations=None,
    # formula=formula,
    # penalty_factor=penalty_factor,
    # max_de=None,
)
# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:45:28,045) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:45:28,046) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:45:28,112) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-05-23 17:45:28,113) [ipw/ipw (line 458)]: The number of columns in the model matrix: 8

INFO (2025-05-23 17:45:28,113) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:45:43,481) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:45:43,482) [ipw/ipw (line 584)]: max_de: None

INFO (2025-05-23 17:45:43,482) [ipw/ipw (line 605)]: Starting model selection

INFO (2025-05-23 17:45:43,485) [ipw/ipw (line 638)]: Chosen lambda: 0.0368353720078807

INFO (2025-05-23 17:45:43,486) [ipw/ipw (line 654)]: Proportion null deviance explained 0.17353587606008936

from balance.util import fct_lump, quantize

# One callable per covariate, keyed by column name.
# NOTE(review): each callable appears to receive that covariate's column
# (the income lambda calls .fillna/.mean directly on x) - confirm.
transformations = {
    # Recorded coefficients show an `_lumped_other` age level after fct_lump.
    "age_group": lambda x: fct_lump(x, 0.25),
    # Identity: gender is passed through unchanged.
    "gender": lambda x: x,
    # Fill missing income with the column mean, then quantize with q=3
    # (the recorded coefficients show three income intervals).
    "income": lambda x: quantize(x.fillna(x.mean()), q=3),
}

adjusted = sample_with_target.adjust(
    # method="ipw",
    transformations=transformations,
    # formula=formula,
    # penalty_factor=penalty_factor,
    # max_de=None,
)
# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:45:43,769) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:45:43,772) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-05-23 17:45:43,773) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:45:43,779) [adjustment/apply_transformations (line 343)]: Final variables in output: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:45:43,785) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:45:43,880) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-05-23 17:45:43,881) [ipw/ipw (line 458)]: The number of columns in the model matrix: 8

INFO (2025-05-23 17:45:43,881) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:45:57,649) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:45:57,650) [ipw/ipw (line 584)]: max_de: None

# Only gender is given a transformation here. Per the recorded log, covariates
# without an entry (income, age_group) are dropped, so the model is fitted on
# gender alone.
transformations = {
    # "age_group": lambda x: fct_lump(x, 0.25),
    "gender": lambda x: x,
    # "income": lambda x: quantize(x.fillna(x.mean()), q=3),
}

adjusted = sample_with_target.adjust(
    # method="ipw",
    transformations=transformations,
    # formula=formula,
    # penalty_factor=penalty_factor,
    # max_de=None,
)
# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:45:57,939) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:45:57,942) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-05-23 17:45:57,943) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['gender']

WARNING (2025-05-23 17:45:57,944) [adjustment/apply_transformations (line 340)]: Dropping the variables: ['income', 'age_group']

INFO (2025-05-23 17:45:57,944) [adjustment/apply_transformations (line 343)]: Final variables in output: ['gender']

INFO (2025-05-23 17:45:57,947) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:45:57,983) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['gender + _is_na_gender']

INFO (2025-05-23 17:45:57,984) [ipw/ipw (line 458)]: The number of columns in the model matrix: 4

INFO (2025-05-23 17:45:57,984) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:46:06,699) [ipw/ipw (line 578)]: Done with sklearn

# TODO: add more examples about how add_na works
# TODO: add more examples about rare values in categorical variables and how they are grouped together.

from balance.util import fct_lump, quantize

# Keys that do not match an existing covariate define new variables: the
# recorded log lists income_squared and income_buckets under "Adding the
# variables".
# NOTE(review): the callables for new keys access columns by attribute
# (x.income), so x is presumably the whole covariate DataFrame - confirm.
transformations = {
    # Identity entries keep the original covariates in the model.
    "age_group": lambda x: x,
    "gender": lambda x: x,
    "income": lambda x: x,
    # New variable: income squared.
    "income_squared": lambda x: x.income**2,
    # New variable: mean-imputed income quantized with q=3.
    "income_buckets": lambda x: quantize(x.income.fillna(x.income.mean()), q=3),
}

adjusted = sample_with_target.adjust(
    # method="ipw",
    transformations=transformations,
    # formula=formula,
    # penalty_factor=penalty_factor,
    # max_de=None,
)
# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:46:06,995) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:46:06,998) [adjustment/apply_transformations (line 305)]: Adding the variables: ['income_squared', 'income_buckets']

INFO (2025-05-23 17:46:06,998) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:46:07,004) [adjustment/apply_transformations (line 343)]: Final variables in output: ['income_squared', 'income_buckets', 'age_group', 'gender', 'income']

INFO (2025-05-23 17:46:07,015) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:46:07,114) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['income_squared + income_buckets + income + gender + age_group + _is_na_gender']

INFO (2025-05-23 17:46:07,115) [ipw/ipw (line 458)]: The number of columns in the model matrix: 11

INFO (2025-05-23 17:46:07,116) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:46:28,823) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:46:28,824) [ipw/ipw (line 584)]: max_de: None

from balance.util import fct_lump_by, quantize

transformations = {
    "age_group": lambda x: x,
    "gender": lambda x: x,
    # Mean-impute income, then quantize with q=20.
    "income": lambda x: quantize(x.fillna(x.mean()), q=20),
}
# Main effects plus interaction of age_group and gender. income is transformed
# above but not referenced here, and no income coefficient appears in the
# recorded output for this cell.
formula = ["age_group * gender"]
# the penalty is per element in the list of formula:
# penalty_factor = [0.1, 0.1, 0.1]

adjusted = sample_with_target.adjust(
    method="ipw",
    transformations=transformations,
    formula=formula,
    # penalty_factor=penalty_factor,
    # max_de=None,
)

# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:46:29,112) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:46:29,115) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-05-23 17:46:29,115) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:46:29,120) [adjustment/apply_transformations (line 343)]: Final variables in output: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:46:29,127) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:46:29,190) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['age_group * gender']

INFO (2025-05-23 17:46:29,191) [ipw/ipw (line 458)]: The number of columns in the model matrix: 12

INFO (2025-05-23 17:46:29,192) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:46:46,900) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:46:46,901) [ipw/ipw (line 584)]: max_de: None

# Identity transformations: all three covariates are kept unchanged.
transformations = {
    "age_group": lambda x: x,
    "gender": lambda x: x,
    "income": lambda x: x,
}
# The formula is split into two elements so each can get its own penalty.
formula = ["age_group + gender", "income"]
# the penalty is per element in the list of formula:
penalty_factor = [10, 0.1]

adjusted = sample_with_target.adjust(
    method="ipw",
    transformations=transformations,
    formula=formula,
    penalty_factor=penalty_factor,
    # max_de=None,
)

# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:46:47,194) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:46:47,197) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-05-23 17:46:47,198) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:46:47,199) [adjustment/apply_transformations (line 343)]: Final variables in output: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:46:47,205) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:46:47,271) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['age_group + gender', 'income']

INFO (2025-05-23 17:46:47,272) [ipw/ipw (line 458)]: The number of columns in the model matrix: 7

INFO (2025-05-23 17:46:47,273) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:47:15,552) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:47:15,553) [ipw/ipw (line 584)]: max_de: None

# Identity transformations: all three covariates are kept unchanged.
transformations = {
    "age_group": lambda x: x,
    "gender": lambda x: x,
    "income": lambda x: x,
}
# Same two-element formula as with penalty_factor = [10, 0.1]; only the
# penalties are swapped.
formula = ["age_group + gender", "income"]
# the penalty is per element in the list of formula:
penalty_factor = [0.1, 10]  # this is flipped

adjusted = sample_with_target.adjust(
    method="ipw",
    transformations=transformations,
    formula=formula,
    penalty_factor=penalty_factor,
    # max_de=None,
)

# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:47:15,845) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:47:15,848) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-05-23 17:47:15,849) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:47:15,850) [adjustment/apply_transformations (line 343)]: Final variables in output: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:47:15,856) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:47:15,922) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['age_group + gender', 'income']

INFO (2025-05-23 17:47:15,923) [ipw/ipw (line 458)]: The number of columns in the model matrix: 7

INFO (2025-05-23 17:47:15,924) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:47:42,270) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:47:42,270) [ipw/ipw (line 584)]: max_de: None

from balance.util import fct_lump_by, quantize

transformations = {
    "age_group": lambda x: x,
    "gender": lambda x: x,
    "income": lambda x: x,
    # New variable: mean-imputed income quantized with q=4.
    "income_buckets": lambda x: quantize(x.income.fillna(x.income.mean()), q=4),
}
# Three formula elements, matched one-to-one with the three penalties below.
formula = ["age_group + gender", "income", "income_buckets"]
# the penalty is per element in the list of formula:
penalty_factor = [1, 2, 2]

adjusted = sample_with_target.adjust(
    method="ipw",
    transformations=transformations,
    formula=formula,
    penalty_factor=penalty_factor,
    # max_de=None,
)

# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:47:42,552) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:47:42,555) [adjustment/apply_transformations (line 305)]: Adding the variables: ['income_buckets']

INFO (2025-05-23 17:47:42,556) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:47:42,561) [adjustment/apply_transformations (line 343)]: Final variables in output: ['income_buckets', 'age_group', 'gender', 'income']

INFO (2025-05-23 17:47:42,569) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:47:42,665) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['age_group + gender', 'income', 'income_buckets']

INFO (2025-05-23 17:47:42,666) [ipw/ipw (line 458)]: The number of columns in the model matrix: 11

INFO (2025-05-23 17:47:42,667) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:48:02,153) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:48:02,154) [ipw/ipw (line 584)]: max_de: None

from balance.util import fct_lump_by, quantize

transformations = {
    "age_group": lambda x: x,
    "gender": lambda x: x,
    "income": lambda x: x,
    # New variable: mean-imputed income quantized with q=4.
    "income_buckets": lambda x: quantize(x.income.fillna(x.income.mean()), q=4),
}
# The same variables grouped differently: age_group and gender are separate
# elements, while income and income_buckets share one element (and one penalty).
formula = ["age_group", "gender", "income + income_buckets"]
# the penalty is per element in the list of formula:
penalty_factor = [1, 1, 1]

adjusted = sample_with_target.adjust(
    method="ipw",
    transformations=transformations,
    formula=formula,
    penalty_factor=penalty_factor,
    # max_de=None,
)

# Keep only the fitted model coefficients from the diagnostics table.
adj_diag = adjusted.diagnostics()
adj_diag.query("metric == 'model_coef'")

INFO (2025-05-23 17:48:02,440) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:48:02,443) [adjustment/apply_transformations (line 305)]: Adding the variables: ['income_buckets']

INFO (2025-05-23 17:48:02,443) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:48:02,448) [adjustment/apply_transformations (line 343)]: Final variables in output: ['income_buckets', 'age_group', 'gender', 'income']

INFO (2025-05-23 17:48:02,457) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:48:02,556) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['age_group', 'gender', 'income + income_buckets']

INFO (2025-05-23 17:48:02,556) [ipw/ipw (line 458)]: The number of columns in the model matrix: 12

INFO (2025-05-23 17:48:02,557) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:48:18,218) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:48:18,218) [ipw/ipw (line 584)]: max_de: None

# Defaults from the package

# Baseline run: every adjust() argument is left at its default.
adjusted = sample_with_target.adjust()

# Print the adjustment summary and the outcome summary, then draw KDE plots
# of the covariates with seaborn.
print(adjusted.summary())
print(adjusted.outcomes().summary())
adjusted.covars().plot(library="seaborn", dist_type="kde")

INFO (2025-05-23 17:48:18,501) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:48:18,503) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-05-23 17:48:18,504) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['gender', 'age_group', 'income']

INFO (2025-05-23 17:48:18,513) [adjustment/apply_transformations (line 343)]: Final variables in output: ['gender', 'age_group', 'income']

INFO (2025-05-23 17:48:18,519) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:48:18,613) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-05-23 17:48:18,613) [ipw/ipw (line 458)]: The number of columns in the model matrix: 16

INFO (2025-05-23 17:48:18,614) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:48:35,084) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:48:35,085) [ipw/ipw (line 584)]: max_de: None

# No transformations at all

# transformations = None is just like using an identity callable for every
# covariate:
# transformations = {
#     "age_group": lambda x: x,
#     "gender": lambda x: x,
#     "income": lambda x: x,
# }

adjusted = sample_with_target.adjust(method="ipw", transformations=None)

# Print the adjustment summary and the outcome summary, then draw KDE plots
# of the covariates with seaborn.
print(adjusted.summary())
print(adjusted.outcomes().summary())
adjusted.covars().plot(library="seaborn", dist_type="kde")

# slightly smaller design effect, slightly better ASMD reduction.

INFO (2025-05-23 17:48:36,070) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:48:36,072) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:48:36,138) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-05-23 17:48:36,138) [ipw/ipw (line 458)]: The number of columns in the model matrix: 8

INFO (2025-05-23 17:48:36,139) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:48:51,509) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:48:51,510) [ipw/ipw (line 584)]: max_de: None

INFO (2025-05-23 17:48:51,511) [ipw/ipw (line 605)]: Starting model selection

INFO (2025-05-23 17:48:51,513) [ipw/ipw (line 638)]: Chosen lambda: 0.0368353720078807

INFO (2025-05-23 17:48:51,514) [ipw/ipw (line 654)]: Proportion null deviance explained 0.17353587606008936

# No transformations at all
transformations = None
# But passing a squared term of income to the formula.
# In the formula mini-language `**` is a term operator (repeated interaction),
# not arithmetic power, so a bare `income**2` collapses back to `income` and
# adds no column. Wrapping the expression in I(...) makes it evaluate
# arithmetically, which yields an actual squared-income column.
formula = ["age_group + gender + income + I(income**2)"]
# the penalty is per element in the list of formula:
# penalty_factor = [1]

adjusted = sample_with_target.adjust(
    method="ipw",
    transformations=transformations,
    formula=formula,
    # penalty_factor=penalty_factor,
    # max_de=None,
)

# Print the adjustment summary and the outcome summary, then draw KDE plots
# of the covariates with seaborn.
print(adjusted.summary())
print(adjusted.outcomes().summary())
adjusted.covars().plot(library="seaborn", dist_type="kde")

# NOTE(review): the output recorded for this cell was produced with the bare
# `income**2` spelling (7 model-matrix columns, i.e. no squared column). In
# that run Deff was lower but so was the ASMD reduction. Re-run the cell to
# see the effect of the real squared term.

INFO (2025-05-23 17:48:52,487) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:48:52,489) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:48:52,555) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['age_group + gender + income + income**2']

INFO (2025-05-23 17:48:52,556) [ipw/ipw (line 458)]: The number of columns in the model matrix: 7

INFO (2025-05-23 17:48:52,556) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:49:07,262) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:49:07,263) [ipw/ipw (line 584)]: max_de: None

INFO (2025-05-23 17:49:07,264) [ipw/ipw (line 605)]: Starting model selection

INFO (2025-05-23 17:49:07,267) [ipw/ipw (line 638)]: Chosen lambda: 0.0574164245593571

INFO (2025-05-23 17:49:07,267) [ipw/ipw (line 654)]: Proportion null deviance explained 0.17296221715396187

# Keep the raw covariates and add income_buckets: mean-imputed income
# quantized with q=20.
transformations = {
    "age_group": lambda x: x,
    "gender": lambda x: x,
    "income": lambda x: x,
    "income_buckets": lambda x: quantize(x.income.fillna(x.income.mean()), q=20),
}
# Two formula elements; the penalty is per element in the list of formula.
formula = ["age_group + gender", "income_buckets"]
penalty_factor = [1, 0.1]

adjusted = sample_with_target.adjust(
    method="ipw",
    transformations=transformations,
    formula=formula,
    penalty_factor=penalty_factor,
)

# Print the adjustment summary and the outcome summary, then draw KDE plots
# of the covariates with seaborn.
print(adjusted.summary())
print(adjusted.outcomes().summary())
adjusted.covars().plot(library="seaborn", dist_type="kde")

# By adding income_buckets and using it instead of income, as well as putting more weight in it in terms of penalty
# we managed to correct income quite well, but at the expense of age and gender.

INFO (2025-05-23 17:49:08,300) [ipw/ipw (line 399)]: Starting ipw function

INFO (2025-05-23 17:49:08,303) [adjustment/apply_transformations (line 305)]: Adding the variables: ['income_buckets']

INFO (2025-05-23 17:49:08,304) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender', 'income']

INFO (2025-05-23 17:49:08,310) [adjustment/apply_transformations (line 343)]: Final variables in output: ['income_buckets', 'age_group', 'gender', 'income']

INFO (2025-05-23 17:49:08,318) [ipw/ipw (line 433)]: Building model matrix

INFO (2025-05-23 17:49:08,415) [ipw/ipw (line 455)]: The formula used to build the model matrix: ['age_group + gender', 'income_buckets']

INFO (2025-05-23 17:49:08,416) [ipw/ipw (line 458)]: The number of columns in the model matrix: 26

INFO (2025-05-23 17:49:08,416) [ipw/ipw (line 459)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:49:30,971) [ipw/ipw (line 578)]: Done with sklearn

INFO (2025-05-23 17:49:30,972) [ipw/ipw (line 584)]: max_de: None

# Defaults from the package

# Baseline CBPS run: only the method is changed, everything else is default.
adjusted = sample_with_target.adjust(method="cbps")

# Print the adjustment summary and the outcome summary, then draw KDE plots
# of the covariates with seaborn.
print(adjusted.summary())
print(adjusted.outcomes().summary())
adjusted.covars().plot(library="seaborn", dist_type="kde")

# CBPS already corrects a lot. Let's see if we can make it correct a tiny bit more.

INFO (2025-05-23 17:49:31,914) [cbps/cbps (line 411)]: Starting cbps function

INFO (2025-05-23 17:49:31,917) [adjustment/apply_transformations (line 305)]: Adding the variables: []

INFO (2025-05-23 17:49:31,918) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['gender', 'age_group', 'income']

INFO (2025-05-23 17:49:31,927) [adjustment/apply_transformations (line 343)]: Final variables in output: ['gender', 'age_group', 'income']

INFO (2025-05-23 17:49:32,027) [cbps/cbps (line 461)]: The formula used to build the model matrix: ['income + gender + age_group + _is_na_gender']

INFO (2025-05-23 17:49:32,028) [cbps/cbps (line 473)]: The number of columns in the model matrix: 16

INFO (2025-05-23 17:49:32,029) [cbps/cbps (line 474)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:49:32,035) [cbps/cbps (line 543)]: Finding initial estimator for GMM optimization

INFO (2025-05-23 17:49:32,090) [cbps/cbps (line 570)]: Finding initial estimator for GMM optimization that minimizes the balance loss

INFO (2025-05-23 17:49:32,371) [cbps/cbps (line 605)]: Running GMM optimization

import numpy as np

# Replace raw income with two derived variables (log-income and q=5 buckets).
# income has no entry of its own, and the recorded log reports it as dropped.
transformations = {
    "age_group": lambda x: x,
    "gender": lambda x: x,
    # "income": lambda x: x,
    # Natural log of mean-imputed income.
    "income_log": lambda x: np.log(x.income.fillna(x.income.mean())),
    # Mean-imputed income quantized with q=5.
    "income_buckets": lambda x: quantize(x.income.fillna(x.income.mean()), q=5),
}
# Single formula element: age_group and gender main effects, plus income_log,
# income_buckets and their interaction.
formula = ["age_group + gender + income_log * income_buckets"]

adjusted = sample_with_target.adjust(
    method="cbps",
    transformations=transformations,
    formula=formula,
    # penalty_factor=penalty_factor, # CBPS seems to ignore the penalty factor.
    # max_de=None,
)

# Print the adjustment summary and the outcome summary, then draw KDE plots
# of the covariates with seaborn.
print(adjusted.summary())
print(adjusted.outcomes().summary())
adjusted.covars().plot(library="seaborn", dist_type="kde")

# Trying various transformations gives slightly different results (some effect on the outcome, Deff and ASMD) - but nothing too major here.

INFO (2025-05-23 17:49:33,782) [cbps/cbps (line 411)]: Starting cbps function

INFO (2025-05-23 17:49:33,785) [adjustment/apply_transformations (line 305)]: Adding the variables: ['income_log', 'income_buckets']

INFO (2025-05-23 17:49:33,786) [adjustment/apply_transformations (line 306)]: Transforming the variables: ['age_group', 'gender']

WARNING (2025-05-23 17:49:33,791) [adjustment/apply_transformations (line 340)]: Dropping the variables: ['income']

INFO (2025-05-23 17:49:33,792) [adjustment/apply_transformations (line 343)]: Final variables in output: ['income_log', 'income_buckets', 'age_group', 'gender']

INFO (2025-05-23 17:49:33,894) [cbps/cbps (line 461)]: The formula used to build the model matrix: ['age_group + gender + income_log * income_buckets']

INFO (2025-05-23 17:49:33,895) [cbps/cbps (line 473)]: The number of columns in the model matrix: 15

INFO (2025-05-23 17:49:33,896) [cbps/cbps (line 474)]: The number of rows in the model matrix: 11000

INFO (2025-05-23 17:49:33,901) [cbps/cbps (line 543)]: Finding initial estimator for GMM optimization

INFO (2025-05-23 17:49:34,003) [cbps/cbps (line 570)]: Finding initial estimator for GMM optimization that minimizes the balance loss

# Session info: print the versions of the loaded packages (including
# dependencies) as plain text, for reproducibility.
import session_info
session_info.show(html=False, dependencies=True)

-----
balance             0.10.0
numpy               1.26.4
pandas              2.0.3
session_info        v1.0.1
-----
PIL                         11.2.1
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        25.3.0
attrs                       25.3.0
babel                       2.17.0
certifi                     2025.04.26
charset_normalizer          3.4.2
comm                        0.2.2
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.9.0.post0
debugpy                     1.8.14
decorator                   5.2.1
defusedxml                  0.7.1
exceptiongroup              1.3.0
executing                   2.2.0
fastjsonschema              NA
fqdn                        NA
idna                        3.10
importlib_metadata          NA
importlib_resources         NA
ipfn                        NA
ipykernel                   6.29.5
isoduration                 NA
jedi                        0.19.2
jinja2                      3.1.6
joblib                      1.5.1
json5                       0.12.0
jsonpointer                 3.0.0
jsonschema                  4.23.0
jsonschema_specifications   NA
jupyter_events              0.12.0
jupyter_server              2.16.0
jupyterlab_server           2.27.3
kiwisolver                  1.4.7
markupsafe                  3.0.2
matplotlib                  3.9.4
matplotlib_inline           0.1.7
mpl_toolkits                NA
narwhals                    1.40.0
nbformat                    5.10.4
overrides                   NA
packaging                   25.0
parso                       0.8.4
patsy                       1.0.1
pexpect                     4.9.0
platformdirs                4.3.8
plotly                      6.1.1
prometheus_client           NA
prompt_toolkit              3.0.51
psutil                      7.0.0
ptyprocess                  0.7.0
pure_eval                   0.2.3
pydev_ipython               NA
pydevconsole                NA
pydevd                      3.2.3
pydevd_file_utils           NA
pydevd_plugins              NA
pydevd_tracing              NA
pygments                    2.19.1
pyparsing                   3.2.3
pythonjsonlogger            NA
pytz                        2025.2
referencing                 NA
requests                    2.32.3
rfc3339_validator           0.1.4
rfc3986_validator           0.1.1
rpds                        NA
scipy                       1.10.1
seaborn                     0.13.2
send2trash                  NA
six                         1.17.0
sklearn                     1.2.2
sniffio                     1.3.1
sphinxcontrib               NA
stack_data                  0.6.3
statsmodels                 0.14.4
threadpoolctl               3.6.0
tornado                     6.5.1
traitlets                   5.14.3
typing_extensions           NA
uri_template                NA
urllib3                     2.4.0
wcwidth                     0.2.13
webcolors                   NA
websocket                   1.8.0
yaml                        6.0.2
zipp                        NA
zmq                         26.4.0
zoneinfo                    NA
-----
IPython             8.18.1
jupyter_client      8.6.3
jupyter_core        5.7.2
jupyterlab          4.4.2
notebook            7.4.2
-----
Python 3.9.22 (main, Apr  8 2025, 21:45:32) [GCC 13.3.0]
Linux-6.11.0-1014-azure-x86_64-with-glibc2.39
-----
Session information updated at 2025-05-23 17:49

/opt/hostedtoolcache/Python/3.9.22/x64/lib/python3.9/site-packages/session_info/main.py:213: UserWarning:

The '__version__' attribute is deprecated and will be removed in MarkupSafe 3.1. Use feature detection, or `importlib.metadata.version("markupsafe")`, instead.

balance: transformations and formulas¶

Example dataset - preparing the objects¶

Transformations¶

Basic usage: manipulating existing variables¶

Creating new variables¶

Formula¶

Formula and penalty_factor¶

The impact of transformations and formulas¶

ipw¶

CBPS¶

	metric	val	var
44	model_coef	0.142821	intercept
45	model_coef	0.043803	_is_na_gender[T.True]
46	model_coef	-0.203801	age_group[T.25-34]
47	model_coef	-0.428742	age_group[T.35-44]
48	model_coef	-0.529629	age_group[T.45+]
49	model_coef	0.332477	gender[T.Male]
50	model_coef	0.043803	gender[T._NA]
51	model_coef	0.168359	income[Interval(-0.0009997440000000001, 0.44, ...
52	model_coef	0.152978	income[Interval(0.44, 1.664, closed='right')]
53	model_coef	0.11004	income[Interval(1.664, 3.472, closed='right')]
54	model_coef	-0.042502	income[Interval(11.312, 15.139, closed='right')]
55	model_coef	-0.162192	income[Interval(15.139, 20.567, closed='right')]
56	model_coef	-0.212078	income[Interval(20.567, 29.504, closed='right')]
57	model_coef	-0.358711	income[Interval(29.504, 128.536, closed='right')]
58	model_coef	0.092556	income[Interval(3.472, 5.663, closed='right')]
59	model_coef	0.071799	income[Interval(5.663, 8.211, closed='right')]
60	model_coef	0.00469	income[Interval(8.211, 11.312, closed='right')]

	metric	val	var
44	model_coef	0.998089	intercept
45	model_coef	0.000043	_is_na_gender[T.True]
46	model_coef	-0.212989	age_group[T.25-34]
47	model_coef	-0.440492	age_group[T.35-44]
48	model_coef	-0.545653	age_group[T.45+]
49	model_coef	-0.188196	gender[Female]
50	model_coef	0.18171	gender[Male]
51	model_coef	0.000043	gender[_NA]
52	model_coef	-0.570551	income

	metric	val	var
44	model_coef	-0.327112	intercept
45	model_coef	0.031737	_is_na_gender[T.True]
46	model_coef	-0.181989	age_group[T.35-44]
47	model_coef	0.098871	age_group[T._lumped_other]
48	model_coef	0.241586	gender[T.Male]
49	model_coef	0.031737	gender[T._NA]
50	model_coef	0.19926	income[Interval(-0.0009997440000000001, 4.194,...
51	model_coef	-0.283084	income[Interval(13.693, 128.536, closed='right')]
52	model_coef	0.048146	income[Interval(4.194, 13.693, closed='right')]

	metric	val	var
44	model_coef	-0.035465	intercept
45	model_coef	0.001051	_is_na_gender[T.True]
46	model_coef	-0.141695	gender[Female]
47	model_coef	0.136225	gender[Male]
48	model_coef	0.001051	gender[_NA]

	metric	val	var
44	model_coef	0.41745	intercept
45	model_coef	0.044654	_is_na_gender[T.True]
46	model_coef	-0.194436	age_group[T.25-34]
47	model_coef	-0.421207	age_group[T.35-44]
48	model_coef	-0.521732	age_group[T.45+]
49	model_coef	0.325806	gender[T.Male]
50	model_coef	0.044654	gender[T._NA]
51	model_coef	-0.26602	income
52	model_coef	0.113306	income_buckets[Interval(-0.0009997440000000001...
53	model_coef	-0.166366	income_buckets[Interval(13.693, 128.536, close...
54	model_coef	0.032296	income_buckets[Interval(4.194, 13.693, closed=...
55	model_coef	-0.185931	income_squared

	metric	val	var
44	model_coef	-0.348132	intercept
45	model_coef	0.334726	age_group[18-24]
46	model_coef	0.005414	age_group[25-34]
47	model_coef	-0.160449	age_group[35-44]
48	model_coef	-0.280501	age_group[45+]
49	model_coef	0.032004	age_group[T.25-34]:gender[T.Male]
50	model_coef	0.016476	age_group[T.25-34]:gender[T._NA]
51	model_coef	-0.037461	age_group[T.35-44]:gender[T.Male]
52	model_coef	-0.046181	age_group[T.35-44]:gender[T._NA]
53	model_coef	-0.031939	age_group[T.45+]:gender[T.Male]
54	model_coef	-0.042359	age_group[T.45+]:gender[T._NA]
55	model_coef	0.271811	gender[T.Male]
56	model_coef	0.06233	gender[T._NA]

	metric	val	var
44	model_coef	0.243384	intercept
45	model_coef	3.238671	age_group[18-24]
46	model_coef	0.394707	age_group[25-34]
47	model_coef	-1.759083	age_group[35-44]
48	model_coef	-2.919449	age_group[45+]
49	model_coef	2.599977	gender[T.Male]
50	model_coef	0.487265	gender[T._NA]
51	model_coef	-0.073744	income

	metric	val	var
44	model_coef	-0.436751	intercept
45	model_coef	0.053355	age_group[18-24]
46	model_coef	0.014525	age_group[25-34]
47	model_coef	-0.014981	age_group[35-44]
48	model_coef	-0.037504	age_group[45+]
49	model_coef	0.041851	gender[T.Male]
50	model_coef	0.011851	gender[T._NA]
51	model_coef	-3.824697	income

	metric	val	var
44	model_coef	-0.286191	intercept
45	model_coef	0.378645	age_group[18-24]
46	model_coef	0.047595	age_group[25-34]
47	model_coef	-0.199745	age_group[35-44]
48	model_coef	-0.350982	age_group[45+]
49	model_coef	0.321907	gender[T.Male]
50	model_coef	0.074456	gender[T._NA]
51	model_coef	-0.418356	income
52	model_coef	0.216575	income_buckets[Interval(-0.0009997440000000001...
53	model_coef	-0.366924	income_buckets[Interval(17.694, 128.536, close...
54	model_coef	0.12985	income_buckets[Interval(2.53, 8.211, closed='r...
55	model_coef	-0.05101	income_buckets[Interval(8.211, 17.694, closed=...

	metric	val	var
44	model_coef	0.084942	intercept
45	model_coef	0.327596	age_group[18-24]
46	model_coef	0.039282	age_group[25-34]
47	model_coef	-0.176394	age_group[35-44]
48	model_coef	-0.296639	age_group[45+]
49	model_coef	-0.163857	gender[Female]
50	model_coef	0.158443	gender[Male]
51	model_coef	-0.000375	gender[_NA]
52	model_coef	-0.258364	income
53	model_coef	0.122028	income_buckets[Interval(-0.0009997440000000001...
54	model_coef	-0.213971	income_buckets[Interval(17.694, 128.536, close...
55	model_coef	0.076064	income_buckets[Interval(2.53, 8.211, closed='r...
56	model_coef	-0.025416	income_buckets[Interval(8.211, 17.694, closed=...