# Source code for balance.stats_and_plots.ascii_plots

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from __future__ import annotations

import logging
from typing import Dict, List, Optional, Tuple

import numpy as np
import numpy.typing as npt
import pandas as pd
from balance.stats_and_plots.weighted_comparisons_plots import (
    DataFrameWithWeight,
    naming_legend,
)
from balance.stats_and_plots.weighted_stats import relative_frequency_table
from balance.stats_and_plots.weights_stats import _check_weights_are_valid
from balance.util import choose_variables, rm_mutual_nas

# Module-level logger, namespaced under the `balance` package.
logger: logging.Logger = logging.getLogger(__package__)

# Characters used to distinguish datasets in ASCII bars.
# Each dataset gets a unique character from this list; when there are more
# datasets than characters, indexing wraps around (i % len(BAR_CHARS)).
BAR_CHARS: List[str] = ["█", "▒", "▐", "░", "▄", "▀"]

# Preferred ordering for comparative plots: population first, then adjusted,
# then sample.  Known internal names are placed in this order; any unknown
# names are appended at the end in their original order.
_PREFERRED_NAME_ORDER: List[str] = ["target", "self", "adjusted", "unadjusted"]


def _reorder_dfs_and_names(
    dfs: List[DataFrameWithWeight],
    names: List[str],
) -> Tuple[List[DataFrameWithWeight], List[str]]:
    """Reorder *dfs* and *names* into the preferred display order.

    Known internal names are placed first, in the order given by
    ``_PREFERRED_NAME_ORDER`` (population/``target``, then ``self``,
    ``adjusted``, ``unadjusted``); unrecognized names follow, keeping
    their original relative order.
    """
    rank = {name: pos for pos, name in enumerate(_PREFERRED_NAME_ORDER)}
    unknown_rank = len(_PREFERRED_NAME_ORDER)
    # Stable permutation: known names by preferred rank, ties (and all
    # unknown names) broken by original index.
    permutation = sorted(
        range(len(names)),
        key=lambda idx: (rank.get(names[idx], unknown_rank), idx),
    )
    return (
        [dfs[idx] for idx in permutation],
        [names[idx] for idx in permutation],
    )


def _auto_n_bins(n_samples: int, n_unique: int) -> int:
    """Pick a number of bins using Sturges' rule, capped at unique values."""
    import math

    if n_samples <= 1:
        return 1
    sturges = math.ceil(math.log2(n_samples) + 1)
    # Don't exceed the number of unique values, and clamp to [2, 50]
    return max(2, min(sturges, n_unique, 50))


def _auto_bar_width(label_width: int) -> int:
    """Pick bar_width to fit within terminal width.

    Used by grouped barplots and histograms where each dataset gets its own
    line within a row (single bar per line).
    """
    import shutil

    term_width = shutil.get_terminal_size((80, 24)).columns
    # Each line: label_width + " | " (3) + bar + " (XX.X%)" (9)
    available = term_width - label_width - 3 - 9
    return max(10, available)


def _auto_bar_width_columnar(range_width: int, n_columns: int) -> int:
    """Pick per-column bar_width for a columnar (side-by-side) layout.

    Used by :func:`ascii_comparative_hist` where all datasets are rendered as
    columns on the same line.  Each column needs space for the bar, a
    percentage string (~6 chars), and inter-column separators (`` | ``, 3
    chars each).
    """
    import shutil

    term_width = shutil.get_terminal_size((80, 24)).columns
    # "Range  | col1 | col2 | ..."
    # range_width + " | " (3+1 for padding) consumed by the label column
    available = term_width - range_width - 4
    per_col = max(10, (available - (n_columns - 1) * 3) // n_columns - 6)
    return per_col


def _weighted_histogram(
    values: pd.Series,
    weights: Optional[pd.Series],
    bin_edges: npt.NDArray[np.floating],
) -> npt.NDArray[np.floating]:
    """Compute a weighted histogram normalized to per-bin proportions.

    Args:
        values: Numeric data values.
        weights: Optional weights; ``None`` means uniform weighting.
        bin_edges: Pre-computed bin edges of length ``n_bins + 1``.

    Returns:
        Proportion per bin (sums to 1.0, or all zeros when no positive
        mass falls inside the bins).
    """
    _check_weights_are_valid(weights)
    w_arr: Optional[npt.NDArray[np.floating]]
    w_arr = np.asarray(weights, dtype=float) if weights is not None else None
    counts, _ = np.histogram(values, bins=bin_edges, weights=w_arr)
    total = counts.sum()
    if total > 0:
        return counts / total
    return np.zeros_like(counts, dtype=float)


def _render_horizontal_bars(
    label: str,
    proportions: Dict[str, float],
    legend_names: List[str],
    bar_width: int,
    max_value: float,
    label_width: int,
) -> str:
    """Render one category or bin as a group of horizontal bars.

    One line per dataset, drawn with that dataset's fill character from
    ``BAR_CHARS`` and terminated by a percentage label.  A proportion
    that is non-zero but rounds to a zero-length bar is drawn as a single
    dot (``.``) so "present but tiny" is distinguishable from "truly
    zero".  Only the first line of the group carries the label; the
    remaining lines are padded with spaces to the same width.

    Args:
        label: The category label or bin range string.
        proportions: Dataset legend name -> proportion value.
        legend_names: Ordered legend names (fixes line order).
        bar_width: Character width of the longest possible bar.
        max_value: Maximum proportion across all bars (scaling anchor).
        label_width: Character width reserved for the label column.

    Returns:
        Multi-line string of the grouped bars for this label.
    """
    rendered: List[str] = []
    for idx, dataset in enumerate(legend_names):
        value = proportions.get(dataset, 0.0)
        fill = BAR_CHARS[idx % len(BAR_CHARS)]
        length = int(round((value / max_value) * bar_width)) if max_value > 0 else 0
        if length > 0:
            bar = fill * length
        elif value > 0:
            bar = "."  # non-zero but too small to render a full character
        else:
            bar = ""
        prefix = label.ljust(label_width) if idx == 0 else " " * label_width
        rendered.append(f"{prefix} | {bar} ({value:.1%})")
    return "\n".join(rendered)


def _build_legend(legend_names: List[str]) -> str:
    """Build a legend mapping bar characters to dataset names.

    Args:
        legend_names: Ordered list of dataset legend names.

    Returns:
        A two-line string: the first line pairs each dataset's bar
        character (from ``BAR_CHARS``) with its name; the second explains
        how bar lengths should be read.
    """
    entries = [
        f"{BAR_CHARS[idx % len(BAR_CHARS)]} {name}"
        for idx, name in enumerate(legend_names)
    ]
    return (
        "Legend: "
        + "  ".join(entries)
        + "\nBar lengths are proportional to weighted frequency within each dataset."
    )


def ascii_plot_bar(
    dfs: List[DataFrameWithWeight],
    names: List[str],
    column: str,
    weighted: bool = True,
    bar_width: Optional[int] = None,
    dist_type: Optional[str] = None,
    separate_categories: bool = True,
) -> str:
    """Render an ASCII grouped barplot for one categorical variable.

    Weighted proportions per dataset are computed with
    :func:`relative_frequency_table` and drawn as grouped horizontal
    bars.  Each row is a category value; within a row, every dataset gets
    its own bar with a distinct fill character.  The percentage after
    each bar is the weighted share of that category *within its dataset*
    (each dataset's percentages sum to 100%), and bars are scaled so that
    the largest proportion anywhere spans the full ``bar_width``.

    Args:
        dfs: List of DataFrameWithWeight dicts.
        names: Names for each DataFrame (e.g., ["self", "target"]).
        column: The categorical column name to plot.
        weighted: Whether to use weights. Defaults to True.
        bar_width: Maximum character width for bars; None (default)
            auto-detects from the terminal width.
        dist_type: Accepted for compatibility but only "hist_ascii" is
            supported; any other value logs a warning and is ignored.
        separate_categories: If True (default), insert a blank line
            between categories for readability.

    Returns:
        ASCII barplot text for this variable.
    """
    if dist_type is not None and dist_type != "hist_ascii":
        logger.warning(
            f"ASCII plots only support dist_type='hist_ascii'. "
            f"Ignoring dist_type='{dist_type}' and using 'hist_ascii'."
        )
    legend_names: List[str] = [naming_legend(n, names) for n in names]

    # Per-dataset relative-frequency tables, tagged with the legend name.
    all_props: List[pd.DataFrame] = []
    for ii, d in enumerate(dfs):
        a_series = d["df"][column]
        _w = d["weight"]
        if weighted and _w is not None:
            a_series, _w = rm_mutual_nas(a_series, _w)
            # rm_mutual_nas may drop the series name; restore it.
            a_series.name = column
        freq_table = relative_frequency_table(a_series, w=_w if weighted else None)
        freq_table["dataset"] = legend_names[ii]
        all_props.append(freq_table)
    combined = pd.concat(all_props, ignore_index=True)

    # Category order follows first appearance in the combined table.
    categories: List[str] = list(combined[column].unique())
    # Largest proportion anywhere anchors the bar scaling.
    max_value: float = float(combined["prop"].max()) if len(combined) > 0 else 1.0

    label_width = max(len(str(c)) for c in categories) if categories else 8
    label_width = max(label_width, 8)  # at least as wide as "Category"
    if bar_width is None:
        bar_width = _auto_bar_width(label_width)

    lines: List[str] = [f"=== {column} (categorical) ===", ""]
    header_label = "Category".ljust(label_width)
    lines.append(f"{header_label} | {' '.join(legend_names)}")
    lines.append(f"{' ' * label_width} |")
    for ci, cat in enumerate(categories):
        if separate_categories and ci > 0:
            lines.append("")
        cat_data = combined[combined[column] == cat]
        proportions: Dict[str, float] = {
            row["dataset"]: float(row["prop"]) for _, row in cat_data.iterrows()
        }
        lines.append(
            _render_horizontal_bars(
                str(cat), proportions, legend_names, bar_width, max_value, label_width
            )
        )
    lines.append("")
    lines.append(_build_legend(legend_names))
    lines.append("")
    return "\n".join(lines)
def ascii_plot_hist(
    dfs: List[DataFrameWithWeight],
    names: List[str],
    column: str,
    weighted: bool = True,
    n_bins: Optional[int] = None,
    bar_width: Optional[int] = None,
    dist_type: Optional[str] = None,
) -> str:
    """Render an ASCII grouped-bar histogram for one numeric variable.

    All datasets share a common bin range (and hence bin edges); each bin
    becomes a row, and within a row every dataset gets its own bar with a
    distinct fill character.  The percentage after each bar is the
    weighted share of observations falling in that bin *within its
    dataset* (each dataset's percentages sum to 100%), and bars are
    scaled so the largest proportion anywhere spans the full
    ``bar_width``.

    Args:
        dfs: List of DataFrameWithWeight dicts.
        names: Names for each DataFrame (e.g., ["self", "target"]).
        column: The numeric column name to plot.
        weighted: Whether to use weights. Defaults to True.
        n_bins: Number of histogram bins; None (default) auto-detects
            using Sturges' rule.
        bar_width: Maximum character width for bars; None (default)
            auto-detects from the terminal width.
        dist_type: Accepted for compatibility but only "hist_ascii" is
            supported; any other value logs a warning and is ignored.

    Returns:
        ASCII histogram text for this variable.
    """
    if dist_type is not None and dist_type != "hist_ascii":
        logger.warning(
            f"ASCII plots only support dist_type='hist_ascii'. "
            f"Ignoring dist_type='{dist_type}' and using 'hist_ascii'."
        )
    legend_names: List[str] = [naming_legend(n, names) for n in names]

    # Collect per-dataset values/weights with missing values removed.
    all_values: List[pd.Series] = []
    all_weights: List[Optional[pd.Series]] = []
    for d in dfs:
        a_series = d["df"][column]
        _w = d["weight"]
        if weighted and _w is not None:
            a_series, _w = rm_mutual_nas(a_series, _w)
        else:
            a_series = a_series.dropna()
        all_values.append(a_series)
        all_weights.append(_w if weighted else None)

    # Shared bin edges across all datasets.
    combined_values = pd.concat(all_values, ignore_index=True)
    if len(combined_values) == 0:
        return f"=== {column} (numeric) ===\n\nNo data available.\n"
    if n_bins is None:
        n_bins = _auto_n_bins(len(combined_values), combined_values.nunique())
    global_min = float(combined_values.min())
    global_max = float(combined_values.max())
    if global_min == global_max:
        # Degenerate range (all values equal): widen to a unit interval.
        global_min = global_min - 0.5
        global_max = global_max + 0.5
    bin_edges = np.linspace(global_min, global_max, n_bins + 1)

    hist_data: List[npt.NDArray[np.floating]] = [
        _weighted_histogram(vals, wts, bin_edges)
        for vals, wts in zip(all_values, all_weights)
    ]

    # Bin labels: half-open intervals, last bin closed on the right.
    bin_labels: List[str] = []
    for i in range(n_bins):
        left = bin_edges[i]
        right = bin_edges[i + 1]
        bracket_right = "]" if i == n_bins - 1 else ")"
        bin_labels.append(f"[{left:,.2f}, {right:,.2f}{bracket_right}")

    max_value: float = max(float(h.max()) for h in hist_data) if hist_data else 1.0
    label_width = max(len(lbl) for lbl in bin_labels) if bin_labels else 8
    label_width = max(label_width, 3)  # at least as wide as "Bin"
    if bar_width is None:
        bar_width = _auto_bar_width(label_width)

    lines: List[str] = [f"=== {column} (numeric) ===", ""]
    header_label = "Bin".ljust(label_width)
    lines.append(f"{header_label} | {' '.join(legend_names)}")
    lines.append(f"{' ' * label_width} |")
    for bi, lbl in enumerate(bin_labels):
        proportions: Dict[str, float] = {
            name: float(hist_data[di][bi]) for di, name in enumerate(legend_names)
        }
        lines.append(
            _render_horizontal_bars(
                lbl, proportions, legend_names, bar_width, max_value, label_width
            )
        )
    lines.append("")
    lines.append(_build_legend(legend_names))
    lines.append("")
    return "\n".join(lines)
def ascii_comparative_hist(
    dfs: List[DataFrameWithWeight],
    names: List[str],
    column: str,
    weighted: bool = True,
    n_bins: Optional[int] = None,
    bar_width: Optional[int] = None,
) -> str:
    """Render a columnar, baseline-relative ASCII histogram.

    The first dataset is the baseline; subsequent datasets show bars
    split into segments that indicate how each bin compares to it:

    - ``█`` (solid fill): the portion matching the baseline proportion.
    - ``▒`` (medium shade): mass *exceeding* the baseline in this bin.
    - `` ]`` (right bracket): a *deficit* — the gap before the bracket
      shows how much mass is missing relative to the baseline.
    - A number without a bar means the percentage is too small to render
      at the chosen ``bar_width``.

    All percentages are normalized so each column sums to 100%.

    Args:
        dfs: List of DataFrameWithWeight dicts; the first entry is the
            comparison baseline.
        names: Names for each DataFrame (e.g., ["Target", "Sample"]).
        column: The numeric column name to plot.
        weighted: Whether to use weights. Defaults to True.
        n_bins: Number of histogram bins; None (default) auto-detects
            using Sturges' rule.
        bar_width: Maximum character width for bars; None (default)
            auto-detects from the terminal width.

    Returns:
        ASCII comparative histogram text.
    """
    legend_names: List[str] = [naming_legend(n, names) for n in names]

    # Collect per-dataset values/weights with missing values removed.
    all_values: List[pd.Series] = []
    all_weights: List[Optional[pd.Series]] = []
    for d in dfs:
        a_series = d["df"][column]
        _w = d["weight"]
        if weighted and _w is not None:
            a_series, _w = rm_mutual_nas(a_series, _w)
        else:
            a_series = a_series.dropna()
        all_values.append(a_series)
        all_weights.append(_w if weighted else None)

    # Shared bin edges across all datasets.
    combined_values = pd.concat(all_values, ignore_index=True)
    if len(combined_values) == 0:
        return "No data available."
    if n_bins is None:
        n_bins = _auto_n_bins(len(combined_values), combined_values.nunique())
    global_min = float(combined_values.min())
    global_max = float(combined_values.max())
    if global_min == global_max:
        # Degenerate range (all values equal): widen to a unit interval.
        global_min = global_min - 0.5
        global_max = global_max + 0.5
    bin_edges = np.linspace(global_min, global_max, n_bins + 1)

    # Per-dataset histograms, expressed as percentages.
    hist_pcts: List[List[float]] = [
        [float(p) * 100.0 for p in _weighted_histogram(vals, wts, bin_edges)]
        for vals, wts in zip(all_values, all_weights)
    ]
    # Largest percentage anywhere anchors the bar scaling.
    max_pct: float = max(
        (pct for pcts in hist_pcts for pct in pcts),
        default=0.0,
    )

    # Bin labels: half-open intervals, last bin closed on the right.
    bin_labels: List[str] = []
    for i in range(n_bins):
        left = bin_edges[i]
        right = bin_edges[i + 1]
        bracket_right = "]" if i == n_bins - 1 else ")"
        bin_labels.append(f"[{left:,.2f}, {right:,.2f}{bracket_right}")

    # Range column width (needed before bar_width auto-detection).
    range_header = "Range"
    range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels))
    if bar_width is None:
        bar_width = _auto_bar_width_columnar(range_width, len(legend_names))

    baseline_pcts = hist_pcts[0]

    # cell_strings[dataset_idx][bin_idx] = "bar_chars pct"
    cell_strings: List[List[str]] = []
    for di in range(len(hist_pcts)):
        cells: List[str] = []
        for bi in range(n_bins):
            pct = hist_pcts[di][bi]
            bar_len = round((pct / max_pct) * bar_width) if max_pct > 0 else 0
            if di == 0:
                # Baseline column: plain filled bars.
                bar = "█" * bar_len
            else:
                base_pct = baseline_pcts[bi]
                baseline_len = (
                    round((base_pct / max_pct) * bar_width) if max_pct > 0 else 0
                )
                if bar_len >= baseline_len:
                    # Shared part solid, excess over baseline shaded.
                    bar = "█" * baseline_len + "▒" * (bar_len - baseline_len)
                else:
                    # Deficit: gap then a closing bracket at baseline length.
                    deficit = baseline_len - bar_len
                    if deficit >= 2:
                        bar = "█" * bar_len + " " * (deficit - 1) + "]"
                    else:
                        bar = "█" * bar_len + "]"
            pct_str = f"{pct:.1f}"
            cells.append(f"{bar} {pct_str}" if bar else pct_str)
        cell_strings.append(cells)

    # Column widths: wide enough for the header and the widest cell.
    col_widths: List[int] = []
    for di in range(len(legend_names)):
        header_w = len(f"{legend_names[di]} (%)")
        max_cell_w = max(len(cell_strings[di][bi]) for bi in range(n_bins))
        col_widths.append(max(header_w, max_cell_w))

    lines: List[str] = [f"=== {column} (numeric, comparative) ===", ""]
    header_parts = [range_header.ljust(range_width)]
    for di in range(len(legend_names)):
        header_parts.append(f"{legend_names[di]} (%)".ljust(col_widths[di]))
    lines.append(" | ".join(header_parts))
    sep_width = range_width + sum(col_widths) + 3 * len(col_widths)
    lines.append("-" * sep_width)
    for bi in range(n_bins):
        row_parts = [bin_labels[bi].ljust(range_width)]
        for di in range(len(legend_names)):
            row_parts.append(cell_strings[di][bi].ljust(col_widths[di]))
        lines.append(" | ".join(row_parts))
    lines.append("-" * sep_width)
    total_parts = ["Total".ljust(range_width)]
    for di in range(len(hist_pcts)):
        total_val = sum(hist_pcts[di])
        total_parts.append(f"{total_val:.1f}".ljust(col_widths[di]))
    lines.append(" | ".join(total_parts))
    # Key line only makes sense when there are non-baseline columns.
    if len(legend_names) > 1:
        lines.append("")
        lines.append(
            f"Key: █ = shared with {legend_names[0]},"
            " ▒ = excess, ] = deficit"
        )
    return "\n".join(lines)
def ascii_plot_dist(
    dfs: List[DataFrameWithWeight],
    names: Optional[List[str]] = None,
    variables: Optional[List[str]] = None,
    numeric_n_values_threshold: int = 15,
    weighted: bool = True,
    n_bins: Optional[int] = None,
    bar_width: Optional[int] = None,
    dist_type: Optional[str] = None,
    separate_categories: bool = True,
    comparative: bool = True,
) -> str:
    """Produce ASCII text comparing weighted distributions across datasets.

    Iterates over variables, classifying each as categorical or numeric,
    then delegates to the appropriate renderer:

    - Categorical variables always use :func:`ascii_plot_bar`.
    - Numeric variables use :func:`ascii_comparative_hist` when
      ``comparative=True`` (the default), which treats the first dataset
      as a baseline and shows excess/deficit relative to it; otherwise
      :func:`ascii_plot_hist`, a grouped-bar layout.

    The output is both printed to stdout and returned as a string.

    Args:
        dfs: List of DataFrameWithWeight dicts.
        names: Names for each DataFrame (e.g., ["self", "unadjusted",
            "target"]).  None defaults to "df_0", "df_1", etc.
        variables: Subset of variables to plot; None means all.
        numeric_n_values_threshold: Columns with fewer unique values than
            this are treated as categorical. Defaults to 15.
        weighted: Whether to use weights. Defaults to True.
        n_bins: Number of bins for numeric histograms; None (default)
            auto-detects using Sturges' rule.
        bar_width: Maximum character width for the longest bar; None
            (default) auto-detects from the terminal width.
        dist_type: Accepted for compatibility but only "hist_ascii" is
            supported; any other value logs a warning and is ignored.
        separate_categories: If True (default), insert a blank line
            between categories in barplots for readability.
        comparative: Selects the numeric rendering mode (see above).

    Returns:
        The full ASCII output text.
    """
    if dist_type is not None and dist_type != "hist_ascii":
        logger.warning(
            f"ASCII plots only support dist_type='hist_ascii'. "
            f"Ignoring dist_type='{dist_type}' and using 'hist_ascii'."
        )
    if names is None:
        names = [f"df_{i}" for i in range(len(dfs))]
    # Reorder so comparative plots show: population, adjusted, sample.
    dfs, names = _reorder_dfs_and_names(dfs, names)

    variables = choose_variables(*(d["df"] for d in dfs), variables=variables)
    logger.debug(f"ASCII plotting variables {variables}")
    # NOTE: numeric/categorical classification of a column is based on the
    # dtypes of the first DataFrame only.
    numeric_variables = dfs[0]["df"].select_dtypes(exclude=["object"]).columns.values

    output_parts: List[str] = []
    for o in variables:
        # Maximum number of non-missing unique values across all dfs.
        n_values = max(len(set(rm_mutual_nas(d["df"].loc[:, o].values))) for d in dfs)
        if n_values == 0:
            logger.warning(f"No nonmissing values for variable '{o}', skipping")
            continue
        categorical = (o not in numeric_variables) or (
            n_values < numeric_n_values_threshold
        )
        if categorical:
            output_parts.append(
                ascii_plot_bar(
                    dfs,
                    names,
                    o,
                    weighted=weighted,
                    bar_width=bar_width,
                    separate_categories=separate_categories,
                )
            )
        elif comparative:
            output_parts.append(
                ascii_comparative_hist(
                    dfs,
                    names,
                    o,
                    weighted=weighted,
                    n_bins=n_bins,
                    bar_width=bar_width,
                )
            )
        else:
            output_parts.append(
                ascii_plot_hist(
                    dfs,
                    names,
                    o,
                    weighted=weighted,
                    n_bins=n_bins,
                    bar_width=bar_width,
                )
            )
    result = "\n".join(output_parts)
    print(result)
    return result