Source code for balance.stats_and_plots.ascii_plots

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from __future__ import annotations

import logging
from typing import Dict, List, Optional, Tuple

import numpy as np
import numpy.typing as npt
import pandas as pd
from balance.stats_and_plots.weighted_comparisons_plots import (
    DataFrameWithWeight,
    naming_legend,
)
from balance.stats_and_plots.weighted_stats import relative_frequency_table
from balance.stats_and_plots.weights_stats import _check_weights_are_valid
from balance.util import choose_variables, rm_mutual_nas

logger: logging.Logger = logging.getLogger(__package__)

# Characters used to distinguish datasets in ASCII bars.
# Each dataset gets a unique character from this list.
BAR_CHARS: List[str] = ["█", "▒", "▐", "░", "▄", "▀"]

# Preferred ordering for comparative plots: population first, then adjusted,
# then sample.  Known internal names are placed in this order; any unknown
# names are appended at the end in their original order.
_PREFERRED_NAME_ORDER: List[str] = ["target", "self", "adjusted", "unadjusted"]


def _reorder_dfs_and_names(
    dfs: List[DataFrameWithWeight],
    names: List[str],
) -> Tuple[List[DataFrameWithWeight], List[str]]:
    """Reorder *dfs* and *names* to the preferred display order.

    The canonical display order is: population (``target``), adjusted
    (``self`` when ``unadjusted`` is also present), sample
    (``unadjusted``).  Names not in the preferred list keep their
    original relative order and are appended after the known names.
    """
    order_map = {name: i for i, name in enumerate(_PREFERRED_NAME_ORDER)}
    indexed = list(enumerate(names))
    # Stable sort: known names by preferred position, unknown names stay in
    # their original order at the end.
    indexed.sort(key=lambda x: (order_map.get(x[1], len(_PREFERRED_NAME_ORDER)), x[0]))
    reordered_indices = [i for i, _ in indexed]
    return (
        [dfs[i] for i in reordered_indices],
        [names[i] for i in reordered_indices],
    )


def _auto_n_bins(n_samples: int, n_unique: int) -> int:
    """Pick a number of bins using Sturges' rule, capped at unique values."""
    import math

    if n_samples <= 1:
        return 1
    sturges = math.ceil(math.log2(n_samples) + 1)
    # Don't exceed the number of unique values, and clamp to [2, 50]
    return max(2, min(sturges, n_unique, 50))


def _auto_bar_width(label_width: int) -> int:
    """Pick bar_width to fit within terminal width.

    Used by grouped barplots and histograms where each dataset gets its own
    line within a row (single bar per line).
    """
    import shutil

    term_width = shutil.get_terminal_size((80, 24)).columns
    # Each line: label_width + " | " (3) + bar + " (XX.X%)" (9)
    available = term_width - label_width - 3 - 9
    return max(10, available)


def _auto_bar_width_columnar(range_width: int, n_columns: int) -> int:
    """Pick per-column bar_width for a columnar (side-by-side) layout.

    Used by :func:`ascii_comparative_hist` where all datasets are rendered as
    columns on the same line.  Each column needs space for the bar, a
    percentage string (~6 chars), and inter-column separators (`` | ``, 3
    chars each).
    """
    import shutil

    term_width = shutil.get_terminal_size((80, 24)).columns
    # "Range  | col1 | col2 | ..."
    # range_width + " | " (3+1 for padding) consumed by the label column
    available = term_width - range_width - 4
    per_col = max(10, (available - (n_columns - 1) * 3) // n_columns - 6)
    return per_col


def _weighted_histogram(
    values: pd.Series,
    weights: Optional[pd.Series],
    bin_edges: npt.NDArray[np.floating],
) -> npt.NDArray[np.floating]:
    """Computes a weighted histogram and normalizes counts to proportions.

    Args:
        values: The numeric data values.
        weights: Optional weights. If None, uniform weights are used.
        bin_edges: Pre-computed bin edges (length n_bins + 1).

    Returns:
        Array of proportions for each bin (sums to 1.0, or all zeros if empty).
    """
    _check_weights_are_valid(weights)
    weights_arr: Optional[npt.NDArray[np.floating]] = None
    if weights is not None:
        weights_arr = np.asarray(weights, dtype=float)
    counts, _ = np.histogram(values, bins=bin_edges, weights=weights_arr)
    total = counts.sum()
    if total > 0:
        return counts / total
    return np.zeros_like(counts, dtype=float)


def _render_horizontal_bars(
    label: str,
    proportions: Dict[str, float],
    legend_names: List[str],
    bar_width: int,
    max_value: float,
    label_width: int,
) -> str:
    """Renders a group of horizontal bars for one category or bin.

    Each dataset gets its own line with a distinct character and a percentage
    label at the end.  When a proportion is non-zero but too small to render
    even one bar character, a single dot (``.``) is shown so that the reader
    can distinguish "present but tiny" from "truly zero".

    Args:
        label: The category label or bin range string.
        proportions: Dict mapping dataset legend name to its proportion value.
        legend_names: Ordered list of legend names for consistent ordering.
        bar_width: Maximum character width of the longest bar.
        max_value: The maximum proportion value across all bars (used for scaling).
        label_width: Character width reserved for the label column.

    Returns:
        Multi-line string of the grouped bars for this label.
    """
    lines: List[str] = []
    for i, name in enumerate(legend_names):
        prop = proportions.get(name, 0.0)
        char = BAR_CHARS[i % len(BAR_CHARS)]
        if max_value > 0:
            bar_len = int(round((prop / max_value) * bar_width))
        else:
            bar_len = 0
        if bar_len > 0:
            bar = char * bar_len
        elif prop > 0:
            # Non-zero proportion too small to render — show a dot
            bar = "."
        else:
            bar = ""
        if i == 0:
            prefix = label.ljust(label_width)
        else:
            prefix = " " * label_width
        lines.append(f"{prefix} | {bar} ({prop:.1%})")
    return "\n".join(lines)


def _build_legend(legend_names: List[str]) -> str:
    """Builds a legend string mapping characters to dataset names.

    Args:
        legend_names: Ordered list of dataset legend names.

    Returns:
        A two-line legend string: the first line maps bar characters to
        dataset names; the second explains how to interpret bar lengths.
    """
    parts: List[str] = []
    for i, name in enumerate(legend_names):
        char = BAR_CHARS[i % len(BAR_CHARS)]
        parts.append(f"{char} {name}")
    return (
        "Legend: "
        + "  ".join(parts)
        + "\nBar lengths are proportional to weighted frequency within each dataset."
    )



[docs]
def ascii_plot_bar(
    dfs: List[DataFrameWithWeight],
    names: List[str],
    column: str,
    weighted: bool = True,
    bar_width: Optional[int] = None,
    dist_type: Optional[str] = None,
    separate_categories: bool = True,
) -> str:
    """Produces an ASCII grouped barplot for a single categorical variable.

    Uses :func:`relative_frequency_table` to compute weighted proportions for
    each dataset, then renders grouped horizontal bars.

    How to read the output:
        Each row is a category value. Within a row, each dataset gets its
        own bar drawn with a distinct fill character (``█``, ``▓``, etc.).

        - The percentage at the end of each bar is the weighted proportion
          of that category within its dataset (i.e., proportions within
          each dataset sum to 100%).
        - Bar lengths are scaled so that the longest bar across all
          datasets spans the full ``bar_width``.

    Args:
        dfs: List of DataFrameWithWeight dicts.
        names: Names for each DataFrame (e.g., ["self", "target"]).
        column: The categorical column name to plot.
        weighted: Whether to use weights. Defaults to True.
        bar_width: Maximum character width for bars. Defaults to None,
            which auto-detects based on terminal width.
        dist_type: Accepted for compatibility but only "hist_ascii" is supported.
            A warning is logged if any other value is passed.
        separate_categories: If True, insert a blank line between categories
            for readability. Defaults to True.

    Returns:
        ASCII barplot text for this variable.

    Example:
        ::

            >>> df_a = pd.DataFrame({"color": ["red", "blue", "blue", "green"]})
            >>> df_b = pd.DataFrame({"color": ["red", "red", "blue", "green"]})
            >>> dfs = [
            ...     {"df": df_a, "weight": pd.Series([1.0, 1.0, 1.0, 1.0])},
            ...     {"df": df_b, "weight": pd.Series([1.0, 1.0, 1.0, 1.0])},
            ... ]
            >>> print(ascii_plot_bar(dfs, names=["self", "target"],
            ...       column="color", bar_width=20))
            === color (categorical) ===
            <BLANKLINE>
            Category | sample  population
                     |
            blue     | ████████████████████ (50.0%)
                     | ▒▒▒▒▒▒▒▒▒▒ (25.0%)
            <BLANKLINE>
            green    | ██████████ (25.0%)
                     | ▒▒▒▒▒▒▒▒▒▒ (25.0%)
            <BLANKLINE>
            red      | ██████████ (25.0%)
                     | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
            <BLANKLINE>
            Legend: █ sample  ▒ population
            Bar lengths are proportional to weighted frequency within each dataset.
            <BLANKLINE>
    """
    if dist_type is not None and dist_type != "hist_ascii":
        logger.warning(
            f"ASCII plots only support dist_type='hist_ascii'. "
            f"Ignoring dist_type='{dist_type}' and using 'hist_ascii'."
        )
    legend_names: List[str] = [naming_legend(n, names) for n in names]

    # Compute proportions per dataset
    all_props: List[pd.DataFrame] = []
    for ii, d in enumerate(dfs):
        a_series = d["df"][column]
        _w = d["weight"]
        if weighted and _w is not None:
            a_series, _w = rm_mutual_nas(a_series, _w)
            a_series.name = column
        freq_table = relative_frequency_table(a_series, w=_w if weighted else None)
        freq_table["dataset"] = legend_names[ii]
        all_props.append(freq_table)

    combined = pd.concat(all_props, ignore_index=True)

    # Get all unique categories in stable order
    categories: List[str] = list(combined[column].unique())

    # Find max proportion for bar scaling
    max_value: float = float(combined["prop"].max()) if len(combined) > 0 else 1.0

    # Compute label width
    label_width = max(len(str(c)) for c in categories) if categories else 8
    label_width = max(label_width, 8)  # minimum width for "Category"

    if bar_width is None:
        bar_width = _auto_bar_width(label_width)

    # Build output
    lines: List[str] = []
    lines.append(f"=== {column} (categorical) ===")
    lines.append("")

    # Header
    header_label = "Category".ljust(label_width)
    lines.append(f"{header_label} | {'  '.join(legend_names)}")
    lines.append(f"{' ' * label_width} |")

    for ci, cat in enumerate(categories):
        if separate_categories and ci > 0:
            lines.append("")
        cat_data = combined[combined[column] == cat]
        proportions: Dict[str, float] = {}
        for _, row in cat_data.iterrows():
            proportions[row["dataset"]] = float(row["prop"])
        lines.append(
            _render_horizontal_bars(
                str(cat), proportions, legend_names, bar_width, max_value, label_width
            )
        )

    lines.append("")
    lines.append(_build_legend(legend_names))
    lines.append("")

    return "\n".join(lines)




[docs]
def ascii_plot_hist(
    dfs: List[DataFrameWithWeight],
    names: List[str],
    column: str,
    weighted: bool = True,
    n_bins: Optional[int] = None,
    bar_width: Optional[int] = None,
    dist_type: Optional[str] = None,
) -> str:
    """Produces an ASCII histogram for a single numeric variable.

    Computes weighted histogram bins across all datasets using a shared
    bin range, then renders grouped horizontal bars for each bin.

    How to read the output:
        Each row is a numeric bin range. Within a row, each dataset gets
        its own bar drawn with a distinct fill character (``█``, ``▓``, etc.).

        - The percentage at the end of each bar is the weighted proportion
          of observations falling in that bin within its dataset (i.e.,
          proportions within each dataset sum to 100%).
        - Bar lengths are scaled so that the longest bar across all
          datasets spans the full ``bar_width``.

    Args:
        dfs: List of DataFrameWithWeight dicts.
        names: Names for each DataFrame (e.g., ["self", "target"]).
        column: The numeric column name to plot.
        weighted: Whether to use weights. Defaults to True.
        n_bins: Number of histogram bins. Defaults to None, which
            auto-detects using Sturges' rule.
        bar_width: Maximum character width for bars. Defaults to None,
            which auto-detects based on terminal width.
        dist_type: Accepted for compatibility but only "hist_ascii" is supported.
            A warning is logged if any other value is passed.

    Returns:
        ASCII histogram text for this variable.

    Example:
        ::

            >>> df_a = pd.DataFrame({"age": [10.0, 20.0, 30.0, 40.0]})
            >>> df_b = pd.DataFrame({"age": [10.0, 10.0, 10.0, 40.0]})
            >>> dfs = [
            ...     {"df": df_a, "weight": pd.Series([1.0, 1.0, 1.0, 1.0])},
            ...     {"df": df_b, "weight": pd.Series([1.0, 1.0, 1.0, 1.0])},
            ... ]
            >>> print(ascii_plot_hist(dfs, names=["self", "target"],
            ...       column="age", n_bins=2, bar_width=20))
            === age (numeric) ===
            <BLANKLINE>
            Bin            | sample  population
                           |
            [10.00, 25.00) | █████████████ (50.0%)
                           | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (75.0%)
            [25.00, 40.00] | █████████████ (50.0%)
                           | ▒▒▒▒▒▒▒ (25.0%)
            <BLANKLINE>
            Legend: █ sample  ▒ population
            Bar lengths are proportional to weighted frequency within each dataset.
            <BLANKLINE>
    """
    if dist_type is not None and dist_type != "hist_ascii":
        logger.warning(
            f"ASCII plots only support dist_type='hist_ascii'. "
            f"Ignoring dist_type='{dist_type}' and using 'hist_ascii'."
        )
    legend_names: List[str] = [naming_legend(n, names) for n in names]

    # Collect all values to determine shared bin range
    all_values: List[pd.Series] = []
    all_weights: List[Optional[pd.Series]] = []
    for d in dfs:
        a_series = d["df"][column]
        _w = d["weight"]
        if weighted and _w is not None:
            a_series, _w = rm_mutual_nas(a_series, _w)
        else:
            a_series = a_series.dropna()
        all_values.append(a_series)
        all_weights.append(_w if weighted else None)

    # Compute shared bin edges
    combined_values = pd.concat(all_values, ignore_index=True)
    if len(combined_values) == 0:
        return f"=== {column} (numeric) ===\n\nNo data available.\n"

    if n_bins is None:
        n_bins = _auto_n_bins(len(combined_values), combined_values.nunique())

    global_min = float(combined_values.min())
    global_max = float(combined_values.max())

    # Handle edge case where all values are the same
    if global_min == global_max:
        global_min = global_min - 0.5
        global_max = global_max + 0.5

    bin_edges = np.linspace(global_min, global_max, n_bins + 1)

    # Compute histograms per dataset
    hist_data: List[npt.NDArray[np.floating]] = []
    for vals, wts in zip(all_values, all_weights):
        hist_data.append(_weighted_histogram(vals, wts, bin_edges))

    # Build bin labels
    bin_labels: List[str] = []
    for i in range(n_bins):
        left = bin_edges[i]
        right = bin_edges[i + 1]
        bracket_right = "]" if i == n_bins - 1 else ")"
        bin_labels.append(f"[{left:,.2f}, {right:,.2f}{bracket_right}")

    # Find max proportion for bar scaling
    max_value: float = max(float(h.max()) for h in hist_data) if hist_data else 1.0

    # Compute label width
    label_width = max(len(lbl) for lbl in bin_labels) if bin_labels else 8
    label_width = max(label_width, 3)  # minimum width for "Bin"

    if bar_width is None:
        bar_width = _auto_bar_width(label_width)

    # Build output
    lines: List[str] = []
    lines.append(f"=== {column} (numeric) ===")
    lines.append("")

    # Header
    header_label = "Bin".ljust(label_width)
    lines.append(f"{header_label} | {'  '.join(legend_names)}")
    lines.append(f"{' ' * label_width} |")

    for bi, lbl in enumerate(bin_labels):
        proportions: Dict[str, float] = {}
        for di, name in enumerate(legend_names):
            proportions[name] = float(hist_data[di][bi])
        lines.append(
            _render_horizontal_bars(
                lbl, proportions, legend_names, bar_width, max_value, label_width
            )
        )

    lines.append("")
    lines.append(_build_legend(legend_names))
    lines.append("")

    return "\n".join(lines)




[docs]
def ascii_comparative_hist(
    dfs: List[DataFrameWithWeight],
    names: List[str],
    column: str,
    weighted: bool = True,
    n_bins: Optional[int] = None,
    bar_width: Optional[int] = None,
) -> str:
    """Produces a columnar, baseline-relative ASCII histogram.

    The first dataset is the baseline. Subsequent datasets show bars split
    into segments that indicate how each bin compares to the baseline.

    How to read the output:
        Each row is a bin range. The first column is the baseline dataset,
        shown with solid ``█`` bars. For every other column:

        - ``█`` (solid fill) = the portion of the bar that matches the
          baseline proportion. This is the "common" part.
        - ``▒`` (medium shade) = the portion that **exceeds** the baseline.
          The bin has more mass than the baseline in this range.
        - ``   ]`` (right bracket) = a **deficit** relative to the baseline.
          The gap before the bracket shows how much mass is missing compared to
          the baseline in this range.
        - A number without any bar means the percentage is too small to
          render at the chosen ``bar_width``.

        All percentages are normalized so each column sums to 100%.

    Args:
        dfs: List of DataFrameWithWeight dicts. The first entry is used as
            the baseline for comparison.
        names: Names for each DataFrame (e.g., ["Target", "Sample"]).
        column: The numeric column name to plot.
        weighted: Whether to use weights. Defaults to True.
        n_bins: Number of histogram bins. Defaults to None, which
            auto-detects using Sturges' rule.
        bar_width: Maximum character width for bars. Defaults to None,
            which auto-detects based on terminal width.

    Returns:
        ASCII comparative histogram text.

    Example:
        ::

            >>> print(ascii_comparative_hist(dfs, names=["Target", "Sample"],
            ...       column="income", n_bins=2, bar_width=20))
            === income (numeric, comparative) ===
            <BLANKLINE>
            Range          | Target (%)         | Sample (%)
            ---------------------------------------------------------------
            [10.00, 25.00) | █████████████ 50.0 | █████████████▒▒▒▒▒▒▒ 75.0
            [25.00, 40.00] | █████████████ 50.0 | ███████     ] 25.0
            ---------------------------------------------------------------
            Total          | 100.0              | 100.0

        In the Sample column above, bin [10, 25) shows ``▒`` excess
        (75% vs 50% baseline) while bin [25, 40] shows ``     ]``
        deficit (25% vs 50% baseline).
    """
    legend_names: List[str] = [naming_legend(n, names) for n in names]

    # Collect all values to determine shared bin range
    all_values: List[pd.Series] = []
    all_weights: List[Optional[pd.Series]] = []
    for d in dfs:
        a_series = d["df"][column]
        _w = d["weight"]
        if weighted and _w is not None:
            a_series, _w = rm_mutual_nas(a_series, _w)
        else:
            a_series = a_series.dropna()
        all_values.append(a_series)
        all_weights.append(_w if weighted else None)

    # Compute shared bin edges
    combined_values = pd.concat(all_values, ignore_index=True)
    if len(combined_values) == 0:
        return "No data available."

    if n_bins is None:
        n_bins = _auto_n_bins(len(combined_values), combined_values.nunique())

    global_min = float(combined_values.min())
    global_max = float(combined_values.max())

    if global_min == global_max:
        global_min = global_min - 0.5
        global_max = global_max + 0.5

    bin_edges = np.linspace(global_min, global_max, n_bins + 1)

    # Compute histograms per dataset (as percentages)
    hist_pcts: List[List[float]] = []
    for vals, wts in zip(all_values, all_weights):
        props = _weighted_histogram(vals, wts, bin_edges)
        hist_pcts.append([float(p) * 100.0 for p in props])

    # Find max percentage across all datasets and bins for bar scaling
    max_pct: float = max(
        (pct for pcts in hist_pcts for pct in pcts),
        default=0.0,
    )

    # Build bin labels
    bin_labels: List[str] = []
    for i in range(n_bins):
        left = bin_edges[i]
        right = bin_edges[i + 1]
        bracket_right = "]" if i == n_bins - 1 else ")"
        bin_labels.append(f"[{left:,.2f}, {right:,.2f}{bracket_right}")

    # Range column width (computed early so bar_width auto-detection can use it)
    range_header = "Range"
    range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels))

    if bar_width is None:
        bar_width = _auto_bar_width_columnar(range_width, len(legend_names))

    # Baseline percentages (first dataset)
    baseline_pcts = hist_pcts[0]

    # Build cell strings for each dataset x bin
    # cell_strings[dataset_idx][bin_idx] = "bar_chars pct"
    cell_strings: List[List[str]] = []
    for di in range(len(hist_pcts)):
        cells: List[str] = []
        for bi in range(n_bins):
            pct = hist_pcts[di][bi]
            if max_pct > 0:
                bar_len = round((pct / max_pct) * bar_width)
            else:
                bar_len = 0

            if di == 0:
                # Baseline: simple filled bars
                bar = "█" * bar_len
            else:
                base_pct = baseline_pcts[bi]
                if max_pct > 0:
                    baseline_len = round((base_pct / max_pct) * bar_width)
                else:
                    baseline_len = 0

                if bar_len >= baseline_len:
                    bar = "█" * baseline_len + "▒" * (bar_len - baseline_len)
                else:
                    deficit = baseline_len - bar_len
                    if deficit >= 2:
                        bar = "█" * bar_len + " " * (deficit - 1) + "]"
                    else:
                        bar = "█" * bar_len + "]"

            pct_str = f"{pct:.1f}"
            if bar:
                cells.append(f"{bar} {pct_str}")
            else:
                cells.append(pct_str)
        cell_strings.append(cells)

    # Compute column widths
    col_widths: List[int] = []
    for di in range(len(legend_names)):
        header_w = len(f"{legend_names[di]} (%)")
        max_cell_w = max(len(cell_strings[di][bi]) for bi in range(n_bins))
        col_widths.append(max(header_w, max_cell_w))

    # Build output
    lines: List[str] = []
    lines.append(f"=== {column} (numeric, comparative) ===")
    lines.append("")

    # Header row
    header_parts = [range_header.ljust(range_width)]
    for di in range(len(legend_names)):
        header_parts.append(f"{legend_names[di]} (%)".ljust(col_widths[di]))
    lines.append(" | ".join(header_parts))

    # Separator
    sep_width = range_width + sum(col_widths) + 3 * len(col_widths)
    lines.append("-" * sep_width)

    # Data rows
    for bi in range(n_bins):
        row_parts = [bin_labels[bi].ljust(range_width)]
        for di in range(len(legend_names)):
            row_parts.append(cell_strings[di][bi].ljust(col_widths[di]))
        lines.append(" | ".join(row_parts))

    # Separator
    lines.append("-" * sep_width)

    # Total row
    total_parts = ["Total".ljust(range_width)]
    for di in range(len(hist_pcts)):
        total_val = sum(hist_pcts[di])
        total_parts.append(f"{total_val:.1f}".ljust(col_widths[di]))
    lines.append(" | ".join(total_parts))

    # Legend (only when there are non-baseline columns)
    if len(legend_names) > 1:
        lines.append("")
        lines.append(
            f"Key: █ = shared with {legend_names[0]}," " ▒ = excess,    ] = deficit"
        )

    return "\n".join(lines)




[docs]
def ascii_plot_dist(
    dfs: List[DataFrameWithWeight],
    names: Optional[List[str]] = None,
    variables: Optional[List[str]] = None,
    numeric_n_values_threshold: int = 15,
    weighted: bool = True,
    n_bins: Optional[int] = None,
    bar_width: Optional[int] = None,
    dist_type: Optional[str] = None,
    separate_categories: bool = True,
    comparative: bool = True,
) -> str:
    """Produces ASCII text comparing weighted distributions across datasets.

    Iterates over variables, classifying each as categorical or numeric
    (using the same logic as :func:`seaborn_plot_dist`), then delegates to
    the appropriate plotting function.

    Two display modes are available for numeric variables:

    - **comparative** (``comparative=True``, the default): numeric variables
      are rendered with :func:`ascii_comparative_hist`, a columnar layout
      where the first dataset is the baseline and subsequent datasets show
      excess / deficit relative to it.
    - **grouped** (``comparative=False``): numeric variables are rendered
      with :func:`ascii_plot_hist`, a grouped-bar layout where each dataset
      gets its own bar per bin (the same style used for categorical
      variables).

    Categorical variables always use :func:`ascii_plot_bar` regardless of
    this setting.

    The output is both printed to stdout and returned as a string.

    Args:
        dfs: List of DataFrameWithWeight dicts.
        names: Names for each DataFrame (e.g., ["self", "unadjusted", "target"]).
            If None, defaults to "df_0", "df_1", etc.
        variables: Subset of variables to plot. None means all.
        numeric_n_values_threshold: Columns with fewer unique values than this
            are treated as categorical. Defaults to 15.
        weighted: Whether to use weights. Defaults to True.
        n_bins: Number of bins for numeric histograms. Defaults to None,
            which auto-detects using Sturges' rule.
        bar_width: Maximum character width for the longest bar. Defaults to
            None, which auto-detects based on terminal width.
        dist_type: Accepted for compatibility but only "hist_ascii" is supported.
            A warning is logged if any other value is passed.
        separate_categories: If True, insert a blank line between categories
            in barplots for readability. Defaults to True.
        comparative: If True (default), numeric variables use a columnar
            comparative histogram (:func:`ascii_comparative_hist`) that
            highlights differences relative to a baseline dataset.  If
            False, numeric variables use a grouped-bar histogram
            (:func:`ascii_plot_hist`) instead.

    Returns:
        The full ASCII output text.

    Examples:
        ::

            >>> import pandas as pd
            >>> from balance.stats_and_plots.ascii_plots import ascii_plot_dist
            >>> df_a = pd.DataFrame({
            ...     "color": ["red", "blue", "blue", "green"],
            ...     "age": [10.0, 20.0, 30.0, 40.0],
            ... })
            >>> df_b = pd.DataFrame({
            ...     "color": ["red", "red", "blue", "green"],
            ...     "age": [10.0, 10.0, 10.0, 40.0],
            ... })
            >>> dfs = [
            ...     {"df": df_a, "weight": pd.Series([1.0, 1.0, 1.0, 1.0])},
            ...     {"df": df_b, "weight": pd.Series([1.0, 1.0, 1.0, 1.0])},
            ... ]
            >>> print(ascii_plot_dist(dfs, names=["self", "target"],
            ...       numeric_n_values_threshold=0, n_bins=2, bar_width=20))
            === color (categorical) ===
            <BLANKLINE>
            Category | population  sample
                     |
            blue     | ██████████ (25.0%)
                     | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
            <BLANKLINE>
            green    | ██████████ (25.0%)
                     | ▒▒▒▒▒▒▒▒▒▒ (25.0%)
            <BLANKLINE>
            red      | ████████████████████ (50.0%)
                     | ▒▒▒▒▒▒▒▒▒▒ (25.0%)
            <BLANKLINE>
            Legend: █ population  ▒ sample
            Bar lengths are proportional to weighted frequency within each dataset.
            <BLANKLINE>
            === age (numeric, comparative) ===
            <BLANKLINE>
            Range          | population (%)     | sample (%)
            ---------------------------------------------------------------
            [10.00, 25.00) | █████████████ 50.0 | █████████████▒▒▒▒▒▒▒ 75.0
            [25.00, 40.00] | █████████████ 50.0 | ███████     ] 25.0
            ---------------------------------------------------------------
            Total          | 100.0              | 100.0
            <BLANKLINE>
            Key: █ = shared with population, ▒ = excess,    ] = deficit

        To use grouped-bar histograms (same style as categorical) instead
        of comparative histograms for numeric variables, pass
        ``comparative=False``::

            >>> print(ascii_plot_dist(dfs, names=["self", "target"],
            ...       numeric_n_values_threshold=0, n_bins=2, bar_width=20,
            ...       comparative=False))
            === color (categorical) ===
            ...
            === age (numeric) ===
            <BLANKLINE>
            Bin            | population  sample
                           |
            [10.00, 25.00) | ████████████████████ (75.0%)
                           | ▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
            [25.00, 40.00] | ███████ (25.0%)
                           | ▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
            <BLANKLINE>
            Legend: █ population  ▒ sample
            Bar lengths are proportional to weighted frequency within each dataset.
            <BLANKLINE>
    """
    if dist_type is not None and dist_type != "hist_ascii":
        logger.warning(
            f"ASCII plots only support dist_type='hist_ascii'. "
            f"Ignoring dist_type='{dist_type}' and using 'hist_ascii'."
        )
    if names is None:
        names = [f"df_{i}" for i in range(len(dfs))]

    # Reorder so comparative plots show: population, adjusted, sample
    dfs, names = _reorder_dfs_and_names(dfs, names)

    variables = choose_variables(*(d["df"] for d in dfs), variables=variables)
    logger.debug(f"ASCII plotting variables {variables}")

    numeric_variables = dfs[0]["df"].select_dtypes(exclude=["object"]).columns.values

    output_parts: List[str] = []

    for o in variables:
        # Find the maximum number of non-missing unique values across all dfs
        n_values = max(len(set(rm_mutual_nas(d["df"].loc[:, o].values))) for d in dfs)
        if n_values == 0:
            logger.warning(f"No nonmissing values for variable '{o}', skipping")
            continue

        categorical = (o not in numeric_variables) or (
            n_values < numeric_n_values_threshold
        )

        if categorical:
            output_parts.append(
                ascii_plot_bar(
                    dfs,
                    names,
                    o,
                    weighted=weighted,
                    bar_width=bar_width,
                    separate_categories=separate_categories,
                )
            )
        else:
            if comparative:
                output_parts.append(
                    ascii_comparative_hist(
                        dfs,
                        names,
                        o,
                        weighted=weighted,
                        n_bins=n_bins,
                        bar_width=bar_width,
                    )
                )
            else:
                output_parts.append(
                    ascii_plot_hist(
                        dfs,
                        names,
                        o,
                        weighted=weighted,
                        n_bins=n_bins,
                        bar_width=bar_width,
                    )
                )

    result = "\n".join(output_parts)
    print(result)
    return result