SANDAG · Eric-Liu-SANDAG · Mar 20, 2026 · Mar 20, 2026
diff --git a/python/hh_characteristics.py b/python/hh_characteristics.py
@@ -328,68 +328,157 @@ def _create_hh_size(
         group[utils.HOUSEHOLD_SIZES] = controlled_data
         controlled_groups.append(group)
 
-    # Coerce the controlled data back into the original format. Note that pd.melt()
-    # does not include the "tract" column, which functionally drops it
+    # Recombine all the data in preparation for the next controlling step
+    hh_size = pd.concat(controlled_groups)
+
+    # For every MGRA, compute the minimum and maximum implied household population. The
+    # maximum assumes that every household in the 7+ category is of size 11, which is
+    # what we get from looking at the San Diego region PUMS data. See GitHub for more
+    # info: https://github.com/SANDAG/Estimates-Program/issues/112
+    n_people_in_7_plus = 11
     hh_size = (
-        pd.concat(controlled_groups).melt(
-            id_vars=["run_id", "year", "mgra"],
-            value_vars=utils.HOUSEHOLD_SIZES,
-            var_name="household_size",
-            value_name="hh",
+        hh_size.merge(mgra_controls, on=["run_id", "year", "mgra"], how="left")
+        .astype({hhs: int for hhs in utils.HOUSEHOLD_SIZES})
+        .assign(
+            min_implied_hhp=lambda df: df[1]
+            + (2 * df[2])
+            + (3 * df[3])
+            + (4 * df[4])
+            + (5 * df[5])
+            + (6 * df[6])
+            + (7 * df[7]),
+            max_implied_hhp=lambda df: df[1]
+            + (2 * df[2])
+            + (3 * df[3])
+            + (4 * df[4])
+            + (5 * df[5])
+            + (6 * df[6])
+            + (n_people_in_7_plus * df[7]),
+            # Then, compute the difference between the actual household population and the
+            # implied minimum and maximum. The two following columns are instructions for
+            # how to adjust the households by size distribution
+            decrease_min=lambda df: np.where(
+                df["min_implied_hhp"] > df["hhp_total"],
+                df["min_implied_hhp"] - df["hhp_total"],
+                0,
+            ),
+            increase_max=lambda df: np.where(
+                df["max_implied_hhp"] < df["hhp_total"],
+                df["hhp_total"] - df["max_implied_hhp"],
+                0,
+            ),
         )
-        # For some reason, melt forces the household_size column to object type, even
-        # though all values are integer
-        .astype(int)
     )
 
-    # Control each MGRA to align with household population
-    controlled_groups = []
-    for mgra, group in hh_size.groupby("mgra"):
-        control = mgra_controls[mgra_controls["mgra"] == mgra]
-        hhp_total = control["hhp_total"].values[0]
-
-        # Compute the minimum and maximum implied hhp from the hhs distribution. The
-        # maximum assumes that every household in the 7+ category is of size 11, which
-        # is what we get from looking at the San Diego region PUMS data. See GitHub
-        # for more info: https://github.com/SANDAG/Estimates-Program/issues/112
-        n_people_in_7_plus = 11
-        min_implied_hhp = (group["hh"] * group["household_size"]).sum()
-        max_implied_hhp = (
-            group["hh"] * group["household_size"].replace(7, n_people_in_7_plus)
-        ).sum()
-
-        # If the maximum implied hhp is smaller than the actual hhp, then we need to
-        # shift some households from smaller sizes to larger sizes. Specifically, we
-        # will shift one household from 1-->2, 2-->3, 6-->7+, 1-->2, etc. until
-        # satisfied
-        if max_implied_hhp < hhp_total:
-            size_to_change = 1
-            while max_implied_hhp < hhp_total:
-                if group[group["household_size"] == size_to_change]["hh"].values[0] > 0:
-                    group.loc[group["household_size"] == size_to_change, "hh"] -= 1
-                    group.loc[group["household_size"] == size_to_change + 1, "hh"] += 1
-                    max_implied_hhp += 1
-                    if size_to_change == 6:
-                        max_implied_hhp += n_people_in_7_plus - 7
-                # Increase the size by one, but keep it in the inclusive range 1-6
-                size_to_change = (size_to_change % 6) + 1
-
-        # If the minimum implied hhp is greater than the actual hhp, then we need to
-        # shift some households from larger sizes to smaller sizes
-        if min_implied_hhp > hhp_total:
-            size_to_change = 7
-            while min_implied_hhp > hhp_total:
-                if group[group["household_size"] == size_to_change]["hh"].values[0] > 0:
-                    group.loc[group["household_size"] == size_to_change, "hh"] -= 1
-                    group.loc[group["household_size"] == size_to_change - 1, "hh"] += 1
-                    min_implied_hhp -= 1
-                # Decrease size by one, but keep it in the inclusive range 2-7
-                size_to_change = (size_to_change - 3) % 6 + 2
-
-        # Store the controlled group
-        controlled_groups.append(group)
+    # The methodology to adjust each individual MGRA. Unfortunately, I don't believe
+    # there's any way to do this in a parallel manner, given how each MGRA has a
+    # different distribution and different change requirements
+    def adjust_mgra(mgra_data: pd.Series) -> pd.Series:
+        """Shift households between sizes until implied and actual hhp align.
+
+        Households are moved one at a time from a randomly chosen size to a larger
+        size (when "increase_max" > 0) or to a smaller size (when "decrease_min" >
+        0) until no further change is required.
+
+        Args:
+            mgra_data: One row of hh_size, holding the number of households in each
+                of utils.HOUSEHOLD_SIZES along with the "mgra", "hhp_total",
+                "increase_max", and "decrease_min" columns
+
+        Returns:
+            The same row, modified in place, with the adjusted households by size
+
+        Raises:
+            ValueError: If no distribution of the MGRA's households across sizes
+                could match its household population
+        """
+        if mgra_data["increase_max"] == 0 and mgra_data["decrease_min"] == 0:
+            return mgra_data
+
+        # Moving households between sizes never changes the number of households, so
+        # the implied hhp can only range from one person per household up to every
+        # household being in the 7+ category. Outside of that range (which includes
+        # an MGRA with household population but no households) the loop below could
+        # never finish, so fail loudly instead of spinning forever
+        n_households = mgra_data[utils.HOUSEHOLD_SIZES].sum()
+        hhp_total = mgra_data["hhp_total"]
+        if not (n_households <= hhp_total <= n_people_in_7_plus * n_households):
+            raise ValueError(
+                f"MGRA {mgra_data['mgra']}: household population {hhp_total} cannot "
+                f"be matched by any size distribution of {n_households} households."
+            )
+
+        while True:
+            # Choose a random household size to decrease, weighted by the number of
+            # households in the size. Note the weighting ensures that a household size
+            # with non-zero households will always be chosen
+            hhs_to_decrease = generator.choice(
+                utils.HOUSEHOLD_SIZES,
+                p=mgra_data[utils.HOUSEHOLD_SIZES] / n_households,
+            )
 
+            # Depending on if we want to increase the max implied hhp or decrease the
+            # min implied hhp, we look above or below the household size to decrease.
+            # But we don't want to go too far above/below, or we could overshoot then
+            # have to do the opposite adjustment. The bounds are coerced to int in
+            # case the row values are not plain integers, since range() requires them
+            if mgra_data["increase_max"] > 0 and hhs_to_decrease != 7:
+                largest = int(min(hhs_to_decrease + mgra_data["increase_max"], 7))
+                hhs_to_increase = generator.choice(
+                    range(hhs_to_decrease + 1, largest + 1)
+                )
+            elif mgra_data["decrease_min"] > 0 and hhs_to_decrease != 1:
+                smallest = int(max(1, hhs_to_decrease - mgra_data["decrease_min"]))
+                hhs_to_increase = generator.choice(range(smallest, hhs_to_decrease))
+            else:
+                # The chosen size is already at the end we are moving towards (7+
+                # when increasing, 1 when decreasing), so draw a different size
+                continue
+
+            # Execute the change and recompute the remaining change needed
+            mgra_data[hhs_to_decrease] -= 1
+            mgra_data[hhs_to_increase] += 1
+            if mgra_data["increase_max"] > 0:
+                # A household moved into 7+ counts as n_people_in_7_plus people in
+                # the max implied hhp, not seven
+                mgra_data["increase_max"] -= (hhs_to_increase - hhs_to_decrease) + (
+                    n_people_in_7_plus - 7 if hhs_to_increase == 7 else 0
+                )
+            else:
+                # hhs_to_increase is smaller than hhs_to_decrease here, so this
+                # reduces the remaining decrease
+                mgra_data["decrease_min"] += hhs_to_increase - hhs_to_decrease
+
+            # Check if we are done with this MGRA. Note, the "increase_max" is allowed
+            # to overshoot since if the increase size is seven, the max implied hhp
+            # changes by an additional (n_people_in_7_plus - 7)
+            if mgra_data["increase_max"] <= 0 and mgra_data["decrease_min"] == 0:
+                return mgra_data
+
+    # Apply the MGRA adjustments
+    hh_size = hh_size.apply(adjust_mgra, axis=1)
+
+    # Double check that implied and actual household population are now correctly
+    # aligned
+    hh_size = hh_size.assign(
+        min_implied_hhp=lambda df: df[1]
+        + (2 * df[2])
+        + (3 * df[3])
+        + (4 * df[4])
+        + (5 * df[5])
+        + (6 * df[6])
+        + (7 * df[7]),
+        max_implied_hhp=lambda df: df[1]
+        + (2 * df[2])
+        + (3 * df[3])
+        + (4 * df[4])
+        + (5 * df[5])
+        + (6 * df[6])
+        + (n_people_in_7_plus * df[7]),
+        # Then, recompute the difference between the actual household population and
+        # the implied minimum and maximum. After the adjustments above, both of the
+        # following columns are expected to be zero for every MGRA
+        decrease_min=lambda df: np.where(
+            df["min_implied_hhp"] > df["hhp_total"],
+            df["min_implied_hhp"] - df["hhp_total"],
+            0,
+        ),
+        increase_max=lambda df: np.where(
+            df["max_implied_hhp"] < df["hhp_total"],
+            df["hhp_total"] - df["max_implied_hhp"],
+            0,
+        ),
+    )
+    if (hh_size["decrease_min"] != 0).any() or (hh_size["increase_max"] != 0).any():
+        raise ValueError(
+            "Alignment between actual and implied household population failed."
+        )
+
+    # Reshape and return
+    return {
+        "hh_size": hh_size.drop(
+            columns=[
+                "tract",
+                "hhp_total",
+                "min_implied_hhp",
+                "max_implied_hhp",
+                "decrease_min",
+                "increase_max",
+            ]
+        ).melt(
+            id_vars=["run_id", "year", "mgra"],
+            var_name="household_size",
+            value_name="hh",
+        )
+    }
 
 
 def _validate_hh_income_outputs(hh_income_outputs: dict[str, pd.DataFrame]) -> None: