From 30280c6a64902290ef812e59add88b44d5603ae8 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Fri, 20 Mar 2026 23:25:52 +0000 Subject: [PATCH 1/2] #178: New actual/implied hhp balancing methodology --- python/hh_characteristics.py | 203 +++++++++++++++++++++++++---------- 1 file changed, 146 insertions(+), 57 deletions(-) diff --git a/python/hh_characteristics.py b/python/hh_characteristics.py index 101891a..b4e0f86 100644 --- a/python/hh_characteristics.py +++ b/python/hh_characteristics.py @@ -328,68 +328,157 @@ def _create_hh_size( group[utils.HOUSEHOLD_SIZES] = controlled_data controlled_groups.append(group) - # Coerce the controlled data back into the original format. Note that pd.melt() - # does not include the "tract" column, which functionally drops it + # Recombine all the data in preparation for the next controlling step + hh_size = pd.concat(controlled_groups) + + # For every MGRA, compute the minimum and maximum implied household population. The + # maximum assumes that every household in the 7+ category is of size 11, which is + # what we get from looking at the San Diego region PUMS data. See GitHub for more + # info: https://github.com/SANDAG/Estimates-Program/issues/112 + n_people_in_7_plus = 11 hh_size = ( - pd.concat(controlled_groups).melt( - id_vars=["run_id", "year", "mgra"], - value_vars=utils.HOUSEHOLD_SIZES, - var_name="household_size", - value_name="hh", + hh_size.merge(mgra_controls, on=["run_id", "year", "mgra"], how="left") + .astype({hhs: int for hhs in utils.HOUSEHOLD_SIZES}) + .assign( + min_implied_hhp=lambda df: df[1] + + (2 * df[2]) + + (3 * df[3]) + + (4 * df[4]) + + (5 * df[5]) + + (6 * df[6]) + + (7 * df[7]), + max_implied_hhp=lambda df: df[1] + + (2 * df[2]) + + (3 * df[3]) + + (4 * df[4]) + + (5 * df[5]) + + (6 * df[6]) + + (n_people_in_7_plus * df[7]), + # Then, compute the difference between the actual household population and the + # implied minimum and maximum. The two following columns are instructions for + # how to adjust the households by size distribution + decrease_min=lambda df: np.where( + df["min_implied_hhp"] > df["hhp_total"], + df["min_implied_hhp"] - df["hhp_total"], + 0, + ), + increase_max=lambda df: np.where( + df["max_implied_hhp"] < df["hhp_total"], + df["hhp_total"] - df["max_implied_hhp"], + 0, + ), ) - # For some reason, melt forces the household_size column to object type, even - # though all values are integer - .astype(int) ) - # Control each MGRA to align with household population - controlled_groups = [] - for mgra, group in hh_size.groupby("mgra"): - control = mgra_controls[mgra_controls["mgra"] == mgra] - hhp_total = control["hhp_total"].values[0] - - # Compute the minimum and maximum implied hhp from the hhs distribution. The - # maximum assumes that every household in the 7+ category is of size 11, which - # is what we get from looking at the San Diego region PUMS data. See GitHub - # for more info: https://github.com/SANDAG/Estimates-Program/issues/112 - n_people_in_7_plus = 11 - min_implied_hhp = (group["hh"] * group["household_size"]).sum() - max_implied_hhp = ( - group["hh"] * group["household_size"].replace(7, n_people_in_7_plus) - ).sum() - - # If the maximum implied hhp is smaller than the actual hhp, then we need to - # shift some households from smaller sizes to larger sizes. Specifically, we - # will shift one household from 1-->2, 2-->3, 6-->7+, 1-->2, etc. until - # satisfied - if max_implied_hhp < hhp_total: - size_to_change = 1 - while max_implied_hhp < hhp_total: - if group[group["household_size"] == size_to_change]["hh"].values[0] > 0: - group.loc[group["household_size"] == size_to_change, "hh"] -= 1 - group.loc[group["household_size"] == size_to_change + 1, "hh"] += 1 - max_implied_hhp += 1 - if size_to_change == 6: - max_implied_hhp += n_people_in_7_plus - 7 - # Increase the size by one, but keep it in the inclusive range 1-6 - size_to_change = (size_to_change % 6) + 1 - - # If the minimum implied hhp is greater than the actual hhp, then we need to - # shift some households from larger sizes to smaller sizes - if min_implied_hhp > hhp_total: - size_to_change = 7 - while min_implied_hhp > hhp_total: - if group[group["household_size"] == size_to_change]["hh"].values[0] > 0: - group.loc[group["household_size"] == size_to_change, "hh"] -= 1 - group.loc[group["household_size"] == size_to_change - 1, "hh"] += 1 - min_implied_hhp -= 1 - # Decrease size by one, but keep it in the inclusive range 2-7 - size_to_change = (size_to_change - 3) % 6 + 2 - - # Store the controlled group - controlled_groups.append(group) + # The methodolgy to adjust each individual MGRA. Unfortunately, I don't believe + # there's any way to do this in parallel manner, given how each MGRA has a different + # distribution and different change requirements + def adjust_mgra(mgra_data: pd.Series) -> pd.Series: + if mgra_data["increase_max"] == 0 and mgra_data["decrease_min"] == 0: + return mgra_data + while True: + # Choose a random household size to decrease, weighted by the number of + # households in the size. Note the weighting ensures that a household size + # with non-zero households will always be chosen + hhs_to_decrease = generator.choice( + utils.HOUSEHOLD_SIZES, + p=mgra_data[utils.HOUSEHOLD_SIZES] + / mgra_data[utils.HOUSEHOLD_SIZES].sum(), + ) - return {"hh_size": pd.concat(controlled_groups)} + # Depending on if we want to increase the max implied hhp or decrease the + # min implied hhp, we look above or below the household size to decrease. + # But we don't want to go too far above/below, or we could overshot then + # have to do the opposite adjustment + if mgra_data["increase_max"] > 0 and hhs_to_decrease != 7: + hhs_to_increase = generator.choice( + range( + hhs_to_decrease + 1, + np.min([hhs_to_decrease + mgra_data["increase_max"], 7]) + 1, + ) + ) + elif mgra_data["decrease_min"] > 0 and hhs_to_decrease != 1: + hhs_to_increase = generator.choice( + range( + np.max([1, hhs_to_decrease - mgra_data["decrease_min"]]), + hhs_to_decrease, + ) + ) + else: + continue + + # Execute the change and recompute the remaining change needed + mgra_data[hhs_to_decrease] -= 1 + mgra_data[hhs_to_increase] += 1 + if mgra_data["increase_max"] > 0: + mgra_data["increase_max"] -= (hhs_to_increase - hhs_to_decrease) + ( + n_people_in_7_plus - 7 if hhs_to_increase == 7 else 0 + ) + else: + mgra_data["decrease_min"] += hhs_to_increase - hhs_to_decrease + + # Check if we are done with this MGRA. Note, the "increase_max" is allowed + # to overshoot since if the increase size is seven, we actually change the + # implied hhp by an additional four + if mgra_data["increase_max"] <= 0 and mgra_data["decrease_min"] == 0: + return mgra_data + + # Apply the MGRA adjustments + hh_size = hh_size.apply(adjust_mgra, axis=1) + + # Double check that implied and actual household population are now correctly + # aligned + hh_size = hh_size.assign( + min_implied_hhp=lambda df: df[1] + + (2 * df[2]) + + (3 * df[3]) + + (4 * df[4]) + + (5 * df[5]) + + (6 * df[6]) + + (7 * df[7]), + max_implied_hhp=lambda df: df[1] + + (2 * df[2]) + + (3 * df[3]) + + (4 * df[4]) + + (5 * df[5]) + + (6 * df[6]) + + (n_people_in_7_plus * df[7]), + # Then, compute the difference between the actual household population and the + # implied minimum and maximum. The two following columns are instructions for + # how to adjust the households by size distribution + decrease_min=lambda df: np.where( + df["min_implied_hhp"] > df["hhp_total"], + df["min_implied_hhp"] - df["hhp_total"], + 0, + ), + increase_max=lambda df: np.where( + df["max_implied_hhp"] < df["hhp_total"], + df["hhp_total"] - df["max_implied_hhp"], + 0, + ), + ) + if (hh_size["decrease_min"] != 0).any() or (hh_size["increase_max"] != 0).any(): + raise ValueError( + "Alignment between actual and implied household popuation failed." + ) + + # Reshape and return + return { + "hh_size": hh_size.drop( + columns=[ + "tract", + "hhp_total", + "min_implied_hhp", + "max_implied_hhp", + "decrease_min", + "increase_max", + ] + ).melt( + id_vars=["run_id", "year", "mgra"], + var_name="household_size", + value_name="hh", + ) + } def _validate_hh_income_outputs(hh_income_outputs: dict[str, pd.DataFrame]) -> None: From 5dde2af4c49c1f69a1421873d06d1cbcdc20cd2c Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Fri, 20 Mar 2026 23:41:57 +0000 Subject: [PATCH 2/2] #178: Fixed spelling --- python/hh_characteristics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/hh_characteristics.py b/python/hh_characteristics.py index b4e0f86..c20e111 100644 --- a/python/hh_characteristics.py +++ b/python/hh_characteristics.py @@ -370,7 +370,7 @@ def _create_hh_size( ) ) - # The methodolgy to adjust each individual MGRA. Unfortunately, I don't believe + # The methodology to adjust each individual MGRA. Unfortunately, I don't believe # there's any way to do this in parallel manner, given how each MGRA has a different # distribution and different change requirements def adjust_mgra(mgra_data: pd.Series) -> pd.Series: @@ -388,7 +388,7 @@ def adjust_mgra(mgra_data: pd.Series) -> pd.Series: # Depending on if we want to increase the max implied hhp or decrease the # min implied hhp, we look above or below the household size to decrease. - # But we don't want to go too far above/below, or we could overshot then + # But we don't want to go too far above/below, or we could overshoot then # have to do the opposite adjustment if mgra_data["increase_max"] > 0 and hhs_to_decrease != 7: hhs_to_increase = generator.choice( @@ -459,7 +459,7 @@ def adjust_mgra(mgra_data: pd.Series) -> pd.Series: ) if (hh_size["decrease_min"] != 0).any() or (hh_size["increase_max"] != 0).any(): raise ValueError( - "Alignment between actual and implied household popuation failed." + "Alignment between actual and implied household population failed." ) # Reshape and return