Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 146 additions & 57 deletions python/hh_characteristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,68 +328,157 @@ def _create_hh_size(
group[utils.HOUSEHOLD_SIZES] = controlled_data
controlled_groups.append(group)

# Coerce the controlled data back into the original format. Note that pd.melt()
# does not include the "tract" column, which functionally drops it
# Recombine all the data in preparation for the next controlling step
hh_size = pd.concat(controlled_groups)

# For every MGRA, compute the minimum and maximum implied household population. The
# maximum assumes that every household in the 7+ category is of size 11, which is
# what we get from looking at the San Diego region PUMS data. See GitHub for more
# info: https://github.com/SANDAG/Estimates-Program/issues/112
n_people_in_7_plus = 11
hh_size = (
pd.concat(controlled_groups).melt(
id_vars=["run_id", "year", "mgra"],
value_vars=utils.HOUSEHOLD_SIZES,
var_name="household_size",
value_name="hh",
hh_size.merge(mgra_controls, on=["run_id", "year", "mgra"], how="left")
.astype({hhs: int for hhs in utils.HOUSEHOLD_SIZES})
.assign(
min_implied_hhp=lambda df: df[1]
+ (2 * df[2])
+ (3 * df[3])
+ (4 * df[4])
+ (5 * df[5])
+ (6 * df[6])
+ (7 * df[7]),
max_implied_hhp=lambda df: df[1]
+ (2 * df[2])
+ (3 * df[3])
+ (4 * df[4])
+ (5 * df[5])
+ (6 * df[6])
+ (n_people_in_7_plus * df[7]),
# Then, compute the difference between the actual household population and the
# implied minimum and maximum. The two following columns are instructions for
# how to adjust the households by size distribution
decrease_min=lambda df: np.where(
df["min_implied_hhp"] > df["hhp_total"],
df["min_implied_hhp"] - df["hhp_total"],
0,
),
increase_max=lambda df: np.where(
df["max_implied_hhp"] < df["hhp_total"],
df["hhp_total"] - df["max_implied_hhp"],
0,
),
)
# For some reason, melt forces the household_size column to object type, even
# though all values are integer
.astype(int)
)

# Control each MGRA to align with household population
controlled_groups = []
for mgra, group in hh_size.groupby("mgra"):
control = mgra_controls[mgra_controls["mgra"] == mgra]
hhp_total = control["hhp_total"].values[0]

# Compute the minimum and maximum implied hhp from the hhs distribution. The
# maximum assumes that every household in the 7+ category is of size 11, which
# is what we get from looking at the San Diego region PUMS data. See GitHub
# for more info: https://github.com/SANDAG/Estimates-Program/issues/112
n_people_in_7_plus = 11
min_implied_hhp = (group["hh"] * group["household_size"]).sum()
max_implied_hhp = (
group["hh"] * group["household_size"].replace(7, n_people_in_7_plus)
).sum()

# If the maximum implied hhp is smaller than the actual hhp, then we need to
# shift some households from smaller sizes to larger sizes. Specifically, we
# will shift one household from 1-->2, 2-->3, 6-->7+, 1-->2, etc. until
# satisfied
if max_implied_hhp < hhp_total:
size_to_change = 1
while max_implied_hhp < hhp_total:
if group[group["household_size"] == size_to_change]["hh"].values[0] > 0:
group.loc[group["household_size"] == size_to_change, "hh"] -= 1
group.loc[group["household_size"] == size_to_change + 1, "hh"] += 1
max_implied_hhp += 1
if size_to_change == 6:
max_implied_hhp += n_people_in_7_plus - 7
# Increase the size by one, but keep it in the inclusive range 1-6
size_to_change = (size_to_change % 6) + 1

# If the minimum implied hhp is greater than the actual hhp, then we need to
# shift some households from larger sizes to smaller sizes
if min_implied_hhp > hhp_total:
size_to_change = 7
while min_implied_hhp > hhp_total:
if group[group["household_size"] == size_to_change]["hh"].values[0] > 0:
group.loc[group["household_size"] == size_to_change, "hh"] -= 1
group.loc[group["household_size"] == size_to_change - 1, "hh"] += 1
min_implied_hhp -= 1
# Decrease size by one, but keep it in the inclusive range 2-7
size_to_change = (size_to_change - 3) % 6 + 2

# Store the controlled group
controlled_groups.append(group)
# The methodology to adjust each individual MGRA. Unfortunately, I don't believe
# there's any way to do this in a parallel manner, given that each MGRA has a
# different distribution and different change requirements
def adjust_mgra(mgra_data: pd.Series) -> pd.Series:
    """Shift households between sizes until one MGRA's adjustment is complete.

    Households are moved one at a time from a randomly chosen size to a larger
    size (while "increase_max" is positive) or to a smaller size (while
    "decrease_min" is positive), and the remaining adjustment is updated after
    every move.

    Relies on ``generator``, ``utils.HOUSEHOLD_SIZES`` and
    ``n_people_in_7_plus`` from the enclosing scope. ``generator`` is presumably
    a ``numpy.random.Generator`` -- confirm against the enclosing function.

    Args:
        mgra_data: One row holding a household count under each label in
            ``utils.HOUSEHOLD_SIZES`` plus the "increase_max", "decrease_min"
            and "mgra" entries.

    Returns:
        The row itself when no adjustment is needed, otherwise an adjusted copy
        with "increase_max" <= 0 and "decrease_min" == 0.

    Raises:
        ValueError: If an adjustment is still required but no household can be
            moved in the required direction, e.g. every household is already in
            the 7+ category, or the MGRA has no households at all.
    """
    if mgra_data["increase_max"] == 0 and mgra_data["decrease_min"] == 0:
        return mgra_data

    # DataFrame.apply() does not support functions that mutate the row they are
    # handed, so all changes are made on a private copy
    mgra_data = mgra_data.copy()

    while True:
        # Households can only move up from sizes below 7 and only move down from
        # sizes above 1. If no such household exists the sampling below could
        # never produce a valid move and the loop would spin forever, so fail
        # loudly instead. This also covers an MGRA with zero households, where
        # the sampling weights would be NaN
        increasing = mgra_data["increase_max"] > 0
        blocked_size = 7 if increasing else 1
        n_movable = sum(
            mgra_data[size] for size in utils.HOUSEHOLD_SIZES if size != blocked_size
        )
        if n_movable <= 0:
            raise ValueError(
                "Cannot align households by size with household population for "
                f"MGRA {mgra_data['mgra']}: no household can be moved to a "
                f"{'larger' if increasing else 'smaller'} size."
            )

        # Choose a random household size to decrease, weighted by the number of
        # households in the size. Note the weighting ensures that a household size
        # with non-zero households will always be chosen
        hhs_to_decrease = generator.choice(
            utils.HOUSEHOLD_SIZES,
            p=mgra_data[utils.HOUSEHOLD_SIZES]
            / mgra_data[utils.HOUSEHOLD_SIZES].sum(),
        )

        # Depending on if we want to increase the max implied hhp or decrease the
        # min implied hhp, we look above or below the household size to decrease.
        # The target size is capped by the remaining adjustment so that a single
        # move cannot overshoot and force the opposite adjustment
        if mgra_data["increase_max"] > 0 and hhs_to_decrease != 7:
            hhs_to_increase = generator.choice(
                range(
                    hhs_to_decrease + 1,
                    np.min([hhs_to_decrease + mgra_data["increase_max"], 7]) + 1,
                )
            )
        elif mgra_data["decrease_min"] > 0 and hhs_to_decrease != 1:
            hhs_to_increase = generator.choice(
                range(
                    np.max([1, hhs_to_decrease - mgra_data["decrease_min"]]),
                    hhs_to_decrease,
                )
            )
        else:
            # The sampled size cannot move in the required direction; resample
            continue

        # Execute the change and recompute the remaining change needed
        mgra_data[hhs_to_decrease] -= 1
        mgra_data[hhs_to_increase] += 1
        if mgra_data["increase_max"] > 0:
            # A household entering the 7+ category counts as n_people_in_7_plus
            # people towards the maximum implied hhp rather than 7
            mgra_data["increase_max"] -= (hhs_to_increase - hhs_to_decrease) + (
                n_people_in_7_plus - 7 if hhs_to_increase == 7 else 0
            )
        else:
            mgra_data["decrease_min"] += hhs_to_increase - hhs_to_decrease

        # Check if we are done with this MGRA. Note, "increase_max" is allowed to
        # overshoot below zero, since a move into the 7+ category changes the
        # maximum implied hhp by an additional (n_people_in_7_plus - 7)
        if mgra_data["increase_max"] <= 0 and mgra_data["decrease_min"] == 0:
            return mgra_data

# Apply the MGRA adjustments
hh_size = hh_size.apply(adjust_mgra, axis=1)

# Double check that implied and actual household population are now correctly
# aligned
hh_size = hh_size.assign(
min_implied_hhp=lambda df: df[1]
+ (2 * df[2])
+ (3 * df[3])
+ (4 * df[4])
+ (5 * df[5])
+ (6 * df[6])
+ (7 * df[7]),
max_implied_hhp=lambda df: df[1]
+ (2 * df[2])
+ (3 * df[3])
+ (4 * df[4])
+ (5 * df[5])
+ (6 * df[6])
+ (n_people_in_7_plus * df[7]),
# Then, recompute the difference between the actual household population and the
# implied minimum and maximum. After the adjustment above, both of the following
# columns should be zero for every MGRA, which is verified below
decrease_min=lambda df: np.where(
df["min_implied_hhp"] > df["hhp_total"],
df["min_implied_hhp"] - df["hhp_total"],
0,
),
increase_max=lambda df: np.where(
df["max_implied_hhp"] < df["hhp_total"],
df["hhp_total"] - df["max_implied_hhp"],
0,
),
)
if (hh_size["decrease_min"] != 0).any() or (hh_size["increase_max"] != 0).any():
raise ValueError(
"Alignment between actual and implied household population failed."
)

# Reshape and return
return {
"hh_size": hh_size.drop(
columns=[
"tract",
"hhp_total",
"min_implied_hhp",
"max_implied_hhp",
"decrease_min",
"increase_max",
]
).melt(
id_vars=["run_id", "year", "mgra"],
var_name="household_size",
value_name="hh",
)
}


def _validate_hh_income_outputs(hh_income_outputs: dict[str, pd.DataFrame]) -> None:
Expand Down
Loading