From 71353649a34e129e027cda012a37a9d82dea6124 Mon Sep 17 00:00:00 2001
From: sunj2 <sunj2@pbworld.com>
Date: Fri, 5 Jan 2024 14:11:20 -0800
Subject: [PATCH] updated HTM script for SANDAG ABM3 integration

---
 run_htm.py | 538 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 538 insertions(+)
 create mode 100644 run_htm.py
diff --git a/run_htm.py b/run_htm.py
new file mode 100644
index 0000000..41ee936
--- /dev/null
+++ b/run_htm.py
@@ -0,0 +1,538 @@
+# -*- coding: utf-8 -*-
+"""
+Heavy Truck Model
+FAF Annual Tonnage to Truck Number by TAZ Disaggregation
+
+Author: Maddie Hasani, Fehr & Peers <br/>
+Reviewer: Fatemeh Ranaiefar, Fehr & Peers<br/>
+Last update: 12/19/2023
+"""
+
+## REQUIRED LIBRARIES
+import pandas as pd
+import numpy as np
+import openmatrix as omx
+from scipy import io
+pd.set_option('display.max_columns', None)  # Display all columns
+import sys
+import shutil
+
+#===================================================================================================================
+#===================================================================================================================
+
+
+## Functions
+def load_and_preprocess_data(path, model_dir, output_dir, faf_file_name, mgra_file_name, skim_file_name):
+
+    """
+    Parameters:
+    - Path: a full path to the input file. For example: 'C:\GitLab\HVM scripts\input_file.xlsx'
+
+    input_file:
+    - HTM input excel file
+
+    Returns:
+    - df_dict (dict): A dictionary containing DataFrames for each sheet in the input Excel file.
+    - traffic_skims (DataFrame): Processed DataFrame containing traffic skims.
+
+    Description:
+    This function loads and preprocesses input files and returns DataFrames and processed traffic skims.
+
+    """
+
+    # Initialize a dictionary to hold DataFrames
+    df_dict = {}
+
+    # 1. Load in input files
+    input_path = path
+    inputs_sandag_HTM = pd.ExcelFile(input_path)
+    sheet_names = [sheet_name for sheet_name in inputs_sandag_HTM.sheet_names if sheet_name.lower() not in ['userguide', 'reference']]
+
+    # 1.1 Load all sheets into separate DataFrames with lowercase names
+    for sheet_name in sheet_names:
+        df_name = sheet_name.lower()  # Convert sheet name to lowercase
+        df_dict[df_name] = inputs_sandag_HTM.parse(sheet_name)  # Save DataFrame to the dictionary
+
+    # 2. Load in FAF data
+    faf = df_dict['faf']  # Use the dictionary to get the DataFrame
+    faf_name = faf_file_name
+    faf_path = model_dir + "\input\htm"
+    full_faf_path = f"{faf_path}\\{faf_name}"
+    df = pd.read_csv(full_faf_path)
+
+    # 3. Load in MGRA data
+    mgra_loc = df_dict['mgra']  # Use the dictionary to get the DataFrame
+    mgra_name = mgra_file_name
+    #mgra_path = mgra_loc.loc[0, 'Path']
+    full_mgra_path = f"{model_dir}\\{mgra_file_name}"
+    mgra = pd.read_csv(full_mgra_path)
+
+    # 4. Load in Skim file
+    skim = df_dict['skim']  # Use the dictionary to get the DataFrame
+    skim_name = skim_file_name
+    skim_path = output_dir
+    full_skim_path = f"{output_dir}\\skims\\{skim_name}.omx"
+
+    traffic_skims_PM = []
+    with omx.open_file(full_skim_path) as omx_file:
+        # for name in ["PM_TRK_H_DIST"]:
+        for name in ["TRK_H_DIST__PM"]:
+            matrix = omx_file[name]
+            df_skim = pd.DataFrame(matrix[:])
+            stacked_df = df_skim.stack().reset_index()  # Reset the index to make it separate columns
+            stacked_df.columns = ["origin", "destination", name]  # Rename the columns
+            traffic_skims_PM.append(stacked_df)
+    traffic_skims = pd.concat(traffic_skims_PM, axis=1)
+
+    # 5. Filter out OD pairs where at least one end is a gateway
+    traffic_skims = traffic_skims.loc[(traffic_skims['origin'] >= 13) & (traffic_skims['destination'] >= 13)]
+
+    return df, mgra, traffic_skims, df_dict
+
+
+#===================================================================================================================
+#===================================================================================================================
+
+def clean_faf(df, faz_gateway, faz_county, sd_flows, othermode_truck, commodity_group):
+    """
+    Process a DataFrame containing transportation data.
+
+    Parameters:
+    - df (DataFrame): The input DataFrame containing transportation data.
+    - faz_gateway (DataFrame): DataFrame containing FAZ and Gateways/Airports/Ports TAZ.
+    - faz_county (DataFrame): DataFrame containing FAZ and County information.
+    - sd_flows (DataFrame): DataFrame containing Origin and Destination pair that will generate truck trips to/from SANDAG or Pass through SANDAG region.
+    - othermode_truck (DataFrame): DataFrame with mode information and percentages.
+    - commodity_group (DataFrame): DataFrame mapping commodities to CG values.
+
+    Returns:
+    - processed_df (DataFrame): Processed DataFrame with updated columns.
+
+    Description:
+    This function takes FAF data and performs the following steps:
+    1. Determine what are the OD pairs that may pass through SANDAG region or start/end in SANDAG
+    2. Includes only specified modes and calculates tonnage.
+    3. Deletes unnecessary columns.
+    4. Assigns SANDAG commodity groups based on SCTG commodity group.
+    5. Aggregates tonnage data by OD FAZ and Commodity Group.
+    6. Adds columns identifying if OD pairs have at least one end within Orange County or Mexico.
+    The processed DataFrame is returned.
+    """
+    sd_flows = df_dict[sd_flows]
+    faz_gateway = df_dict[faz_gateway]
+    faz_county = df_dict[faz_county]
+    othermode_truck = df_dict[othermode_truck]
+    commodity_group = df_dict[commodity_group]
+
+
+    # 1. Determine what are the OD pairs that may pass through SANDAG region or start/end in SANDAG
+    # Mapping columns 'dms_orig' and 'dms_dest' based on the lookup table to determine the area code (check out the reference tab in the input spreadsheet for more info)
+    df = df.merge(faz_gateway[['FAZ', 'AreaCode']], how='left', left_on='dms_orig', right_on='FAZ').drop('FAZ', axis=1).rename(columns={'AreaCode': 'code_orig'})
+    df = df.merge(faz_gateway[['FAZ', 'AreaCode']], how='left', left_on='dms_dest', right_on='FAZ').drop('FAZ', axis=1).rename(columns={'AreaCode': 'code_dest'})
+    # if FAZ is within SANDAG, the areacode should be 8
+    faz_sandag = faz_county[faz_county["County"] == "San Diego"]["FAZ"]
+    df.loc[df['dms_orig'].isin(faz_sandag), 'code_orig'] = 8
+    df.loc[df['dms_dest'].isin(faz_sandag), 'code_dest'] = 8
+    # merge the distribution of truck trips between each OD pair
+    df = df.merge(sd_flows[['OriginCode', 'DestinationCode', 'Dist']], how='left', left_on=['code_orig', 'code_dest'], right_on=['OriginCode', 'DestinationCode']).drop(['OriginCode', 'DestinationCode'], axis=1)
+    # calculate how much of each OD pairs will travel through SANDAG - multiply by the ton
+    df['distons_2025'] = df['distons_2025'] * df['Dist']
+    # remove all OD pairs that do not have any tons traveled through or within SANDAG
+    df = df[df['distons_2025'] > 0]
+    df.drop(['Dist'], axis=1, inplace=True)
+
+    # 2. Include some modes
+    mode_to_include = othermode_truck.set_index('Mode_Num')['Percentage'].to_dict()
+    df = df[df['Mode'].isin(mode_to_include.keys())]
+    df['truck_perc'] = df['Mode'].map(mode_to_include)
+    df['ton'] = df['distons_2025'] * df['truck_perc']
+    df.drop(['truck_perc', 'distons_2025'], axis=1, inplace=True)
+
+    # 3. Delete unnecessary columns
+    delete_col = ['Direction', 'Trade', 'disvalue_2017', 'distons_2017', 'disvalue_2025', 'distons_2030', 'disvalue_2030', 'distons_2035', 
+                    'disvalue_2035', 'distons_2040', 'disvalue_2040', 'distons_2045', 'disvalue_2045', 'distons_2050', 'disvalue_2050'] 
+    df.drop(delete_col, axis=1, inplace=True)
+
+    # 4. Assign SANDAG commodity groups based on SCTG commodity group
+    commodity_to_cg = commodity_group.set_index('SCTG')['CG'].to_dict()
+    df['CG'] = df['Commodity'].map(commodity_to_cg)
+    df.drop('Commodity', axis=1, inplace=True)
+
+    # 5. Aggregate the Tonnage Data by Origin/Destination and Commodity Group
+    df['fr_orig'] = df['fr_orig'].fillna(0)
+    df['fr_dest'] = df['fr_dest'].fillna(0)
+    df = df.groupby(['dms_orig', 'dms_dest', 'fr_orig', 'fr_dest', 'code_orig', 'code_dest', 'CG'], as_index=False).agg({'ton': 'sum'})
+
+    # 6. Create a new column that identifies if at least one end of the OD is within Orange County and mexico
+    df['one_end_orange'] = ((df['code_orig'] == 6) | (df['code_dest'] == 6)).astype(int)
+    df['one_end_mx'] = ((df['fr_orig'].isin([802])) | (df['fr_dest'].isin([802]))).astype(int)                            
+
+    return df
+
+#===================================================================================================================
+#===================================================================================================================
+
+def prepare_taz_data(mgra, faz_county, emp_converter, taz_faz):
+    """
+    Prepare TAZ data by calculating NAICS employee category percentages within each TAZ.
+
+    Parameters:
+    - mgra (DataFrame): DataFrame containing MGRA data.
+    - faz_county (DataFrame): DataFrame containing FAZ and County information.
+    - emp_converter (DataFrame): DataFrame mapping SANDAG emp category to NAICS emp category.
+    - taz_faz (DataFrame): DataFrame mapping TAZ values to FAZ values.
+
+    Returns:
+    - taz_long (DataFrame): Processed DataFrame with calculated employee category percentages.
+
+    Description:
+    This function calculates the percentage of each employee category (from the NAICS Dataset)
+    within each TAZ. It performs the following steps:
+    1. Calculate the number of SADNAG employees by TAZ.
+    2. Add FAZ to the emp by TAZ table
+    3. Reformat the TAZ table to long format using emp_ and pop columns.
+    4. Reformat the emp_converter table to long format using FAZ columns
+    5. Convert SANDAG emp category to NAICS emp category.
+    6. Calculate the number of NAICS employees by TAZ.
+    7. Aggregate total number of each NAICS employee category by TAZ.
+    6. Create a mapping of TAZ values to FAZ values.
+    8. Calculate how many percentage of each emp category is within a TAZ
+    The processed DataFrame is returned.
+    """
+
+    emp_converter = df_dict[emp_converter]
+    taz_faz = df_dict[taz_faz]
+    faz_county = df_dict[faz_county]
+    
+    # 1. Calculate the number of employee by TAZ
+    cols_to_sum = ['pop'] + [col for col in mgra.columns if col.startswith('emp_')]
+    taz = mgra.groupby(['taz'], as_index=False)[cols_to_sum].sum()
+
+    # 2. Add FAZ to the emp by TAZ table
+    taz_to_faz = taz_faz.set_index('TAZ')['FAZ'].to_dict()
+    # Use the map function to directly assign TAZ values
+    taz['FAZ'] = taz['taz'].map(taz_to_faz)
+
+    # 3. Reformat the taz table to long format using emp_ columns and pop
+    taz_long = taz.melt(id_vars=['taz', 'FAZ'], value_vars=cols_to_sum, value_name='sandag_emp_num', var_name='sandag_emp')
+
+    # 4. Reformat the emp_converter table to long format using FAZ columns
+    faz_sandag = faz_county[faz_county["County"] == "San Diego"]["FAZ"]
+    emp_converter_long = emp_converter.melt(id_vars=['NAICS_Emp', 'SANDAG_Emp'], value_vars=faz_sandag, value_name='Emp_PCT', var_name='FAZ')
+
+    # 5. Convert SANDAG emp category to NAICS emp category
+    taz_long = emp_converter_long.merge(taz_long, how='inner', right_on=['sandag_emp', 'FAZ'], left_on=['SANDAG_Emp', 'FAZ']).drop(['sandag_emp'], axis=1)
+
+    # 6. Calculate the number of NAICS employees by TAZ.
+    taz_long['naics_emp_num'] = taz_long['sandag_emp_num'] * taz_long['Emp_PCT']
+
+    # 7. Aggregate NAICS employee number by TAZ
+    taz_long = taz_long.groupby(['taz', 'FAZ', 'NAICS_Emp'], as_index=False).agg({'naics_emp_num': 'sum'})
+
+    # 8. Calculate how many percentage of each emp category is within a TAZ
+    taz_long['emp_naics_perc'] = taz_long['naics_emp_num'] / taz_long.groupby(['NAICS_Emp', 'FAZ'])['naics_emp_num'].transform('sum')
+    taz_long = taz_long.loc[taz_long['naics_emp_num'] > 0]
+
+    return taz_long
+
+#===================================================================================================================
+#===================================================================================================================
+
+def faf_disaggregate_to_taz(df, faz_gateway, cg_emp_a, cg_emp_p, faz_county, taz, annual_factor):
+    """
+    Parameters:
+    - df (DataFrame): faf data with aggregated commodity
+    - faz_gateway (DataFrame): DataFrame mapping FAZ to SADNAG gateways.
+    - faz_county (DataFrame): DataFrame containing FAZ and County information.
+
+    Returns:
+    - processed_df (DataFrame): Processed DataFrame with daily tonnage by TAZ origin and destination.
+
+    Description:
+    This function performs the following steps:
+    1. Determine NAICS emp category for production and attraction in FAF.
+    2. Bring TAZ numbers and the percentage of each emp category within each TAZ to relatively distribute tonnage to TAZ.
+    3. For FAZ outside the SANDAG region, assume that the distribution of tonnage is 1 for both attraction/production.
+    4. Calculate annual tonnage for each OD pair.
+    5. Reformat the faz_gateway table to long format using emp_ columns and pop.
+    6. Assign Gateway TAZ and percentage to the DataFrame.
+    7. Calculate final tonnage where one end of trip is outside the SANDAG region (taz_a or taz_p is null) and assign corresponding gateways as taz_a or taz_p.
+    8. Group by TAZ attraction and production and sum up the annual tonnage.
+    9. Convert annual to daily tonnage . the annual to daily factor is basically the number of working days within a year.
+
+    The processed DataFrame is returned.
+    """
+    faz_gateway = df_dict[faz_gateway]
+    cg_emp_a = df_dict[cg_emp_a]
+    cg_emp_p = df_dict[cg_emp_p]
+    faz_county = df_dict[faz_county]
+    annual_factor = df_dict[annual_factor]
+
+
+    # 1. Determine NAICS emp category for production and attraction in FAF
+    cg_to_emp_a = cg_emp_a.set_index('CG')['Emp_a'].to_dict()
+    cg_to_emp_p = cg_emp_p.set_index('CG')['Emp_p'].to_dict()
+
+    # Use the map function to directly assign Emp_a and Emp_p values
+    df['Emp_a'] = df['CG'].map(cg_to_emp_a)
+    df['Emp_p'] = df['CG'].map(cg_to_emp_p)
+    
+    # 2. Bring TAZ numbers and the percentage of each emp category within each TAZ to relatively distribute tonnage to TAZ.
+    df = df.merge(taz[['taz', 'FAZ', 'NAICS_Emp', 'emp_naics_perc']], how='left', left_on=['Emp_p', 'dms_orig'], right_on=['NAICS_Emp', 'FAZ']).drop(['FAZ', 'NAICS_Emp', 'Emp_p'], axis=1)
+    df.rename(columns={'taz': 'taz_p', 'emp_naics_perc': 'emp_naics_perc_p'}, inplace=True)
+
+    df = df.merge(taz[['taz', 'FAZ', 'NAICS_Emp', 'emp_naics_perc']], how='left', left_on=['Emp_a', 'dms_dest'], right_on=['NAICS_Emp', 'FAZ']).drop(['FAZ', 'NAICS_Emp', 'Emp_a'], axis=1)
+    df.rename(columns={'taz': 'taz_a', 'emp_naics_perc': 'emp_naics_perc_a'}, inplace=True)
+
+    # 3. For FAZ outside the SANDAG region, assume that the distribution percentage is 1 for both attraction/production
+    # create a list of FAZ in San Diego
+    faz_sandag = faz_county[faz_county["County"] == "San Diego"]["FAZ"]
+    df.loc[(~df['dms_orig'].isin(faz_sandag)) & (df['emp_naics_perc_p'].isnull()), 'emp_naics_perc_p'] = 1
+    df.loc[(~df['dms_dest'].isin(faz_sandag)) & (df['emp_naics_perc_a'].isnull()), 'emp_naics_perc_a'] = 1
+    
+    # 4. Calculate annual tonnage for each OD pair
+    df['dist_perc'] = df['emp_naics_perc_a'] * df['emp_naics_perc_p']
+    df['ton_annual'] = df['ton'] * df['dist_perc'] * 1000  # FAF data is in thousand tons
+    df.drop(['ton', 'emp_naics_perc_p', 'emp_naics_perc_a', 'dist_perc'], axis=1, inplace=True)
+    
+    # 5. Reformat the faz_gateway table to long format using emp_ columns and pop
+    gateway_airport_port = faz_gateway.columns[4:]
+    faz_gateway_long = faz_gateway.melt(id_vars=['FAZ'], value_vars=gateway_airport_port, value_name='faz_gtw_perc', var_name='gateways').dropna(subset=['faz_gtw_perc'])
+
+    # 6. Assign Gateway TAZ and percentage to the DataFrame
+    df = df.merge(faz_gateway_long, how='left', left_on='dms_orig', right_on='FAZ').drop('FAZ', axis=1)
+    df.rename(columns={'gateways': 'gateways_p', 'faz_gtw_perc': 'faz_gtw_perc_p'}, inplace=True)
+
+    df = df.merge(faz_gateway_long, how='left', left_on='dms_dest', right_on='FAZ').drop('FAZ', axis=1)
+    df.rename(columns={'gateways': 'gateways_a', 'faz_gtw_perc': 'faz_gtw_perc_a'}, inplace=True)
+
+    # 7. Calculate final tonnage 
+    # For the end that is within SANDAG region, gateway distributions should be ignored 
+    df.loc[(df['dms_orig'].isin(faz_sandag)) & (df['faz_gtw_perc_p'].isnull()), 'faz_gtw_perc_p'] = 1
+    df.loc[(df['dms_dest'].isin(faz_sandag)) & (df['faz_gtw_perc_a'].isnull()), 'faz_gtw_perc_a'] = 1
+
+    df['faz_gtw_perc'] = df['faz_gtw_perc_p'] * df['faz_gtw_perc_a']
+    df['ton_tot'] = df['ton_annual'] * df['faz_gtw_perc']
+
+    # df.loc[df['taz_a'].isnull(), 'ton_tot'] = df.loc[df['taz_a'].isnull(), 'ton_annual'] * df.loc[df['taz_a'].isnull(), 'faz_gtw_perc_a']
+    df.loc[df['taz_a'].isnull(), 'taz_a'] = df.loc[df['taz_a'].isnull(), 'gateways_a']
+
+    # df.loc[df['taz_p'].isnull(), 'ton_tot'] = df.loc[df['taz_p'].isnull(), 'ton_annual'] * df.loc[df['taz_p'].isnull(), 'faz_gtw_perc_p']
+    df.loc[df['taz_p'].isnull(), 'taz_p'] = df.loc[df['taz_p'].isnull(), 'gateways_p']
+    
+    # 8. Group by TAZ attraction and production and sum up the annual tonnage
+    # Note that there is a column that shows if at least one end of the trip is within Orange County. This column will be used in future steps to determine the OD distance.
+    processed_df = df.groupby(['taz_a', 'taz_p', 'CG', 'one_end_orange', 'one_end_mx'], as_index=False).agg({'ton_tot': 'sum'})
+
+    # 9. Convert annual to daily tonnage . the annual to daily factor is basically the number of working days within a year.
+    annual_to_daily_factor = annual_factor['Factor'].values
+    processed_df['ton_daily'] = processed_df['ton_tot'] / annual_to_daily_factor
+
+    # Drop unnecessary columns
+    processed_df.drop(['ton_tot'], axis=1, inplace=True)
+
+    return processed_df
+
+
+#===================================================================================================================
+#===================================================================================================================
+
+def daily_ton_to_truck_by_type_and_tod(df, traffic_skims, truck_dist, payload, time_of_day):
+    """
+    Convert daily tonnage to number of trucks by type and time of day.
+
+    Parameters:
+    - df (DataFrame): DataFrame containing the daily tonnage by TAZ.
+    - traffic_skims (DataFrame): DataFrame containing OD pair distance data.
+    - truck_dist (DataFrame): DataFrame containing truck type distribution data.
+    - payload (DataFrame): DataFrame containing payload data for truck types.
+    - time_of_day (DataFrame): DataFrame containing time of day factors.
+
+    Returns:
+    - final_df (DataFrame): Processed DataFrame with number of trucks by type and time of day (TOD).
+
+    Description:
+    This function performs the following steps:
+    1. Identify distance between two OD pairs within San Diego using skim.
+    2. Categorize the distance into different categories.
+        1. less than 50 miles; 
+        2. 51 to 100 miles ; 
+        3. One end in OC ; 
+        4. 201 miles or more; or one end outside of SANDAG and Orange county regions.
+    3. Distribute daily tonnage between truck types based on OD distance.
+    4. Convert tonnage by truck type to the number of trucks using truck type and commodity.
+    5. Distribute the number of trucks by time of day.
+    The final processed DataFrame is returned.
+    """
+    truck_dist = df_dict[truck_dist]
+    payload = df_dict[payload]
+    time_of_day = df_dict[time_of_day]
+
+    # 1. Identify distance between two OD pairs within San Diego using skim
+    df = df.merge(traffic_skims, how='left', left_on=['taz_a', 'taz_p'], right_on=['destination', 'origin']).drop(['origin', 'destination'], axis=1)
+
+    # 2. Categorize the distance
+    def categorize_dist(value):
+        if value <= 50:
+            return 1
+        elif value <= 100:
+            return 2
+        elif value <= 150:
+            return 3
+        elif value > 200:
+            return 4
+        else:
+            return 0
+
+    df['dist_cat'] = np.where(df['one_end_orange'] == 1, 3, 0)
+    df['dist_cat'] = np.where(df['one_end_mx'] == 1, 5, 0)
+    # df['dist_cat'] = np.where(df['dist_cat'] == 0, df['PM_TRK_H_DIST'].apply(categorize_dist), df['dist_cat'])
+    df['dist_cat'] = np.where(df['dist_cat'] == 0, df['TRK_H_DIST__PM'].apply(categorize_dist), df['dist_cat'])
+    df['dist_cat'] = np.where(df['dist_cat'] == 0, 4, df['dist_cat'])
+    # df.drop(['one_end_orange', 'PM_TRK_H_DIST'], axis=1, inplace=True)
+    df.drop(['one_end_orange', 'TRK_H_DIST__PM'], axis=1, inplace=True)
+
+    # 3. Distribute daily tonnage between truck types based on OD distance
+    df = df.merge(truck_dist[['Dist_GP', 'Truck_Type', 'Dist']], how='left', left_on='dist_cat', right_on='Dist_GP').drop(['Dist_GP'], axis=1)
+    df['ton_daily_bytruck'] = df['ton_daily'] * df['Dist']
+    df.drop(['ton_daily', 'dist_cat', 'Dist'], axis=1, inplace=True)
+    df = df.groupby(['taz_a', 'taz_p', 'CG', 'Truck_Type'], as_index=False)['ton_daily_bytruck'].sum()
+
+    # 4. Convert tonnage by truck type to the number of trucks using truck type and commodity
+    max_tonnage_dict = payload.set_index(['CG', 'Truck_Type'])['Pounds'].to_dict()
+    df['Tonnage'] = df.apply(lambda row: max_tonnage_dict.get((row['CG'], row['Truck_Type']), 0), axis=1)
+    df = df.loc[df['Tonnage'] > 0]
+    df['tot_truck'] = (df['ton_daily_bytruck'] / df['Tonnage']) * 2000
+    df.drop(['ton_daily_bytruck', 'Tonnage'], axis=1, inplace=True)
+
+    # 5. Distribute the number of trucks by time of day
+    peak_periods = ['AM', 'MD', 'PM', 'EA', 'EV']
+    for period in peak_periods:
+        factor_column = time_of_day.loc[time_of_day['Peak_Period'] == period, 'Factor'].values
+        df[f'{period.lower()}_truck'] = df['tot_truck'] * factor_column
+
+    final_df = df
+
+    return final_df
+
+#===================================================================================================================
+#===================================================================================================================
+#===================================================================================================================
+
+## Main Script
+
+arguments = sys.argv
+
+model_dir = arguments[1]
+output_dir = arguments[2]
+#assignment_dir = arguments[3]
+faf_file_name = arguments[3]
+mgra_file_name = arguments[4]
+skim_file_name = arguments[5]
+
+# Step 1: Load and preprocess data
+#df, mgra, traffic_skims, df_dict = load_and_preprocess_data("inputs_sandag_HTM.xlsx")
+df, mgra, traffic_skims, df_dict = load_and_preprocess_data(model_dir + "\input\htm\inputs_sandag_HTM.xlsx", model_dir, output_dir, faf_file_name, mgra_file_name, skim_file_name)
+
+# Step 2: Prepare TAZ data
+taz = prepare_taz_data(mgra, 'faz_county', 'emp_converter', 'taz_faz')
+
+
+# Process df in chunks
+# chunk_size = 100000
+chunk_size = 400000
+num_chunks = len(df) // chunk_size + 1
+final_results = []
+final_ton = []
+
+for chunk_num in range(num_chunks):
+    start_idx = chunk_num * chunk_size
+    end_idx = (chunk_num + 1) * chunk_size
+    df_chunk = df[start_idx:end_idx]
+
+    # Step 3: Clean FAF data
+    df_chunk = clean_faf(df_chunk, 'faz_gateway', 'faz_county', 'sd_flows', 'othermode_truck', 'commodity_group')
+
+    # Step 4: FAF disaggregation to TAZ
+    df_chunk = faf_disaggregate_to_taz(df_chunk, 'faz_gateway', 'cg_emp_a', 'cg_emp_p', 'faz_county', taz, 'annual_factor')
+
+    # Step 5: Daily tonnage to truck types and time of day
+    final_chunk = daily_ton_to_truck_by_type_and_tod(df_chunk, traffic_skims, 'truck_dist', 'payload', 'time_of_day')
+    
+    # Append the results of the chunk to the final_results list
+    final_results.append(final_chunk)
+    final_ton.append(df_chunk)
+
+    # Print progress
+    progress = (chunk_num + 1) / num_chunks * 100
+    print(f"Processing chunk {chunk_num + 1}/{num_chunks} - {progress:.2f}% done")
+
+
+# Combine the results of all chunks
+final_truck = pd.concat(final_results, ignore_index=True)
+final_ton = pd.concat(final_ton, ignore_index=True)
+
+#===================================================================================================================
+
+# combine medium1 and 2 truck type to a medium type
+htm = final_truck
+htm.loc[htm['Truck_Type'] == "Medium1", 'Truck_Type'] = 'Medium'
+htm.loc[htm['Truck_Type'] == "Medium2", 'Truck_Type'] = 'Medium'
+
+htm = htm.groupby(['Truck_Type', 'taz_a', 'taz_p'], as_index=False)[['tot_truck', 'am_truck','md_truck', 'pm_truck', 'ea_truck', 'ev_truck']].sum()
+
+#===================================================================================================================
+# Identify the trip types: ei,ie (drop ii since it is not an HTM trip)
+external = [1,2,3,4,5,6,7,8,9,10,11,12, 1154, 1294, 1338, 1457,1476,1485,1520, 2086,2193,2372,2384,2497,3693,4184]
+htm['ei'] = 0
+htm['ie'] = 0
+htm.loc[(~htm.taz_a.isin(external)) & (htm.taz_p.isin(external)), 'ei'] = 1
+htm.loc[(htm.taz_a.isin(external)) & (~htm.taz_p.isin(external)), 'ie'] = 1
+# Define truck types and columns to process
+truck_types = htm['Truck_Type'].unique()
+columns_to_process = ['am_truck', 'md_truck', 'pm_truck', 'ea_truck', 'ev_truck']
+conditions = ['ei', 'ie']
+
+# Create empty matrices
+matrix_size = 4947  #ABM3 zone system
+#matrix_size = 4996
+empty_matrix = np.zeros((matrix_size, matrix_size))
+
+for column in columns_to_process:
+    matrices = {}
+    for condition in conditions:
+        condition_matrices = {}
+        for i, truck_type in enumerate(truck_types):
+            filtered_data = htm[(htm[condition] == 1) & (htm['Truck_Type'] == truck_type)]
+            filtered_data_group = filtered_data.groupby(['Truck_Type', 'taz_a', 'taz_p'], as_index=False)[[column]].sum()
+            
+            # Create a matrix filled with zeros
+            matrix = pd.pivot_table(filtered_data_group, values=column, index='taz_a', columns='taz_p').fillna(0)
+            
+            # Reindex to ensure the matrix is of size 4996x4996
+            matrix = matrix.reindex(range(1, matrix_size + 1), fill_value=0).fillna(0)
+            matrix = matrix.reindex(columns=range(1, matrix_size + 1), fill_value=0).fillna(0)
+            
+            # Convert the matrix to a NumPy array
+            condition_matrices[f"{truck_type}_{condition}"] = matrix.values
+
+        matrices[f"{column}_{condition}_matrices"] = condition_matrices
+
+    # Create OMX file for each column
+    htm_period = column.split("_")[0].upper()
+    file_name = f"\htmtrips_{htm_period}.omx"
+    matrix_file = output_dir + "\htm" + file_name
+
+    # Create an OMX file and write matrices
+    #with omx.open_file(file_name, 'w') as f:
+    with omx.open_file(matrix_file, 'w') as f:
+        for matrix_name, matrix_data in matrices.items():
+            for sub_matrix_name, sub_matrix_data in matrix_data.items():
+                #clean_matrix_name = matrix_name.replace('_', '')  # Remove underscores
+                matrix_tod = matrix_name.split("_")[0].upper()
+                clean_sub_matrix_name = sub_matrix_name.replace('_', '')  # Remove underscores
+                #f[f"{clean_matrix_name}_{clean_sub_matrix_name}"] = sub_matrix_data
+                f[f"{clean_sub_matrix_name}_{matrix_tod}"] = sub_matrix_data
+    #copy_dst = assignment_dir + "\\" + file_name
+    #shutil.copyfile(matrix_file, copy_dst)
+    print(f"Matrices for '{column}' saved as '{file_name}'")
\ No newline at end of file