diff --git a/.github/workflows/update_hourly_data.yml b/.github/workflows/update_hourly_data.yml new file mode 100644 index 0000000..da06a4d --- /dev/null +++ b/.github/workflows/update_hourly_data.yml @@ -0,0 +1,39 @@ +name: Update Hourly Data + +on: + schedule: + - cron: "15 4 * * *" + workflow_dispatch: + +jobs: + update_data: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Check directory structure + run: | + echo "Current directory: $(pwd)" + echo "Contents of current directory:" + ls -la + echo "Python files in repository:" + find . -type f -name "*.py" | sort + + - name: Run Henry Hub ETL + env: + EIA_API_KEY: ${{ secrets.EIA_API_KEY }} + GCP_SERVICE_ACCOUNT: ${{ secrets.GCP_SERVICE_ACCOUNT }} + run: python ${{ github.workspace }}/data_load/update_hourly_data.py \ No newline at end of file diff --git a/app/market_analysis.py b/app/market_analysis.py index 067dcde..76dc8aa 100644 --- a/app/market_analysis.py +++ b/app/market_analysis.py @@ -54,6 +54,16 @@ "LONGIL": "K", } +today = datetime.date.today() +latest_month = today.strftime("%Y-%m") +time_period = ( + pd.date_range(start="2017-01", end=latest_month, freq="MS") + .strftime("%Y-%m") + .tolist() +) +time_period = time_period[::-1] # reverse to show latest month first +one_month_ago = (today.replace(day=1) - datetime.timedelta(days=1)).strftime("%Y-%m") + # ------ Credentials ------ creds = st.secrets["gcp_service_account"] credentials = service_account.Credentials.from_service_account_info(creds) @@ -61,20 +71,14 @@ # ------ API loaders ------- @st.cache_data(ttl=3600) -def load_nyiso_realtime(selected_month) -> any: - - start_date = datetime.datetime.strptime(selected_month, "%Y-%m-%d") +def load_nyiso_realtime(month) -> any: - if start_date.month == 12: - end_date = datetime.datetime(start_date.year + 1, 1, 1) - else: - end_date = datetime.datetime(start_date.year, start_date.month + 1, 1) + month_list = ",".join(f"'{m}'" for m in month) sql = f""" - SELECT Time_Stamp, Name, LBMP____MWHr_ - FROM `sipa-adv-c-dancing-cactus.dataset.market_analysis` - WHERE Time_Stamp >= '{start_date.strftime("%Y-%m-%d")}' - AND Time_Stamp < '{end_date.strftime("%Y-%m-%d")}' + SELECT hourly_time_stamp, Name, LBMP + FROM `sipa-adv-c-dancing-cactus.dataset.hourly_lbmp` + WHERE FORMAT_DATE('%Y-%m', hourly_time_stamp) IN ({month_list}) """ df = pandas_gbq.read_gbq(sql, credentials=credentials) return df @@ -100,39 +104,35 @@ def load_henry_hub_data() -> pd.DataFrame: @st.cache_data(ttl=3600) -def merge_load_and_lbmp(year: int, month: int) -> pd.DataFrame: - start = datetime.datetime(year, month, 1) - if month == 12: - end = datetime.datetime(year + 1, 1, 1) - else: - end = datetime.datetime(year, month + 1, 1) +def merge_load_and_lbmp(month: list) -> pd.DataFrame: + month_list = ",".join(f"'{m}'" for m in month) sql_1 = f""" SELECT Time_Stamp, Name, Load FROM `sipa-adv-c-dancing-cactus.dataset.actual_load` - WHERE Time_Stamp >= '{start.strftime("%Y-%m-%d")}' - AND Time_Stamp < '{end.strftime("%Y-%m-%d")}' + WHERE FORMAT_DATE('%Y-%m', Time_Stamp) IN ({month_list}) """ load_df = pandas_gbq.read_gbq(sql_1, credentials=credentials) sql_2 = f""" - SELECT Time_Stamp, Name, LBMP____MWHr_ - FROM `sipa-adv-c-dancing-cactus.dataset.market_analysis` - WHERE Time_Stamp >= '{start.strftime("%Y-%m-%d")}' - AND Time_Stamp < '{end.strftime("%Y-%m-%d")}' + SELECT hourly_time_stamp, Name, LBMP + FROM `sipa-adv-c-dancing-cactus.dataset.hourly_lbmp` + WHERE FORMAT_DATE('%Y-%m', hourly_time_stamp) IN ({month_list}) """ lbmp_df = pandas_gbq.read_gbq(sql_2, credentials=credentials) load_df["Time_Stamp"] = pd.to_datetime(load_df["Time_Stamp"], errors="coerce") - lbmp_df["Time_Stamp"] = pd.to_datetime(lbmp_df["Time_Stamp"], errors="coerce") + lbmp_df["hourly_time_stamp"] = pd.to_datetime( + lbmp_df["hourly_time_stamp"], errors="coerce" + ) merged = pd.merge( load_df, lbmp_df, left_on=["Time_Stamp", "Name"], - right_on=["Time_Stamp", "Name"], + right_on=["hourly_time_stamp", "Name"], how="inner", ) return merged @@ -142,18 +142,20 @@ def merge_load_and_lbmp(year: int, month: int) -> pd.DataFrame: def get_processed_electricity_data(df: pd.DataFrame, zone: str) -> pd.DataFrame: daily_df = ( df.groupby("Name") - .resample("D", on="Time_Stamp")["LBMP____MWHr_"] + .resample("D", on="hourly_time_stamp")["LBMP"] .agg(["mean", "max", "min"]) .reset_index() ) zone_df = daily_df.loc[daily_df["Name"] == zone].copy() - zone_df = zone_df.sort_values("Time_Stamp") + zone_df = zone_df.sort_values("hourly_time_stamp") return zone_df def create_comparison_graph(electricity_df: pd.DataFrame, gas_df: pd.DataFrame) -> None: - base = alt.Chart(electricity_df).encode(alt.X("Time_Stamp").axis(title="Date")) + base = alt.Chart(electricity_df).encode( + alt.X("hourly_time_stamp").axis(title="Date") + ) area = base.mark_area(opacity=0.3, color="lightblue").encode( alt.Y("max").axis(title="LBMP($/MWh)", titleColor="blue"), @@ -184,15 +186,20 @@ def create_demand_chart(LBMP_load: pd.DataFrame, selected_zone: str) -> alt.Char labels=["Night", "Morning", "Afternoon", "Evening"], ) + df_filtered["month"] = ( + pd.to_datetime(df_filtered["Time_Stamp"]).dt.to_period("M").astype(str) + ) + points = ( alt.Chart(df_filtered) .mark_circle(opacity=0.4, size=20) .encode( x=alt.X("Load", title="Load (MW)", scale=alt.Scale(zero=False)), - y=alt.Y("LBMP____MWHr_", title="Electricity price ($/MWh)"), + y=alt.Y("LBMP", title="Electricity price ($/MWh)"), color=alt.Color("time of day:N", title="Time of Day"), - tooltip=["Time_Stamp", "Load", "LBMP____MWHr_"], + tooltip=["Time_Stamp", "Load", "LBMP"], ) + .facet("month:N", columns=2) ) return points.properties( @@ -215,7 +222,7 @@ def prepare_map_data(lbpm_load: pd.DataFrame) -> pd.DataFrame: df = lbpm_load.copy() df["Time_Stamp"] = pd.to_datetime(df["Time_Stamp"], errors="coerce") - df = df.dropna(subset=["Time_Stamp", "Name", "Load", "LBMP____MWHr_"]) + df = df.dropna(subset=["Time_Stamp", "Name", "Load", "LBMP"]) df["zone_code"] = df["Name"].map(MAP_ZONE_MAP) df = df.dropna(subset=["zone_code"]) @@ -225,7 +232,7 @@ def prepare_map_data(lbpm_load: pd.DataFrame) -> pd.DataFrame: grouped = df.groupby(["map_date", "map_hour", "zone_code"], as_index=False).agg( avg_load=("Load", "mean"), - avg_lbmp=("LBMP____MWHr_", "mean"), + avg_lbmp=("LBMP", "mean"), ) return grouped @@ -314,9 +321,11 @@ def render_zone_map(lbpm_load: pd.DataFrame) -> None: col1, col2 = st.columns(2) available_dates = sorted(map_df["map_date"].unique()) + selected_date = col1.selectbox( "Select date for maps", available_dates, + index=len(available_dates) - 1, key="map_date_select", ) @@ -402,7 +411,7 @@ def graph_legend() -> str: def demand_interpretation(df: pd.DataFrame, zone: str) -> str: avg_load = df["Load"].mean() - avg_lbmp = df["LBMP____MWHr_"].mean() + avg_lbmp = df["LBMP"].mean() max_load = df["Load"].max() return ( @@ -459,15 +468,19 @@ def render_intro() -> None: st.divider() -def render_demand_section(year: int, month: int) -> None: +def render_demand_section(month: int) -> None: st.header("Electricity Price vs. Load") st.write(""" This section explores the relationship between electricity prices and demand (load) in the selected NYISO zone and month. - The scatter plot shows how LBMP varies with load, colored by time of day. + How does the price change as load increases? Are there certain times of day when prices are more volatile? """) with st.spinner("Loading demand data..."): - LBMP_load = merge_load_and_lbmp(year, month) + LBMP_load = merge_load_and_lbmp(month) + + st.subheader("Spatial Distribution Across NYISO Zones") + + render_zone_map(LBMP_load) selected_zone = st.selectbox( "Select a NYISO zone", @@ -476,6 +489,7 @@ def render_demand_section(year: int, month: int) -> None: index=list(ZONE_MAP.keys()).index("N.Y.C."), key="demand_zone_select", ) + chart = create_demand_chart(LBMP_load, selected_zone) st.altair_chart(chart, use_container_width=True) @@ -483,14 +497,9 @@ def render_demand_section(year: int, month: int) -> None: st.write(demand_interpretation(LBMP_load, selected_zone)) st.divider() - st.subheader("Spatial Distribution Across NYISO Zones") - - render_zone_map(LBMP_load) - - st.divider() -def render_electricity_section(year: int, month: int) -> None: +def render_electricity_section(month: int) -> None: st.header("The Comparison of Electricity and Gas Markets") st.write( @@ -501,15 +510,8 @@ def render_electricity_section(year: int, month: int) -> None: ) # input month and zone - selected_month = datetime.date(year, month, 1) - selected_month_str = selected_month.strftime("%Y-%m-%d") - - if selected_month > datetime.date.today(): - st.error("No data available.") - st.stop() - try: - realtime_df = load_nyiso_realtime(selected_month_str) + realtime_df = load_nyiso_realtime(month) selected_zone = st.selectbox( "Select a NYISO zone", options=list(ZONE_MAP.keys()), @@ -521,9 +523,7 @@ def render_electricity_section(year: int, month: int) -> None: zone_df = get_processed_electricity_data(realtime_df, selected_zone) gas_df = load_henry_hub_data() - filtered = gas_df[ - (gas_df["date"].dt.year == year) & (gas_df["date"].dt.month == month) - ] + filtered = gas_df[(gas_df["date"].dt.to_period("M").astype(str).isin(month))] chart = create_comparison_graph(zone_df, filtered) st.altair_chart(chart, use_container_width=True) @@ -647,14 +647,33 @@ def main() -> None: render_sidebar() render_intro() - year = st.selectbox( - "Year for electricity data", range(2017, 2027), index=9, key="global_year" - ) - month = st.selectbox("Month for electricity data", range(1, 13), key="global_month") + col_start, col_end = st.columns(2) + with col_start: + start_month = st.select_slider( + "Start Month", options=time_period[::-1], value=one_month_ago + ) + with col_end: + end_month = st.select_slider( + "End Month", options=time_period[::-1], value=latest_month + ) + + start_idx = time_period.index(start_month) + end_idx = time_period.index(end_month) + if start_idx < end_idx: + start_idx, end_idx = end_idx, start_idx + month = time_period[end_idx : start_idx + 1] + + if not month: + st.warning("invalid period selected.") + st.stop() + + if len(month) > 4: + st.warning("Please select a period of 4 months or less to ensure performance.") + st.stop() - render_demand_section(year, month) + render_demand_section(month) - render_electricity_section(year, month) + render_electricity_section(month) render_comparison_section(gas_available=True) diff --git a/app/proposal.py b/app/proposal.py index 6f69786..b07a31b 100644 --- a/app/proposal.py +++ b/app/proposal.py @@ -62,11 +62,8 @@ def main() -> None: """ * How does the Locational Based Marginal Price(LBMP) in NYC fluctuate according to the change in demand? (Although LBMP is a wholesale price, not a retail price, analyzing LBMP instead of retail price is insightful because LBMP reflects dynamic demand shifts more clearly than retail price, which does not change so often compared to the wholesale price.) - * How does the change in fuel mix during the day affect the LBMP in NY state? I can expect that LBMP will fall during daytime or sunny days and rise during night and rainy days because renewables such as solar energy can generate electricity at a cheaper price. * How does the change in energy prices, for example, the rise of the price of natural gas, affect the LBMP? We can of course include several kinds of energy sources, like oil and coal. - (If time allows) - * What is the relationship between electricity consumption and air quality in the NY state? (We can analyze the relation between absolute electricity usage and air quality, or the proportion of green energy and air quality. At the same time, air quality has multiple aspects, - such as CO/PM2.5/NO2, which also brings us space for in-depth discussion) + * How does the change in fuel mix during the day affect the LBMP in NY state? We can expect that LBMP will fall during daytime or sunny days and rise during night and rainy days because renewables such as solar energy can generate electricity at a cheaper price. """ ) @@ -75,9 +72,7 @@ def main() -> None: """ * NY state Energy Market & Operational Data: https://www.nyiso.com/real-time-dashboard https://mis.nyiso.com/public/P-24Alist.htm - * Oil price: https://www.eia.gov/dnav/pet/pet_pri_spt_s1_d.htm * Natural gas price: https://www.eia.gov/dnav/ng/hist/rngwhhdD.htm - * Air Quality https://www.epa.gov/outdoor-air-quality-data/download-daily-data """ ) @@ -102,9 +97,6 @@ def main() -> None: 2. External Confounding Factors Weather conditions (temperature, precipitation, solar radiation) simultaneously affect demand, renewable generation, and air quality. Failing to control for these factors may bias estimated relationships. - - 3. Air Quality Attribution - Air quality indicators such as PM2.5 or NO₂ are influenced by multiple emission sources beyond electricity generation (e.g., transportation, industrial activity). Isolating the contribution of electricity demand may therefore require additional controls or robustness checks. """ ) diff --git a/dataloading.md b/lab/dataloading.md similarity index 100% rename from dataloading.md rename to lab/dataloading.md diff --git a/research.ipynb b/research.ipynb deleted file mode 100644 index 7d65b2a..0000000 --- a/research.ipynb +++ /dev/null @@ -1,651 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "1a9e8f84", - "metadata": {}, - "source": [ - "# 1. Introduction\n", - "We are going to analyze the relationship between electricity price and the dynamics of demand and supply. Generally, higher demand is expected to correlate with increased electricity price. Based on the hourly electricity data provided by NY state, we will examine how wholesale price fluctuates in response to demand shifts.\n", - "\n", - "Another factor influencing price is the fuel mix. When a larger scale of electricity is generated from fossil fuels, price become increasingly sensitive to volatility in global energy markets, such as oil and natural gas.\n", - "\n", - "Futhermore, we also focus on the environmental externalities of energy consumption, analyzing its impact on indicators such as air quality." - ] - }, - { - "cell_type": "markdown", - "id": "065dbed0", - "metadata": {}, - "source": [ - "# 2. Potential Research Question:\n", - "* How does the Locational Based Marginal Price(LBMP) in NYC fluctuate according to the change in demand? (Although LBMP is a wholesale price, not a retail price, analyzing LBMP instead of retail price is insightful because LBMP reflects dynamic demand shifts more clearly than retail price, which does not change so often compared to the wholesale price.)\n", - "* How does the change in fuel mix during the day affect the LBMP in NY state? I can expect that LBMP will fall during daytime or sunny days and rise during night and rainy days because renewables such as solar energy can generate electricity at a cheaper price. \n", - "* How does the change in energy prices, for example, the rise of the price of natural gas, affect the LBMP? We can of course include several kinds of energy sources, like oil and coal.\n", - "\n", - "(If time allows)\n", - "* What is the relationship between electricity consumption and air quality in the NY state? (We can analyze the relation between absolute electricity usage and air quality, or the proportion of green energy and air quality. At the same time, air quality has multiple aspects, such as CO/PM2.5/NO2, which also brings us space for in-depth discussion)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "a1d98c8b", - "metadata": {}, - "source": [ - "\n", - "# 3. Data\n", - "* NY state Energy Market &\n", - "Operational Data: https://www.nyiso.com/real-time-dashboard https://mis.nyiso.com/public/P-24Alist.htm\n", - "* Oil price: https://www.eia.gov/dnav/pet/pet_pri_spt_s1_d.htm\n", - "* Natural gas price: https://www.eia.gov/dnav/ng/hist/rngwhhdD.htm\n", - "* Air Quality https://www.epa.gov/outdoor-air-quality-data/download-daily-data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f451a4a8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Time StampNamePTIDLBMP ($/MWHr)Marginal Cost Losses ($/MWHr)Marginal Cost Congestion ($/MWHr)
002/11/2026 00:05:00CAPITL6175780.463.970.00
102/11/2026 00:05:00CENTRL6175466.70-2.457.34
202/11/2026 00:05:00DUNWOD6176079.392.910.00
302/11/2026 00:05:00GENESE6175349.86-4.8221.81
402/11/2026 00:05:00H Q6184477.330.840.00
.....................
389502/11/2026 23:00:00NORTH61755128.813.880.00
389602/11/2026 23:00:00NPX61845102.144.2527.05
389702/11/2026 23:00:00O H6184692.87-12.7419.32
389802/11/2026 23:00:00PJM6184735.00-3.6386.31
389902/11/2026 23:00:00WEST61752114.94-10.000.00
\n", - "

3900 rows × 6 columns

\n", - "
" - ], - "text/plain": [ - " Time Stamp Name PTID LBMP ($/MWHr) \\\n", - "0 02/11/2026 00:05:00 CAPITL 61757 80.46 \n", - "1 02/11/2026 00:05:00 CENTRL 61754 66.70 \n", - "2 02/11/2026 00:05:00 DUNWOD 61760 79.39 \n", - "3 02/11/2026 00:05:00 GENESE 61753 49.86 \n", - "4 02/11/2026 00:05:00 H Q 61844 77.33 \n", - "... ... ... ... ... \n", - "3895 02/11/2026 23:00:00 NORTH 61755 128.81 \n", - "3896 02/11/2026 23:00:00 NPX 61845 102.14 \n", - "3897 02/11/2026 23:00:00 O H 61846 92.87 \n", - "3898 02/11/2026 23:00:00 PJM 61847 35.00 \n", - "3899 02/11/2026 23:00:00 WEST 61752 114.94 \n", - "\n", - " Marginal Cost Losses ($/MWHr) Marginal Cost Congestion ($/MWHr) \n", - "0 3.97 0.00 \n", - "1 -2.45 7.34 \n", - "2 2.91 0.00 \n", - "3 -4.82 21.81 \n", - "4 0.84 0.00 \n", - "... ... ... \n", - "3895 3.88 0.00 \n", - "3896 4.25 27.05 \n", - "3897 -12.74 19.32 \n", - "3898 -3.63 86.31 \n", - "3899 -10.00 0.00 \n", - "\n", - "[3900 rows x 6 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "electricity_price = pd.read_csv(\"data/20260211realtime_zone.csv\")\n", - "electricity_price" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fe199eae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Time StampTime ZoneNamePTIDLoad
002/11/2026 00:00:00ESTCAPITL617571491.9739
102/11/2026 00:00:00ESTCENTRL617541871.3770
202/11/2026 00:00:00ESTDUNWOD61760658.6837
302/11/2026 00:00:00ESTGENESE617531078.2270
402/11/2026 00:00:00ESTHUD VL617581234.7306
..................
276702/11/2026 20:40:00ESTMHK VL617561137.2972
276802/11/2026 20:40:00ESTMILLWD61759401.5378
276902/11/2026 20:40:00ESTN.Y.C.617616566.6143
277002/11/2026 20:40:00ESTNORTH61755770.9277
277102/11/2026 20:40:00ESTWEST617521927.7678
\n", - "

2772 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " Time Stamp Time Zone Name PTID Load\n", - "0 02/11/2026 00:00:00 EST CAPITL 61757 1491.9739\n", - "1 02/11/2026 00:00:00 EST CENTRL 61754 1871.3770\n", - "2 02/11/2026 00:00:00 EST DUNWOD 61760 658.6837\n", - "3 02/11/2026 00:00:00 EST GENESE 61753 1078.2270\n", - "4 02/11/2026 00:00:00 EST HUD VL 61758 1234.7306\n", - "... ... ... ... ... ...\n", - "2767 02/11/2026 20:40:00 EST MHK VL 61756 1137.2972\n", - "2768 02/11/2026 20:40:00 EST MILLWD 61759 401.5378\n", - "2769 02/11/2026 20:40:00 EST N.Y.C. 61761 6566.6143\n", - "2770 02/11/2026 20:40:00 EST NORTH 61755 770.9277\n", - "2771 02/11/2026 20:40:00 EST WEST 61752 1927.7678\n", - "\n", - "[2772 rows x 5 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "electricity_demand = pd.read_csv(\"data/20260211pal.csv\")\n", - "electricity_demand" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "eabab0a7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
error
codeAPI_KEY_MISSING
messageNo api_key was supplied. Please register for ...
\n", - "
" - ], - "text/plain": [ - " error\n", - "code API_KEY_MISSING\n", - "message No api_key was supplied. Please register for ..." - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import requests\n", - "\n", - "url = \"https://api.eia.gov/v2/petroleum/pri/spt/data/?frequency=daily&data[0]=value&facets[series][]=RWTC&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=5000\"\n", - "response = requests.get(url)\n", - "data = response.json()\n", - "oil_price = pd.DataFrame(data)\n", - "oil_price" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "8fa53286", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateHenry Hub Natural Gas Spot Price (Dollars per Million Btu)
01997-01-073.82
11997-01-083.80
21997-01-093.61
31997-01-103.92
41997-01-134.00
.........
73012026-02-034.11
73022026-02-046.88
73032026-02-055.28
73042026-02-064.37
73052026-02-093.25
\n", - "

7306 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " Date Henry Hub Natural Gas Spot Price (Dollars per Million Btu)\n", - "0 1997-01-07 3.82 \n", - "1 1997-01-08 3.80 \n", - "2 1997-01-09 3.61 \n", - "3 1997-01-10 3.92 \n", - "4 1997-01-13 4.00 \n", - "... ... ... \n", - "7301 2026-02-03 4.11 \n", - "7302 2026-02-04 6.88 \n", - "7303 2026-02-05 5.28 \n", - "7304 2026-02-06 4.37 \n", - "7305 2026-02-09 3.25 \n", - "\n", - "[7306 rows x 2 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "natural_gas_price = pd.read_excel(\"data/RNGWHHDd.xls\", sheet_name=1, skiprows=2)\n", - "natural_gas_price" - ] - }, - { - "cell_type": "markdown", - "id": "1eb520fe", - "metadata": {}, - "source": [ - "# 4. Target visualization (Under revision...)\n" - ] - }, - { - "cell_type": "markdown", - "id": "93eae14a", - "metadata": {}, - "source": [ - "\n", - "# 5. Know/Unknown\n", - "#### Known:\n", - "First, wholesale electricity prices such as LBMP are highly sensitive to short-term demand fluctuations. During peak demand hours, marginal generation units with higher production costs are dispatched, leading to sharp price increases. This mechanism is well-documented in electricity market theory.\n", - "\n", - "Second, the fuel mix plays a crucial role in price formation. In NYISO, natural gas-fired plants frequently serve as marginal generators due to their operational flexibility and significant share in load-following capacity. As a result, wholesale electricity prices are expected to exhibit strong sensitivity to natural gas price fluctuations.\n", - "\n", - "\n", - "#### Unknow: \n", - "Despite these stylized facts, several important uncertainties remain.\n", - "\n", - "First, the magnitude and timing of the dynamic adjustment between demand shocks and LBMP are unclear. It is not obvious whether price responses are immediate, persistent, or asymmetric across peak and off-peak periods.\n", - "\n", - "Second, the intraday variation in fuel mix and its interaction with demand remains underexplored. It is uncertain whether renewable penetration significantly moderates price spikes or merely shifts volatility across hours.\n", - "\n", - "Third, while natural gas prices are expected to influence LBMP, the strength and stability of this pass-through over time remain empirical questions. Structural changes in the energy market may alter this relationship.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "fd8be8e4", - "metadata": {}, - "source": [ - "# 6. Expected challenges\n", - "1. High-Frequency Data Complexity\n", - "\n", - "The hourly (or sub-hourly) nature of LBMP and demand data introduces substantial volatility and noise. Short-term price spikes may obscure systematic patterns, requiring careful filtering or aggregation strategies.\n", - "\n", - "2. External Confounding Factors\n", - "\n", - "Weather conditions (temperature, precipitation, solar radiation) simultaneously affect demand, renewable generation, and air quality. Failing to control for these factors may bias estimated relationships.\n", - "\n", - "3. Air Quality Attribution\n", - "\n", - "Air quality indicators such as PM2.5 or NO₂ are influenced by multiple emission sources beyond electricity generation (e.g., transportation, industrial activity). Isolating the contribution of electricity demand may therefore require additional controls or robustness checks.\n" - ] - }, - { - "cell_type": "markdown", - "id": "565fbce4", - "metadata": {}, - "source": [ - "### Open in Colab button\n", - "\n", - " \"Open\n", - "" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv (3.13.12)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/test/test_market_analysis.py b/test/test_market_analysis.py index 4e4665e..6f548a2 100644 --- a/test/test_market_analysis.py +++ b/test/test_market_analysis.py @@ -24,35 +24,33 @@ def test_load_nyiso_realtime(): - selected_month_1 = "2025-12-01" + selected_month_1 = ["2025-12"] sql_1 = """ - SELECT Time_Stamp, Name, LBMP____MWHr_ - FROM `sipa-adv-c-dancing-cactus.dataset.market_analysis` - WHERE Time_Stamp >= '2025-12-01' - AND Time_Stamp < '2026-01-01' + SELECT hourly_time_stamp, Name, LBMP + FROM `sipa-adv-c-dancing-cactus.dataset.hourly_lbmp` + WHERE FORMAT_DATE('%Y-%m', hourly_time_stamp) = '2025-12' """ client_job = client.query(sql_1) expected_df_1 = client_job.to_dataframe() selected_month_1_actual = ( load_nyiso_realtime(selected_month_1) - .sort_values(by=["Time_Stamp", "Name"]) + .sort_values(by=["hourly_time_stamp", "Name"]) .reset_index(drop=True) ) expected_df_1_actual = expected_df_1.sort_values( - by=["Time_Stamp", "Name"] + by=["hourly_time_stamp", "Name"] ).reset_index(drop=True) pd.testing.assert_frame_equal(selected_month_1_actual, expected_df_1_actual) - selected_month_2 = "2024-05-01" + selected_month_2 = ["2024-05"] sql_2 = """ - SELECT Time_Stamp, Name, LBMP____MWHr_ - FROM `sipa-adv-c-dancing-cactus.dataset.market_analysis` - WHERE Time_Stamp >= '2024-05-01' - AND Time_Stamp < '2024-06-01' + SELECT hourly_time_stamp, Name, LBMP + FROM `sipa-adv-c-dancing-cactus.dataset.hourly_lbmp` + WHERE FORMAT_DATE('%Y-%m', hourly_time_stamp) = '2024-05' """ client_job = client.query(sql_2) @@ -60,11 +58,11 @@ def test_load_nyiso_realtime(): selected_month_2_actual = ( load_nyiso_realtime(selected_month_2) - .sort_values(by=["Time_Stamp", "Name"]) + .sort_values(by=["hourly_time_stamp", "Name"]) .reset_index(drop=True) ) expected_df_2_actual = expected_df_2.sort_values( - by=["Time_Stamp", "Name"] + by=["hourly_time_stamp", "Name"] ).reset_index(drop=True) pd.testing.assert_frame_equal(selected_month_2_actual, expected_df_2_actual) @@ -73,25 +71,31 @@ def test_load_nyiso_realtime(): def test_get_processed_electricity_data(): test_data = pd.DataFrame( { - "Time_Stamp": pd.to_datetime( + "hourly_time_stamp": pd.to_datetime( ["2026-01-01 00:00", "2026-01-01 12:00", "2026-01-02 00:00"] ), "Name": ["Zone A", "Zone A", "Zone A"], - "LBMP____MWHr_": [10.0, 20.0, 30.0], + "LBMP": [10.0, 20.0, 30.0], } ) result = get_processed_electricity_data(test_data, "Zone A") assert ( - result[result["Time_Stamp"] == pd.to_datetime("2026-01-01")]["mean"].iloc[0] + result[result["hourly_time_stamp"] == pd.to_datetime("2026-01-01")][ + "mean" + ].iloc[0] == 15.0 ) assert ( - result[result["Time_Stamp"] == pd.to_datetime("2026-01-02")]["mean"].iloc[0] + result[result["hourly_time_stamp"] == pd.to_datetime("2026-01-02")][ + "mean" + ].iloc[0] == 30.0 ) assert ( - result[result["Time_Stamp"] == pd.to_datetime("2026-01-01")]["max"].iloc[0] + result[result["hourly_time_stamp"] == pd.to_datetime("2026-01-01")]["max"].iloc[ + 0 + ] == 20.0 )