diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3bcb2d72..4ccefbfa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,6 +45,10 @@ jobs: toxenv: py312 python-version: 3.12 + - name: python 3.13 + toxenv: py313 + python-version: 3.13 + steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/CHANGES.rst b/CHANGES.rst index 3e926435..51029714 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,9 +1,13 @@ 1.1.4 (unreleased) ================== +analyzer +-------- +- Minor fixes and improvements to plotting functions in explore module along with updated docstrings [#148] + builder ------- -- Retrain JWST models on data collected through January 8, 2026 (v5) and use t0 targ ID instead of 9-digit source ID for NRC WFSS Parallel_Pure [#147] +- Retrain JWST models on data collected through January 8, 2026 (v5) and use t0 targ ID instead of 9-digit source ID for NRC WFSS Parallel_Pure [#147] 1.1.3 (2025-07-08) diff --git a/README.md b/README.md index f0a50837..e24ecca9 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ $ pytest --env jwstcal -m jwst **JWST Calibration Pipeline Resource Prediction (JWST)** -[JWST CAL Docs](https://spacekit.readthedocs.io/en/latest/skopes/jwst/cal.html) +[JWST CAL Docs](https://www.spacekit.org/en/latest/skopes/jwst/cal/index.html) * Inference ``spacekit.skopes.jwst.cal.predict`` @@ -86,7 +86,7 @@ $ python -m spacekit.skopes.jwst.cal.predict /path/to/inputs --pid 1076 **Single Visit Mosaic Alignment (HST)** -[SVM Docs](https://spacekit.readthedocs.io/en/latest/skopes/hst/svm.html) +[SVM Docs](https://www.spacekit.org/en/latest/skopes/hst/svm/index.html) * Preprocessing: ``spacekit.skopes.hst.svm.prep`` * Predict Image Alignments: ``spacekit.skopes.hst.svm.predict`` @@ -97,7 +97,7 @@ $ python -m spacekit.skopes.jwst.cal.predict /path/to/inputs --pid 1076 **HST Calibration Pipeline Resource Prediction (HST)** -[HST CAL 
Docs](https://spacekit.readthedocs.io/en/latest/skopes/hst/cal.html) +[HST CAL Docs](https://www.spacekit.org/en/latest/skopes/hst/cal/index.html) * Training ``spacekit.skopes.hst.cal.train`` * Inference ``spacekit.skopes.hst.cal.predict`` @@ -105,7 +105,7 @@ $ python -m spacekit.skopes.jwst.cal.predict /path/to/inputs --pid 1076 **Exoplanet Detection with time-series photometry (K2, TESS)** -[K2 Docs](https://spacekit.readthedocs.io/en/latest/skopes/kepler/light-curves.html) +[K2 Docs](https://www.spacekit.org/en/latest/skopes/kepler/light_curves.html) * ``spacekit.skopes.kepler.light_curves`` diff --git a/docker/images/dashboard_image/templates/aws/.env b/docker/images/dashboard_image/templates/aws/.env index c04ae308..c23c4452 100644 --- a/docker/images/dashboard_image/templates/aws/.env +++ b/docker/images/dashboard_image/templates/aws/.env @@ -7,7 +7,6 @@ SPACEKIT_VERSION="1.1.3" SRC="s3" # pkg, s3, git, file COLLECTION="" # bucketname, repo url, or local path DATASETS="2022-02-14-1644848448,2021-11-04-1636048291,2021-10-28-1635457222" -# DATASETS="2021-11-04-1636048291,2021-10-28-1635457222,2021-08-22-1629663047" PFX="archive" # for s3 this is the folder prefix SPACEKIT_DATA=/home/developer # top-level dir where "data" folder is created CONTAINER_MODE="-it" # -d for detached, -it for interactive diff --git a/docker/images/dashboard_image/templates/latest/.env b/docker/images/dashboard_image/templates/latest/.env index 2e0aa016..7bc9d000 100644 --- a/docker/images/dashboard_image/templates/latest/.env +++ b/docker/images/dashboard_image/templates/latest/.env @@ -11,8 +11,8 @@ PFX="archive" # for s3 this is the folder prefix SPACEKIT_DATA=/home/developer # top-level dir where "data" folder is created CONTAINER_MODE="-d" # -it for interactive, -d for detached MOUNTS=0 # 1 will bind mount the below source and dest paths -SOURCEDATA="" #"$(pwd)" -DESTDATA="" #"/home/developer/spacekit" +SOURCEDATA="" # $(pwd) +DESTDATA="" # /home/developer/spacekit 
HOSTNAME="localhost" IPADDRESS=0.0.0.0 NAME="spkt-dash-${APP}-${VERSION}" diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js index c0ebfcb1..5e9c2d43 100644 --- a/docs/source/_static/js/custom.js +++ b/docs/source/_static/js/custom.js @@ -273,7 +273,7 @@ mouse_ball.y = e.pageY; function hideAds() { - const adIds = ["readthedocs-ea", "readthedocs-ea-text-footer"]; + const adIds = ["readthedocs-ea", "readthedocs-ea-text-footer", "readthedocs-ea-text-nostyle-nodoctool"]; adIds.forEach(element => { var rtdAd = document.getElementById(element); if (rtdAd != null && !rtdAd.hasAttribute("hidden")) { @@ -306,4 +306,6 @@ waitForVariable('ethicalads') .catch((error) => { console.error(error); // handle timeout or error - }, 2000); \ No newline at end of file + }, 2000); + +hideAds(); \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 679a0cba..ceb128c6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -3,22 +3,13 @@ # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-# import os import sys import datetime import importlib -from packaging.version import Version from configparser import ConfigParser - - +# -- Path setup -------------------------------------------------------------- def setup(app): try: app.add_css_file("stsci.css") diff --git a/docs/source/overview.rst b/docs/source/overview.rst index f1e9b32e..116d6dff 100644 --- a/docs/source/overview.rst +++ b/docs/source/overview.rst @@ -9,7 +9,7 @@ the Astronomical Data Science and Machine Learning Toolkit Overview ======== -Spacekit is a python library designed to do the heavy lifting of machine learning in astronomy-related applications using . +Spacekit is a python library designed to do the heavy lifting of machine learning in astronomy-related applications. The modules contained in this package can be used to assist and streamline each step of a typical data science project: @@ -27,7 +27,7 @@ The modules contained in this package can be used to assist and streamline each Applications ------------ -The :doc:`Skøpes ` module includes real-world machine learning applications used by the Hubble and James Webb Space Telescopes in data calibration pipelines. These mini-applications are an orchestration of functions and classes from other spacekit modules to run real-time, automated analysis, training, and inference on a local server as well as in the cloud (AWS). +The :doc:`Skøpes ` module includes real-world machine learning applications used by the Hubble and James Webb Space Telescopes in data calibration pipelines. These mini-applications are an orchestration of functions and classes from other spacekit modules to run automated analysis, training, and inference in real-time on a local server or in the cloud (AWS). 
Indices and tables diff --git a/scripts/build.sh b/scripts/build.sh index a7ecb65a..0374d1ef 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -1,4 +1,4 @@ -#!/bin/bash -xu +#!/bin/bash -e source "./docker/images/dashboard_image/.env" DOCKER_FILE=./docker/images/dashboard_image/Dockerfile diff --git a/scripts/launch.sh b/scripts/launch.sh index cf9f7600..633f2ebe 100755 --- a/scripts/launch.sh +++ b/scripts/launch.sh @@ -1,4 +1,6 @@ -#!/bin/bash -xu +#!/bin/bash -e +# To view notebooks in -it mode: jupyter-lab --ip 0.0.0.0 + envfile="./docker/images/dashboard_image/.env" source $envfile @@ -6,15 +8,23 @@ if [ "${CONTAINER_MODE}" == "-it" ]; then EPCOMMAND="/bin/bash" fi -if [ $MOUNTS -ne 0 ]; then - docker run ${CONTAINER_MODE} \ - -p 8080:8050 -p 8888:8888 \ - --name $NAME \ - --mount type=bind,source=${SOURCEDATA},target=${DESTDATA} \ - $DOCKER_IMAGE $EPCOMMAND +existing=`docker ps -aqf "name=${NAME}"` + +if [[ -z $existing ]]; then + if [ $MOUNTS -ne 0 ]; then + docker run ${CONTAINER_MODE} \ + -p 8050:8050 \ + --name $NAME \ + --mount type=bind,source=${SOURCEDATA},target=${DESTDATA} \ + $DOCKER_IMAGE $EPCOMMAND + else + docker run ${CONTAINER_MODE} \ + -p 8050:8050 \ + --name $NAME \ + $DOCKER_IMAGE $EPCOMMAND + fi else - docker run ${CONTAINER_MODE} \ - -p 8080:8050 -p 8888:8888 \ - --name $NAME \ - $DOCKER_IMAGE $EPCOMMAND + echo "Restarting existing container: ${NAME} (ID=${existing})" + docker container start $NAME + docker container exec ${CONTAINER_MODE} $NAME $EPCOMMAND fi diff --git a/setup.cfg b/setup.cfg index 634ea3be..1241868b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ edit_on_github = False github_project = spacetelescope/spacekit project_urls = Source = https://github.com/spacetelescope/spacekit - Documentation = https://spacekit.readthedocs.io + Documentation = https://www.spacekit.org Tracker = https://github.com/spacetelescope/spacekit/issues classifiers= Intended Audience :: Science/Research @@ -24,6 +24,7 @@ classifiers= 
Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 [options] zip_safe = False @@ -37,7 +38,7 @@ install_requires = tensorflow>=2.16.1 astropy boto3 - numpy>=1.22 + numpy>=1.25 pandas scikit-learn>=1.3.2 diff --git a/spacekit/analyzer/explore.py b/spacekit/analyzer/explore.py index 8e4a1d8e..14dfdc0a 100644 --- a/spacekit/analyzer/explore.py +++ b/spacekit/analyzer/explore.py @@ -1,5 +1,6 @@ -# STANDARD libraries import os +import re +import warnings import numpy as np import pandas as pd from scipy.stats import iqr @@ -43,10 +44,27 @@ px = None +try: + from astropy.timeseries import TimeSeries, BoxLeastSquares, aggregate_downsample + from astropy import units as u + from astropy.stats import sigma_clipped_stats + from astropy.io import fits +except ImportError: + TimeSeries = None + + +def check_ast_imports(): + return TimeSeries is not None + + def check_viz_imports(): return go is not None +def check_mpl_imports(): + return mpl is not None and plt is not None + + class ImagePreviews: """Base parent class for rendering and displaying images as plots""" @@ -241,7 +259,7 @@ def preview_og_syn_pair(self, img_name): class DataPlots: - """Parent class for drawing exploratory data analysis plots from a dataframe.""" + """Base class for drawing exploratory data analysis plots from a dataframe.""" def __init__( self, @@ -250,6 +268,7 @@ def __init__( height=700, show=False, save_html=None, + telescope=None, name="DataPlots", **log_kws, ): @@ -260,21 +279,23 @@ def __init__( self.height = height self.show = show self.save_html = save_html + self.telescope = telescope self.target = None # target (y) name e.g. "label", "memory", "wallclock" self.labels = None # self.classes = None # target classes e.g. [0,1] or [0,1,2,3] self.n_classes = None - self.group = None # e.g. "detector" or "instr" + self.group = None # e.g. 
"detector", "instr", "cat" self.gkeys = None + self.group_dict = None self.categories = None self.cmap = ["dodgerblue", "gold", "fuchsia", "lime"] self.continuous = None self.categorical = None self.feature_list = None - self.telescope = None self.figures = None self.scatter = None self.bar = None + self.box = None self.groupedbar = None self.kde = None if not check_viz_imports(): @@ -287,46 +308,89 @@ def __init__( ) def group_keys(self): - if self.group in ["instr", "instrument"]: - keys = ["acs", "cos", "stis", "wfc3"] - elif self.group in ["det", "detector"]: - uniq = list(self.df[self.group].unique()) - if len(uniq) == 2: - keys = ["wfc-uvis", "other"] - else: - keys = ["hrc", "ir", "sbc", "uvis", "wfc"] - # TODO: target classification / "category" - elif self.group in ["cat", "category"]: - keys = [ - "calibration", - "galaxy", - "galaxy_cluster", - "ISM", - "star", - "stellar_cluster", - "unidentified", - ] - # TODO: filters - group_keys = dict(enumerate(keys)) - return group_keys - - def map_data(self): - """Instantiates grouped dataframes for each detector + """Generates numerically ordered key-pairs for each unique value of self.group found in the dataframe Returns ------- dict - data_map dictionary of grouped data frames and color map + enumerated dictionary of unique values for each group """ - if self.cmap is None: - cmap = ["#119dff", "salmon", "#66c2a5", "fuchsia", "#f4d365"] + if not self.group: + self.log.error( + "Cannot generate group keys if no grouping feature specified. Set the `group` attribute then try again." 
+ ) + if self.group.startswith("instr"): + return dict(enumerate(self.instr_keys())) + elif self.group.startswith("det"): + return dict(enumerate(self.det_keys())) + elif self.group.startswith("cat"): + return dict(enumerate(self.targ_class_keys())) else: - cmap = self.cmap + return dict(enumerate(sorted(list(self.df[self.group].unique())))) + + def instr_keys(self): + """Generates a list of instruments based on self.telescope + + Returns + ------- + list + list of instrument keys for the specified telescope + """ + if self.telescope not in ["hst", "jwst"]: + return [] + return dict(hst=["acs", "wfc3", "cos", "stis"], jwst=["fgs", "miri", "nircam", "niriss", "nirspec"])[ + self.telescope.lower() + ] + + def det_keys(self): + """Creates a list of detectors based on self.telescope + + Returns + ------- + list + list of detector keys for the specified telescope + """ + keys = sorted(list(self.df[self.group].unique())) + if self.telescope.lower() == "hst": + if len(keys) == 2: + return ["wfc-uvis", "other"] + if not isinstance(keys[0], str) and len(keys) == 5: + return ["hrc", "ir", "sbc", "uvis", "wfc"] + return keys + + def targ_class_keys(self): + """List of standard astronomical target classification categories + + Returns + ------- + list + standard target classification categories + """ + return [ + "calibration", + "galaxy", + "galaxy_cluster", + "ISM", + "star", + "stellar_cluster", + "unidentified", + ] + + def map_df_by_group(self): + """Instantiates `group_dict` as a dictionary of grouped dataframes and color map""" + self.group_dict = {} + for k, v in self.gkeys.items(): + self.group_dict[v] = [self.df.groupby(self.group).get_group(k), self.cmap[k]] + + def map_data(self): + """Instantiates `data_map` as a dictionary of grouped dataframes and color maps for each category in `categories` attribute.""" + cmap = ["#119dff", "salmon", "#66c2a5", "fuchsia", "#f4d365"] if self.cmap is None else self.cmap + if not self.categories: + self.feature_subset() 
self.data_map = {} for key, name in self.gkeys.items(): data = self.categories[name] self.data_map[name] = dict(data=data, color=cmap[key]) - return self.data_map def feature_subset(self): """Create a set of groups from a categorical feature (dataframe column). Used for plotting multiple traces on a figure @@ -339,10 +403,8 @@ def feature_subset(self): self.categories = {} feature_groups = self.df.groupby(self.group) for i in list(range(len(feature_groups))): - dx = feature_groups.get_group(i) k = self.gkeys[i] - self.categories[k] = dx - return self.categories + self.categories[k] = feature_groups.get_group(i) def feature_stats_by_target(self, feature): """Calculates statistical info (mean and standard deviation) for a feature within each target class. @@ -369,6 +431,30 @@ def feature_stats_by_target(self, feature): return means, errs def make_subplots(self, figtype, xtitle, ytitle, data1, data2, name1, name2): + """Generates figure with multiple subplots for two sets of data using previously generated figures. + + Parameters + ---------- + figtype : str + type of figure being generated (used for saving html file) + xtitle : str + title for the x-axis + ytitle : str + title for the y-axis + data1 : go.Figure + figure object for the first set of data + data2 : go.Figure + figure object for the second set of data + name1 : str + name for the first subplot + name2 : str + name for the second subplot + + Returns + ------- + go.Figure + figure object containing the subplots + """ fig = subplots.make_subplots( rows=1, cols=2, @@ -401,7 +487,7 @@ def make_subplots(self, figtype, xtitle, ytitle, data1, data2, name1, name2): pyo.plot(fig, filename=f"{self.save_html}/{figtype}_{self.name1}_vs_{self.name2}") return fig - def make_scatter_figs( + def make_target_scatter_figs( self, xaxis_name, yaxis_name, @@ -410,6 +496,27 @@ def make_scatter_figs( categories=None, target=None, ): + """Generates scatterplots for two features in the dataframe, grouped by target classes. 
+ + Parameters + ---------- + xaxis_name : str + column name in dataframe to plot on x-axis + yaxis_name : str + column name in dataframe to plot on y-axis + marker_size : int, optional + marker size for scatter plot points, by default 15 + cmap : list, optional + list of colors for different target classes, by default ["cyan", "fuchsia"] + categories : dict, optional + dictionary of categories to group data by, by default None + target : str, optional + name of target column in dataframe, by default None + Returns + ------- + list + list of scatterplot figures for each category + """ if categories is None: categories = {"all": self.df} if target is None: @@ -456,12 +563,67 @@ def make_scatter_figs( scatter_figs.append(fig) return scatter_figs + def make_feature_scatter_figs(self, xaxis_name, yaxis_name): + """Generates scatterplots for two features in the dataframe, grouped by the `group` attribute. + + Parameters + ---------- + xaxis_name : str + name of column in dataframe to plot on x-axis + yaxis_name : str + name of column in dataframe to plot on y-axis + + Returns + ------- + list + scatterplot figures for each group in self.group attribute + """ + if self.data_map is None: + self.map_data() + scatter_figs = [] + for key, datacolor in self.data_map.items(): + data = datacolor["data"] + color = datacolor["color"] + trace = go.Scatter( + x=data[xaxis_name], + y=data[yaxis_name], + text=data.index, + mode="markers", + opacity=0.7, + marker={"size": 15, "color": color}, + name=key, + ) + layout = go.Layout( + xaxis={"title": xaxis_name}, + yaxis={"title": yaxis_name}, + title=key, + hovermode="closest", + paper_bgcolor="#242a44", + plot_bgcolor="#242a44", + font={"color": "#ffffff"}, + ) + fig = go.Figure(data=trace, layout=layout) + scatter_figs.append(fig) + return scatter_figs + def make_target_scatter(self, target=None): + """Generates target vs feature scatterplot for a given target (by default self.target) for each feature in self.feature_list. 
+ + Parameters + ---------- + target : str, optional + target column name, by default None + + Returns + ------- + list + target-feature scatterplot figures for each feature in self.feature_list + """ if target is None: target = self.target target_figs = {} for f in self.feature_list: - target_figs[f] = self.make_scatter_figs(f, target) + target_figs[f] = self.make_target_scatter_figs(f, target) return target_figs def bar_plots( @@ -474,6 +636,30 @@ def bar_plots( height=500, cmap=["dodgerblue", "fuchsia"], ): + """Draws a bar plot for a feature, grouped by the `group` attribute. + + Parameters + ---------- + X : array-like + X-axis values + Y : array-like + Y-axis values + feature : str + Feature name + y_err : list, optional + Y-axis error values, by default [None, None] + width : int, optional + Width of the plot, by default 700 + height : int, optional + Height of the plot, by default 500 + cmap : list, optional + List of colors for the plot, by default ["dodgerblue", "fuchsia"] + + Returns + ------- + go.Figure + Plotly Figure object representing the bar plot + """ traces = [] for i in self.classes: i = int(i) @@ -517,6 +703,32 @@ def kde_plots( height=500, cmap=["#F66095", "#2BCDC1"], ): + """Generates KDE plots for specified columns in the dataframe. 
+ + Parameters + ---------- + cols : list of str + List of column names to generate KDE plots for + norm : bool, optional + Whether to normalize the data, by default False + targets : bool, optional + Whether to group data by target classes, by default False + hist : bool, optional + Whether to show histogram, by default True + curve : bool, optional + Whether to show KDE curve, by default True + binsize : float, optional + Bin size for the histogram, by default 0.2 + height : int, optional + Height of the plot, by default 500 + cmap : list, optional + List of colors for the plot, by default ["#F66095", "#2BCDC1"] + + Returns + ------- + go.Figure + Plotly Figure object representing the KDE plot + """ if norm is True: df = PowerX(self.df, cols=cols, join_data=True).Xt cols = [c + "_scl" for c in cols] @@ -561,6 +773,26 @@ def kde_plots( return fig def scatter3d(self, x, y, z, mask=None, target=None): + """Generates a 3D scatterplot for three features in the dataframe. + + Parameters + ---------- + x : str + feature column name for x-axis + y : str + feature column name for y-axis + z : str + feature column name for z-axis + mask : pd.DataFrame, optional + DataFrame to use as a mask/filter, by default None + target : str, optional + target column name, by default None + + Returns + ------- + go.Figure + Plotly Figure object representing the 3D scatterplot + """ if mask is None: df = self.df else: @@ -595,6 +827,18 @@ def scatter3d(self, x, y, z, mask=None, target=None): return fig def remove_outliers(self, y_data): + """Removes outliers from a given pandas Series using the IQR method. + + Parameters + ---------- + y_data : pd.Series + The data from which to remove outliers. + + Returns + ------- + pd.Series + The data with outliers removed via IQR filtering. 
+ """ q = y_data.quantile([0.25, 0.75]).values q1, q3 = q[0], q[1] lower_fence = q1 - 1.5 * iqr(y_data) @@ -603,12 +847,23 @@ def remove_outliers(self, y_data): return y def box_plots(self, cols=None, outliers=True): + """Generates multi-trace box plots for each feature in cols param, with or without outliers + + Parameters + ---------- + cols : list, optional + features to plot from dataframe, by default None (uses self.continuous attribute) + outliers : bool, optional + whether to include outliers in the box plots, by default True + + Returns + ------- + dict + dictionary of plotly box plot figures for each feature in cols parameter + """ box = {} title_sfx = "" - if cols is None: - features = self.continuous - else: - features = cols + features = cols or self.continuous for f in features: traces = [] for i, name in enumerate(self.gkeys.values()): @@ -630,10 +885,53 @@ def box_plots(self, cols=None, outliers=True): box[f] = fig return box - def grouped_barplot(self, target="label", cmap=None, save=False): + def make_box_figs(self, vars: list): + """Generates single trace box plots, one plot for each var where `vars` is a list of columns in df + + Parameters + ---------- + vars : list + column names in dataframe to plot + + Returns + ------- + list + list of plotly box plot figures for each variable in vars parameter + """ + box_figs = [] + if not self.group_dict: + self.map_df_by_group() + for v in vars: + data = [go.Box(y=j[0][v], name=i) for i, j in self.group_dict.items()] + layout = go.Layout( + title=f"{v} by {self.group}", + hovermode="closest", + paper_bgcolor="#242a44", + plot_bgcolor="#242a44", + font={"color": "#ffffff"}, + ) + fig = go.Figure(data=data, layout=layout) + box_figs.append(fig) + return box_figs + + def grouped_barplot(self, target="label", cmap=None): + """Draws a grouped bar plot for a target column, grouped by the `group` attribute. 
+ + Parameters + ---------- + target : str, optional + target column to plot, by default "label" + cmap : list, optional + list of colors for the bars, by default None + + Returns + ------- + go.Figure + plotly figure object for the grouped bar plot + """ df = self.df if cmap is None: - cmap = ["red", "orange", "yellow", "purple", "blue"] + cmap = self.cmap or ["red", "orange", "yellow", "purple", "blue"] groups = df.groupby([self.group])[target] traces = [] for key, value in self.gkeys.items(): @@ -659,46 +957,34 @@ class HstSvmPlots(DataPlots): spacekit.analyzer.explore.DataPlots parent class """ - def __init__( - self, - df, - group="det", - width=1300, - height=700, - show=False, - save_html=None, - **log_kws, - ): + def __init__(self, df, group="det", width=1300, height=700, show=False, save_html=None, **log_kws): super().__init__( df, width=width, height=height, show=show, save_html=save_html, + telescope="hst", name="HstSvmPlots", **log_kws, ) self.group = group - self.telescope = "HST" self.target = "label" self.classes = list(set(df[self.target].values)) # [0, 1] self.labels = ["aligned", "misaligned"] self.n_classes = len(set(self.labels)) - self.gkeys = super().group_keys() - self.categories = self.feature_subset() + self.gkeys = self.group_keys() + self.cmap = ["#119dff", "salmon", "#66c2a5", "fuchsia", "#f4d365"] + self.feature_subset() self.continuous = ["rms_ra", "rms_dec", "gaia", "nmatches", "numexp"] self.categorical = ["det", "wcs", "cat"] self.feature_list = self.continuous + self.categorical - self.cmap = ["#119dff", "salmon", "#66c2a5", "fuchsia", "#f4d365"] - self.df_by_detector() - self.bar = None - self.scatter = None - self.kde = None + self.map_df_by_group() def draw_plots(self): - self.bar = self.alignment_bars() - self.scatter = self.alignment_scatters() - self.kde = self.alignment_kde() + self.alignment_bars() + self.alignment_scatters() + self.alignment_kde() def alignment_bars(self): self.bar = {} @@ -707,13 +993,11 @@ def 
alignment_bars(self): means, errs = self.feature_stats_by_target(f) bar = self.bar_plots(X, means, f, y_err=errs) self.bar[f] = bar - return self.bar def alignment_scatters(self): - rms_scatter = self.make_scatter_figs("rms_ra", "rms_dec", categories=self.categories) - source_scatter = self.make_scatter_figs("point", "segment", categories=self.categories) + rms_scatter = self.make_target_scatter_figs("rms_ra", "rms_dec", categories=self.categories) + source_scatter = self.make_target_scatter_figs("point", "segment", categories=self.categories) self.scatter = {"rms_ra_dec": rms_scatter, "point_segment": source_scatter} - return self.scatter def alignment_kde(self): cols = self.continuous @@ -723,64 +1007,29 @@ def alignment_kde(self): for i, c in enumerate(cols): self.kde["targ"][c] = targ[i] self.kde["norm"][c] = norm[i] - return self.kde - - # def group_keys(self): - # if self.group in ["det", "detector"]: - # keys = ["hrc", "ir", "sbc", "uvis", "wfc"] - # elif self.group in ["cat", "category"]: - # keys = [ - # "calibration", - # "galaxy", - # "galaxy_cluster", - # "ISM", - # "star", - # "stellar_cluster", - # "unidentified", - # ] - # group_keys = dict(enumerate(keys)) - # return group_keys - - def df_by_detector(self): - """Instantiates grouped dataframes for each detector - - Returns - ------- - self - """ - try: - self.hrc = self.df.groupby("det").get_group(0) - self.ir = self.df.groupby("det").get_group(1) - self.sbc = self.df.groupby("det").get_group(2) - self.uvis = self.df.groupby("det").get_group(3) - self.wfc = self.df.groupby("det").get_group(4) - self.instr_dict = { - "hrc": [self.hrc, "#119dff"], # lightblue - "ir": [self.ir, "salmon"], - "sbc": [self.sbc, "#66c2a5"], # lightgreen - "uvis": [self.uvis, "fuchsia"], - "wfc": [self.wfc, "#f4d365"], # softgold - } - except Exception as e: - print(e) - return self class HstCalPlots(DataPlots): - def __init__(self, df, group="instr", **log_kws): - super().__init__(df, name="HstCalPlots", **log_kws) - 
self.telescope = "HST" + def __init__(self, df, group="instr", width=1300, height=700, show=False, save_html=None, **log_kws): + super().__init__( + df, + width=width, + height=height, + show=show, + save_html=save_html, + telescope="hst", + name="HstCalPlots", + **log_kws, + ) self.target = "mem_bin" self.classes = [0, 1, 2, 3] self.group = group self.labels = ["2g", "8g", "16g", "64g"] self.gkeys = self.group_keys() - self.categories = self.feature_subset() - self.acs = None - self.cos = None - self.stis = None - self.wfc3 = None - self.instr_dict = None + self.group_dict = {} + self.cmap = ["dodgerblue", "gold", "fuchsia", "lime"] + self.data_map = None + self.feature_subset() self.instruments = list(self.df["instr_key"].unique()) self.continuous = ["n_files", "total_mb", "x_files", "x_size"] self.categorical = [ @@ -793,27 +1042,10 @@ def __init__(self, df, group="instr", **log_kws): "instr", ] self.feature_list = self.continuous + self.categorical - self.cmap = ["dodgerblue", "gold", "fuchsia", "lime"] - self.data_map = None - self.scatter = None - self.box = None self.scatter3 = None - def df_by_instr(self): - self.acs = self.df.groupby("instr").get_group(0) - self.cos = self.df.groupby("instr").get_group(1) - self.stis = self.df.groupby("instr").get_group(2) - self.wfc3 = self.df.groupby("instr").get_group(3) - self.instr_dict = { - "acs": [self.acs, "#119dff"], - "wfc3": [self.wfc3, "salmon"], - "cos": [self.cos, "#66c2a5"], - "stis": [self.stis, "fuchsia"], - } - return self - def draw_plots(self): - self.scatter = self.make_cal_scatterplots() + self.make_cal_scatterplots() self.box = self.box_plots() box_target = self.box_plots(cols=["memory", "wallclock"]) box_fenced = self.box_plots(cols=["memory", "wallclock"], outliers=False) @@ -828,333 +1060,473 @@ def draw_plots(self): def make_cal_scatterplots(self): memory_figs, wallclock_figs = {}, {} for f in self.feature_list: - memory_figs[f] = self.make_scatter_figs(f, "memory") - wallclock_figs[f] = 
self.make_scatter_figs(f, "wallclock") + memory_figs[f] = self.make_feature_scatter_figs(f, "memory") + wallclock_figs[f] = self.make_feature_scatter_figs(f, "wallclock") self.scatter = dict(memory=memory_figs, wallclock=wallclock_figs) - return self.scatter def make_cal_scatter3d(self): x, y = "memory", "wallclock" self.scatter3 = {} for z in self.continuous: data = self.df[[x, y, z, "instr_key"]] - scat3d = super().scatter3d(x, y, z, mask=data, target="instr_key", width=700, height=700) - self.scatter3[z] = scat3d - - def make_box_figs(self, vars): - box_figs = [] - for v in vars: - data = [ - go.Box(y=self.acs[v], name="acs"), - go.Box(y=self.cos[v], name="cos"), - go.Box(y=self.stis[v], name="stis"), - go.Box(y=self.wfc3[v], name="wfc3"), - ] - layout = go.Layout( - title=f"{v} by instrument", - hovermode="closest", - paper_bgcolor="#242a44", - plot_bgcolor="#242a44", - font={"color": "#ffffff"}, - ) - fig = go.Figure(data=data, layout=layout) - box_figs.append(fig) - return box_figs - - def make_scatter_figs(self, xaxis_name, yaxis_name): - if self.data_map is None: - self.map_data() - scatter_figs = [] - for instr, datacolor in self.data_map.items(): - data = datacolor["data"] - color = datacolor["color"] - trace = go.Scatter( - x=data[xaxis_name], - y=data[yaxis_name], - text=data.index, - mode="markers", - opacity=0.7, - marker={"size": 15, "color": color}, - name=instr, - ) - layout = go.Layout( - xaxis={"title": xaxis_name}, - yaxis={"title": yaxis_name}, - title=instr, - hovermode="closest", - paper_bgcolor="#242a44", - plot_bgcolor="#242a44", - font={"color": "#ffffff"}, - ) - fig = go.Figure(data=trace, layout=layout) - scatter_figs.append(fig) - return scatter_figs + self.scatter3[z] = super().scatter3d(x, y, z, mask=data, target="instr_key", width=700, height=700) class SignalPlots: - @staticmethod - def atomic_vector_plotter( - signal, - label_col=None, - classes=None, - class_names=None, - figsize=(15, 5), - y_units=None, - x_units=None, + """Class 
for plotting time series signals and their spectrograms.""" + + def __init__( + self, + show=False, + save_png=False, + target_cns={}, + color_map={}, + output_dir=None, + name="SignalPlots", + **log_kws, ): + """Class for manipulating and plotting time series signals and frequency spectrograms. + + Parameters + ---------- + show : bool, optional + display plot, by default False + save_png : str, optional + save plot as PNG file, by default False + target_cns : dict, optional + target label and string keypairs, by default {} + color_map : dict, optional + target label and color keypairs, by default {} """ - Plots scatter and line plots of time series signal values. - - **ARGS - signal: pandas series or numpy array - label_col: name of the label column if using labeled pandas series - -use default None for numpy array or unlabeled series. - -this is simply for customizing plot Title to include classification - classes: (optional- req labeled data) tuple if binary, array if multiclass - class_names: tuple or array of strings denoting what the classes mean - figsize: size of the figures (default = (15,5)) - - ****** - - Ex1: Labeled timeseries passing 1st row of pandas dataframe - > first create the signal: - signal = x_train.iloc[0, :] - > then plot: - atomic_vector_plotter(signal, label_col='LABEL',classes=[1,2], - class_names=['No Planet', 'Planet']), figsize=(15,5)) - - Ex2: numpy array without any labels - > first create the signal: - signal = x_train.iloc[0, :] - - >then plot: - atomic_vector_plotter(signal, figsize=(15,5)) + self.__name__ = name + self.log = Logger(self.__name__, **log_kws).spacekit_logger() + self.show = show + self.save_png = save_png + self.target_cns = target_cns + self.color_map = color_map + self.flux_col = "pdcsap_flux" + self.extra_cols = ["lc_start", "lc_end", "maxpower", "transit", "mean", "median", "stddev"] + self.output_dir = os.getcwd() if output_dir is None else output_dir + self.check_dependencies() + 
warnings.filterwarnings(action="ignore") # ignore astropy warnings + + def check_dependencies(self): + if not check_ast_imports() or not check_mpl_imports(): + self.log.error("astropy and/or matplotlib not installed.") + raise ImportError( + "You must have astropy and matplotlib installed " + f"for the {self.__name__} class to work." + "\n\nInstall extra deps via `pip install spacekit[x]`" + ) + + def parse_filename(self, fname, fmt="kepler.fits"): + """Extracts target information from FITS light curve file name. + + Parameters + ---------- + fname : str + path to FITS light curve file (llc or lc) + fmt : str, optional + 'kepler.fits' or 'tess.fits', by default "kepler.fits" + + Returns + ------- + tuple + target id (str), campaign/sector id (str) """ - import pandas as pd - import numpy as np - - # pass None to label_col if unlabeled data, creates generic title - if label_col is None: - label = None - title_scatter = "Scatterplot of Star Flux Signals" - title_line = "Line Plot of Star Flux Signals" - color = "black" - - # store target column as variable - elif label_col is not None: - label = signal[label_col] - # for labeled timeseries - if label == 1: - cn = class_names[0] - color = "red" - - elif label == 2: - cn = class_names[1] - color = "blue" - # TITLES - # create appropriate title acc to class_names - title_scatter = f"Scatterplot for Star Flux Signal: {cn}" - title_line = f"Line Plot for Star Flux Signal: {cn}" - - # Set x and y axis labels according to units - # if the units are unknown, we will default to "Flux" - if y_units is None: - y_units = "Flux" - else: - y_units = y_units - # it is assumed this is a timeseries, default to "time" - if x_units is None: - x_units = "Time" + fname = os.path.basename(fname) + if fmt == "kepler.fits": # r"ktwo{obs_id}-c{campaign}_llc.fits" + patt = r"ktwo(\d{9,15})-c(\d{2})_llc\.fits" + m = re.match(patt, fname) + if m: + return (m.group(1), m.group(2)) # tid, campaign + elif fmt == "tess.fits": # 
r"tess{date-time}-s{sctr}-{tid}-{scid}-{cr}_lc.fits" + patt = r"^tess(\d{13})-s(\d{4})-(\d{16,20})-(\d{4})-s_lc\.fits$" + m = re.match(patt, fname) + if m: + return (m.group(2), m.group(1)) # tid, sector else: - x_units = x_units - - # Scatter Plot - if isinstance(signal, np.array): - series_index = list(range(len(signal))) + raise ValueError("fmt must be 'kepler.fits' or 'tess.fits'") + raise ValueError("Filename does not match expected pattern") - converted_array = pd.Series(signal.ravel(), index=series_index) - signal = converted_array + @staticmethod + def read_ts_signal(fits_file, signal_col="pdcsap_flux", fmt="kepler.fits", offset=False, remove_nans=True): + """Reads time series signal data from a FITS light curve file (_llc.fits or _lc.fits for kepler and fits respectively). Optionally can + apply telescope-specific BJD offset as determined by `fmt` kwarg (most light curve files already have this applied) and remove NaN values from both signal and corresponding timestamp arrays. Regarding the `signal_col` defaults: "sap_flux" is Simple Aperture Photometry flux, the flux after summing the calibrated pixels within the telescope's optimal photometric aperture; the default (recommended) is "pdcsap_flux" (Pre-search Data Conditioned Simple Aperture Photometry, the SAP flux values nominally corrected for instrumental variations - these are the mission's best estimate of the intrinsic variability of the target.). 
- plt.figure(figsize=figsize) - plt.scatter( - pd.Series([i for i in range(1, len(signal))]), - signal[1:], - marker=4, - color=color, - ) - plt.ylabel(y_units) - plt.xlabel(x_units) - plt.title(title_scatter) - plt.show() + Parameters + ---------- + fits_file : str + path to FITS light curve file (llc or lc) + signal_col : str, optional + header column name containing the data, by default "pdcsap_flux" + fmt : str, optional + 'kepler.fits' or 'tess.fits', by default "kepler.fits" + offset : bool, optional + apply telescope-specific BJD offset to timestamps, by default False + remove_nans : bool, optional + remove NaN values from signal and timestamps, by default True - # Line Plot - plt.figure(figsize=figsize) - plt.plot(pd.Series([i for i in range(1, len(signal))]), signal[1:], color=color) - plt.ylabel(y_units) - plt.xlabel(x_units) - plt.title(title_line) - plt.show() + Returns + ------- + tuple + timestamps and time series signal data, each as a numpy array + """ + if fmt not in ["kepler.fits", "tess.fits"]: + raise ValueError("fmt must be 'kepler.fits' or 'tess.fits'") + ts = TimeSeries.read(fits_file, format=fmt) + flux = np.asarray(ts[signal_col], dtype="float64") + timestamps = ts.time.jd + if offset is True: + bjd = dict(kepler=2454833.0, tess=2457000.0)[fmt.split(".")[0]] + timestamps -= bjd # convert to KBJD/TBJD + if remove_nans is True: + not_nan_mask = ~np.isnan(flux) + flux = flux[not_nan_mask] + timestamps = timestamps[not_nan_mask] + return timestamps, flux - @staticmethod - def flux_specs( + def atomic_vector_plotter( + self, signal, - Fs=2, - NFFT=256, - noverlap=128, - mode="psd", - cmap=None, - units=None, - colorbar=False, - save_for_ML=False, - fname=None, - num=None, - **kwargs, + timestamps=None, + label=None, + y_units="PDCSAP Flux (e-/s)", # aperture photometry flux + x_units="Time (BJD)", # Barycentric Julian Date + figsize=(15, 10), + fname="flux_signal.png", + title_pfx="Flux Signal", ): - """generate and save spectographs of flux signal 
frequencies""" - import matplotlib.pyplot as plt + """Plots scatter and line plots of time series signal values. - if cmap is None: - cmap = "binary" - - # PIX: plots only the pixelgrids -ideal for image classification - if save_for_ML is True: - # turn off everything except pixel grid - fig, ax = plt.subplots(figsize=(10, 10), frameon=False) - fig, freqs, t, m = plt.specgram(signal, Fs=Fs, NFFT=NFFT, mode=mode, cmap=cmap) - ax.axis(False) - ax.show() - - if fname is not None: - try: - if num: - path = fname + num - else: - path = fname - plt.savefig(path, **kwargs) - except Exception as e: - print("Something went wrong while saving the img file") - print(e) - - else: - fig, ax = plt.subplots(figsize=(13, 11)) - fig, freqs, t, m = plt.specgram(signal, Fs=Fs, NFFT=NFFT, mode=mode, cmap=cmap) - plt.colorbar() - if units is None: - units = ["Wavelength (λ)", "Frequency (ν)"] - plt.xlabel(units[0]) - plt.ylabel(units[1]) - if num: - title = f"Spectrogram_{num}" - else: - title = "Spectrogram" - plt.title(title) + Parameters + ---------- + signal : np.ndarray or pandas Series + time series signal data + y_units : str, optional + y-axis label, by default "PDCSAP Flux (e-/s)" + x_units : str, optional + x-axis label, by default "Time (BJD)" + """ + cn = self.target_cns.get(label, "") + color = self.color_map.get(label, "black") + title = title_pfx + f": {cn}" if cn != "" else title_pfx + if timestamps is None: + timestamps = list(range(len(signal))) + x_units = "Time Cadence Index" + fig, axs = plt.subplots(nrows=2, ncols=1, figsize=figsize, sharex=True) + axs[0].plot( + timestamps, + signal, + color=color, + ) + axs[0].set_ylabel(y_units) + axs[1].scatter( + timestamps, + signal, + marker=4, + color=color, + ) + axs[1].set_ylabel(y_units) + plt.xlabel(x_units) + plt.suptitle(title) + fig.tight_layout() + if self.save_png: + fpath = str(os.path.join(self.output_dir, fname)) + ".png" + fig.savefig(fpath, dpi=300) + if self.show: plt.show() + else: + plt.close() - return 
fig, freqs, t, m + def signal_phase_folder(self, file_list, fmt="kepler.fits", error=True, snr=True, include_extra=False): + """Generates phase-folded light curves from LLC/LCF flux signals - @staticmethod - def singal_phase_folder(file_list, fmt="kepler.fits", error=False, snr=False): - """plots phase-folded light curve of a signal - returns dataframe of transit timestamps for each light curve - planet_hunter(f=files[9], fmt='kepler.fits') - - args: - - fits_files = takes array of files or single .fits file - - kwargs: - - format : 'kepler.fits' or 'tess.fits' - - error: include SAP flux error (residuals) if available - - snr: apply signal-to-noise-ratio to periodogram autopower calculation + Parameters + ---------- + file_list : list + list of FITS file path(s) containing time series data + flux_col : str, optional + header column name containing the data, by default "pdcsap_flux" + fmt : str, optional + 'kepler.fits' or 'tess.fits', by default "kepler.fits" + error : bool, optional + include SAP flux error (residuals) if available, by default True + snr : bool, optional + apply signal-to-noise-ratio to periodogram autopower calculation, by default True + + Returns + ------- + pd.DataFrame + transit timestamps and phase folded flux values for each light curve """ - from astropy.timeseries import TimeSeries - import numpy as np - from astropy import units as u - from astropy.timeseries import BoxLeastSquares - from astropy.stats import sigma_clipped_stats - from astropy.timeseries import aggregate_downsample - - # read in file + # req_cols = ["obs_id", "campaign", "time_jd", "sap_flux_norm", "time_bin_start", "sap_flux_norm_binned", "period"] transits = {} for index, file in enumerate(file_list): res = {} - if fmt == "kepler.fits": - prefix = file.replace("ktwo", "") - suffix = prefix.replace("_llc.fits", "") - pair = suffix.split("-") - obs_id = pair[0] - campaign = pair[1] - + fname = os.path.basename(file) + (tid, sc) = self.parse_filename(fname, fmt=fmt) ts = 
TimeSeries.read(file, format=fmt) # read in timeseries - # add to meta dict - res["obs_id"] = obs_id - res["campaign"] = campaign - res["lc_start"] = ts.time.jd[0] - res["lc_end"] = ts.time.jd[-1] - + res["tid"] = tid + res["sc"] = sc # use box least squares to estimate period - if error is True: # if error col data available - periodogram = BoxLeastSquares.from_timeseries(ts, "sap_flux", "sap_flux_err") + if error is True and f"{self.flux_col}_err" in ts.columns: + periodogram = BoxLeastSquares.from_timeseries(ts, self.flux_col, f"{self.flux_col}_err") else: - periodogram = BoxLeastSquares.from_timeseries(ts, "sap_flux") + periodogram = BoxLeastSquares.from_timeseries(ts, self.flux_col) if snr is True: results = periodogram.autopower(0.2 * u.day, objective="snr") else: results = periodogram.autopower(0.2 * u.day) - maxpower = np.argmax(results.power) period = results.period[maxpower] + res["period"] = period transit_time = results.transit_time[maxpower] + # fold the time series using the period + ts_folded = ts.fold(period=period, epoch_time=transit_time) + res["time_jd"] = ts_folded.time.jd + # normalize the flux by sigma-clipping the data to determine the baseline flux: + mean, median, stddev = sigma_clipped_stats(ts_folded[self.flux_col]) + ts_folded["flux_norm"] = ts_folded[self.flux_col] / median + res["flux_norm"] = ts_folded["flux_norm"] + # downsample the time series by binning the points into bins of equal time + ts_binned = aggregate_downsample(ts_folded, time_bin_size=0.03 * u.day) + res["time_bin_start"] = ts_binned.time_bin_start.jd + res["flux_norm_binned"] = ts_binned["flux_norm"] + if include_extra: + res["lc_start"] = ts.time.jd[0] + res["lc_end"] = ts.time.jd[-1] + res["transit"] = transit_time + res["maxpower"] = maxpower + res["mean"] = mean + res["median"] = median + res["stddev"] = stddev + res["fname"] = fname + transits[index] = res + df = pd.DataFrame.from_dict(transits, orient="index") + return df - res["maxpower"] = maxpower - 
res["period"] = period - res["transit"] = transit_time + def plot_phase_signals(self, ts, title_pfx="Phase-folded Light Curve: ", figsize=(11, 5)): + """Plots a phase-folded light curve from timeseries flux signal data. Requires a dataframe row containing the following columns: + "time_jd", "flux_norm", "time_bin_start", "flux_norm_binned", "tid", "sc", "period" + e.g., + df = SignalPlots.signal_phase_folder(file_list) + ts = df.iloc[index] + signal_plots.plot_phase_signals(ts) - # res['ts'] = ts + Parameters + ---------- + ts : ArrayLike + timeseries flux signal data + title_pfx : str, optional + Plot title prefix, by default "Phase-folded Light Curve: " + figsize : tuple, optional + figure size, by default (11,5) + """ + fig = plt.figure(figsize=figsize) + ax = fig.gca() + ax.plot(ts["time_jd"], ts["flux_norm"], "k.", markersize=1) + ax.plot( + ts["time_bin_start"], + ts["flux_norm_binned"], + "r-", + drawstyle="steps-post", + ) + ax.set_xlabel("Time (days)") + ax.set_ylabel("Normalized flux") + ax.set_title(title_pfx + ts["tid"]) + ax.legend([np.round(ts["period"], 3)]) + if self.save_png: + fpath = os.path.join(self.output_dir, f"{ts['sc']}-{ts['tid']}_phase_folded.png") + fig.savefig(fpath, dpi=300) + if self.show: + plt.show() + else: + plt.close() - # fold the time series using the period - ts_folded = ts.fold(period=period, epoch_time=transit_time) + def set_spec_kwargs(self, Fs=2, NFFT=256, noverlap=128, mode="psd", cmap="binary"): + """returns dict of default spectrogram kwargs + + Returns + ------- + dict + default spectrogram kwargs + """ + spec_kwargs = { + "Fs": Fs, + "NFFT": NFFT, + "noverlap": noverlap, + "mode": mode, + "cmap": cmap, + } + return spec_kwargs - # folded time series plot - # plt.plot(ts_folded.time.jd, ts_folded['sap_flux'], 'k.', markersize=1) - # plt.xlabel('Time (days)') - # plt.ylabel('SAP Flux (e-/s)') + def flux_specs( + self, + signal, + units=["Wavelength (λ)", "Frequency (ν)"], + colorbar=True, + save_for_ml=False, + 
fname="specgram", + title="Spectrogram", + **kwargs, + ): + """generate and save spectrograms of flux signal frequencies. By default uses kwargs in `set_spec_kwargs` method. - # normalize the flux by sigma-clipping the data to determine the baseline flux: - mean, median, stddev = sigma_clipped_stats(ts_folded["sap_flux"]) - ts_folded["sap_flux_norm"] = ts_folded["sap_flux"] / median - res["mean"] = mean - res["median"] = median - res["stddev"] = stddev - res["sap_flux_norm"] = ts_folded["sap_flux_norm"] + Parameters + ---------- + signal : ArrayLike + 1D array-like signal data + units : list of strings, optional + x and y units respectively, by default ["Wavelength (λ)", "Frequency (ν)"] + colorbar : bool, optional + include colorbar in plot, by default True + save_for_ml : bool, optional + plots pixel grid only (no axes, colorbar or labels), by default False + fname : str, optional + filename without extension for saving png, by default 'specgram' + title : str, optional + plot title, by default "Spectrogram" + **kwargs : dict + spectrogram keyword arguments accepted by `set_spec_kwargs` (Fs, NFFT, noverlap, mode, cmap) - # downsample the time series by binning the points into bins of equal time - ts_binned = aggregate_downsample(ts_folded, time_bin_size=0.03 * u.day) + Returns + ------- + tuple + periodogram, freqs, t, m - see matplotlib.pyplot.specgram + """ + fpath = os.path.join(self.output_dir, fname) + spec_kwargs = self.set_spec_kwargs(**kwargs) + if save_for_ml is True: + fig, ax = plt.subplots(figsize=(10, 10), frameon=False) + ax.axis(False) + else: + fig, ax = plt.subplots(figsize=(13, 11)) + if colorbar: + plt.colorbar() + units = ["Wavelength (λ)", "Frequency (ν)"] if units is None or len(units) < 2 else units + plt.xlabel(units[0]) + plt.ylabel(units[1]) + plt.title(title) - # final result - fig = plt.figure(figsize=(11, 5)) - ax = fig.gca() - ax.plot(ts_folded.time.jd, ts_folded["sap_flux_norm"], "k.", markersize=1) - ax.plot( - ts_binned.time_bin_start.jd, - ts_binned["sap_flux_norm"], - "r-", 
- drawstyle="steps-post", - ) - ax.set_xlabel("Time (days)") - ax.set_ylabel("Normalized flux") - ax.set_title(obs_id) - ax.legend([np.round(period, 3)]) + fig, freqs, t, m = plt.specgram( + signal, + **spec_kwargs, + ) + if self.save_png: + plt.savefig(fpath, dpi=300) + if self.show: + plt.show() + else: plt.close() + return fig, freqs, t, m - res["fig"] = fig - transits[index] = res +class K2SignalPlots(SignalPlots): + """Class for plotting K2 time series signals and their spectrograms.""" - df = pd.DataFrame.from_dict(transits, orient="index") + def __init__( + self, + flux_col="pdcsap_flux", + show=False, + save_png=True, + target_cns={1: "No Planet", 2: "Planet"}, + color_map={1: "red", 2: "blue"}, + **log_kws, + ): + """Initializes K2SignalPlots with default class labels and plot colors. + Parameters + ---------- + show : bool, optional + display plot, by default False + save_png : bool, optional + save plot as PNG file, by default True + target_cns : dict, optional + target label and string keypairs, by default {1: "No Planet", 2: "Planet"} + color_map : dict, optional + target label and color keypairs, by default {1: "red", 2: "blue"} + """ + super().__init__( + show=show, + save_png=save_png, + flux_col=flux_col, + target_cns=target_cns, + color_map=color_map, + name="K2SignalPlots", + **log_kws, + ) + self.df = None + self.files = [] + + def generate_dataframe(self): + """Generates dataframe of K2 light curve signal properties from list of FITS files""" + if len(self.files) == 0: + raise ValueError("No files provided. Set `self.files` to a list of K2 FITS light curve file paths.") + self.df = self.signal_phase_folder(self.files, fmt="kepler.fits", error=True, snr=True, include_extra=True) + + def generate_raw_flux_df(self, flux_col="SAP_FLUX", add_label=None, ffillna=True): + """Generates dataframe of raw flux signals from list of K2 FITS files""" + if len(self.files) == 0: + raise ValueError("No files provided. 
Set `self.files` to a list of K2 FITS light curve file paths.") + records = {} + for index, file in enumerate(self.files): + with fits.open(file) as hdulist: + signal = hdulist[1].data[flux_col] + records[index] = np.asarray(signal, dtype="float64") + df = pd.DataFrame.from_dict(records, orient="index") + if ffillna is True: + df.ffill(axis=1, inplace=True) + df.columns = ["FLUX." + str(c + 1) for c in df.columns] + if isinstance(add_label, int): + cols = list(df.columns) + df["LABEL"] = add_label + df = df[["LABEL"] + cols] return df + def generate_specs(self, ml_ready=False, rgb=True): + """Generates spectrograms for each light curve signal in dataframe""" + if self.df is None: + self.generate_dataframe(self.files) + if rgb is True: + kwargs = self.set_spec_kwargs(cmap="plasma") + for _, row in self.df.iterrows(): + fname = row["fname"].replace(".fits", "_specgram") + _, flux = self.read_ts_signal(row["fname"], fmt="kepler.fits", offset=True, remove_nans=True) + self.flux_specs( + flux, + save_for_ml=ml_ready, + fname=fname, + title=f"Spectrogram: {row['sc']}-{row['tid']}", + **kwargs, + ) + + def generate_phase_signal_plots(self): + """Generates phase-folded light curve plots for each signal in dataframe""" + if self.df is None: + self.generate_dataframe(self.files) + for i in list(range(len(self.df))): + ts = df.iloc[i] + self.plot_phase_signals(ts, title_pfx="K2 Phase-folded Light Curve: ", figsize=(11, 5)) + + def generate_flux_signal_plots(self): + """Generates atomic vector plots for each signal in dataframe""" + if self.df is None: + self.generate_dataframe(self.files) + for _, row in self.df.iterrows(): + fname = row["fname"].replace(".fits", "_flux_signal") + timestamps, flux = self.read_ts_signal(row["fname"], fmt="kepler.fits", offset=True, remove_nans=True) + self.atomic_vector_plotter( + flux, + timestamps=timestamps, + y_units="PDCSAP Flux (e-/s)", + x_units="Time (BJD)", + figsize=(15, 10), + fname=fname, + title_pfx=f"K2 Flux Signal: 
{row['sc']}-{row['tid']}", + ) + # testing if __name__ == "__main__": diff --git a/spacekit/dashboard/cal/index.py b/spacekit/dashboard/cal/index.py index e6614e26..ca757efd 100644 --- a/spacekit/dashboard/cal/index.py +++ b/spacekit/dashboard/cal/index.py @@ -304,6 +304,6 @@ def displayMouseEdgeData(data): args = parser.parse_args() host, port = args.host, args.port if args.env == "dev": - app.run_server(host=host, port=port, debug=True, dev_tools_prune_errors=False) + app.run(host=host, port=port, debug=True, dev_tools_prune_errors=False) else: - app.run_server(host="0.0.0.0", port=8050) + app.run(host="0.0.0.0", port=8050) diff --git a/spacekit/dashboard/cal/nodegraph.py b/spacekit/dashboard/cal/nodegraph.py index 0fcb0b95..bfb49c47 100644 --- a/spacekit/dashboard/cal/nodegraph.py +++ b/spacekit/dashboard/cal/nodegraph.py @@ -260,7 +260,7 @@ def get_coords(xy_origin, layer_idx): y0 = xy_origin[1] if layer_idx == 0: - neurons = clf.layers[layer_idx].output_shape[0][1] + neurons = clf.layers[layer_idx].output.shape[1] else: neurons = clf.layers[layer_idx].units slope = int(3200 / neurons) diff --git a/spacekit/dashboard/svm/config.py b/spacekit/dashboard/svm/config.py index 2627d4d1..e9efc3e2 100644 --- a/spacekit/dashboard/svm/config.py +++ b/spacekit/dashboard/svm/config.py @@ -1,10 +1,10 @@ from spacekit.builder.architect import Builder from spacekit.analyzer.explore import HstSvmPlots -from spacekit.analyzer.scan import SvmScanner +from spacekit.analyzer.scan import HstSvmScanner from spacekit.extractor.load import ImageIO # Find available datasets and load most recent (default) -svm = SvmScanner(perimeter="data/20??-*-*-*", primary=-1) +svm = HstSvmScanner(perimeter="data/20??-*-*-*", primary=-1) svm.scan_results() # Initialize EVAL svm.make_barplots() diff --git a/spacekit/dashboard/svm/eda.py b/spacekit/dashboard/svm/eda.py index bafe23e6..249dd8bf 100644 --- a/spacekit/dashboard/svm/eda.py +++ b/spacekit/dashboard/svm/eda.py @@ -134,23 +134,23 @@ html.Div( 
children=[ dcc.Graph( - id="hrc-scatter", + id="acs-hrc-scatter", style={"display": "inline-block", "float": "center"}, ), dcc.Graph( - id="ir-scatter", + id="wfc3-ir-scatter", style={"display": "inline-block", "float": "center"}, ), dcc.Graph( - id="sbc-scatter", + id="acs-sbc-scatter", style={"display": "inline-block", "float": "center"}, ), dcc.Graph( - id="uvis-scatter", + id="wfc3-uvis-scatter", style={"display": "inline-block", "float": "center"}, ), dcc.Graph( - id="wfc-scatter", + id="acs-wfc-scatter", style={"display": "inline-block", "float": "center"}, ), ], diff --git a/spacekit/dashboard/svm/index.py b/spacekit/dashboard/svm/index.py index 4ae6d4d8..ca948e3b 100644 --- a/spacekit/dashboard/svm/index.py +++ b/spacekit/dashboard/svm/index.py @@ -2,7 +2,7 @@ from dash.dependencies import Input, Output, State from dash.exceptions import PreventUpdate from argparse import ArgumentParser -from app import app +from spacekit.dashboard.svm.app import app from spacekit.dashboard.svm import eda, eval, pred from spacekit.dashboard.svm.config import svm, hst, images # NN import numpy as np @@ -100,11 +100,11 @@ def update_cmx(cmx_type): # SCATTER CALLBACK @app.callback( [ - Output("hrc-scatter", "figure"), - Output("ir-scatter", "figure"), - Output("sbc-scatter", "figure"), - Output("uvis-scatter", "figure"), - Output("wfc-scatter", "figure"), + Output("acs-hrc-scatter", "figure"), + Output("wfc3-ir-scatter", "figure"), + Output("acs-sbc-scatter", "figure"), + Output("wfc3-uvis-scatter", "figure"), + Output("acs-wfc-scatter", "figure"), ], [Input("selected-scatter", "value")], ) @@ -225,6 +225,6 @@ def preview_augmented_image(n_clicks, selected_image): args = parser.parse_args() host, port = args.host, args.port if args.env == "dev": - app.run_server(host=host, port=port, debug=True, dev_tools_prune_errors=False) + app.run(host=host, port=port, debug=True, dev_tools_prune_errors=False) else: - app.run_server(host="0.0.0.0", port=8050) + app.run(host="0.0.0.0", 
port=8050) diff --git a/spacekit/datasets/beam.py b/spacekit/datasets/beam.py index ee4a1b00..85c5154d 100644 --- a/spacekit/datasets/beam.py +++ b/spacekit/datasets/beam.py @@ -100,7 +100,7 @@ def download(scrape="file:data", datasets="2022-02-14,2021-11-04,2021-10-28", de help="Uses a key:uri format where options for the key are limited to web, s3, or file. \ The uri could be your own custom location if not using the default datasets. \ Examples are web:calcloud, web:custom.json, s3:mybucket, file:myfolder. \ - Visit spacekit.readthedocs.io for more info.", + Visit www.spacekit.org for more info.", ) parser.add_argument( "-d", diff --git a/spacekit/extractor/radio.py b/spacekit/extractor/radio.py index c6bfb2b9..46863c03 100644 --- a/spacekit/extractor/radio.py +++ b/spacekit/extractor/radio.py @@ -2,7 +2,6 @@ import shutil import glob import re -import boto3 import numpy as np import pandas as pd from spacekit.logger.log import Logger @@ -12,6 +11,11 @@ except ImportError: Observations = None +try: + import boto3 +except ImportError: + boto3 = None + try: from progressbar import ProgressBar except ImportError: @@ -22,6 +26,10 @@ def check_astroquery(): return Observations is not None +def check_boto3(): + return boto3 is not None + + def check_progressbar(): return ProgressBar is not None @@ -54,9 +62,9 @@ def __init__(self, config="disable", name="Radio", **log_kws): self.log = Logger(self.__name__, **log_kws).spacekit_logger() self.config = config self.region = "us-east-1" - self.s3 = boto3.resource("s3", region_name=self.region) - self.bucket = self.s3.Bucket("stpubdata") - self.location = {"LocationConstraint": self.region} + self.s3 = None + self.bucket = None + self.location = None self.target_list = None self.proposal_id = None # '13926' self.collection = None # "K2" "HST" "HLA" "JWST" @@ -73,10 +81,9 @@ def __init__(self, config="disable", name="Radio", **log_kws): raise ImportError( "You must install astroquery (`pip install astroquery`) " "and 
progressbar (`pip install progressbar`) for the " - "radio module to work. \n\nInstall extra deps via " + "radio module to work. \n\nInstall all extra deps via " "`pip install spacekit[x]`" ) - self.configure_aws() def configure_aws(self): @@ -84,8 +91,19 @@ def configure_aws(self): # configure aws settings if self.config == "enable": self.log.info("Configuring for AWS cloud data retrieval...") + if not check_boto3(): + self.log.error("boto3 not installed.") + raise ImportError( + "You must install boto3 (`pip install boto3`) for " + "AWS S3 access via the radio module.\n\nInstall all " + "extra deps via `pip install spacekit[x]`" + ) + self.s3 = boto3.resource("s3", region_name=self.region) + self.bucket = self.s3.Bucket("stpubdata") + self.location = {"LocationConstraint": self.region} Observations.enable_cloud_dataset(provider="AWS", profile="default") - elif self.config == "disable": + else: + self.log.info("Configuring for MAST data retrieval only...") Observations.disable_cloud_dataset() def set_query_params(self, **kwargs): @@ -187,7 +205,6 @@ class object with attributes updated self.log.error(f"Could not resolve {target} to a sky position.") self.errors.append(target) continue - return self def s3_download(self): """Download datasets in list of uris from AWS s3 bucket (public access via STScI) @@ -210,7 +227,6 @@ class object with attributes updated except FileExistsError: continue self.log.info(f"Download Complete: {count} files") - return self def mast_download(self): """Download datasets from MAST""" diff --git a/spacekit/preprocessor/scrub.py b/spacekit/preprocessor/scrub.py index 5a5bb54f..6d45075a 100644 --- a/spacekit/preprocessor/scrub.py +++ b/spacekit/preprocessor/scrub.py @@ -1109,6 +1109,9 @@ def continuous_nandler(self): if self.verbose: print(f"\nNaNs to be NaNdled:\n{self.df[cols].isna().sum()}\n") for n in cols: + if self.df[n].dtype in ['str', 'O']: # pandas>=3.0 uses 'str' for object types + vals = pd.to_numeric(self.df[n], 
errors='coerce') + self.df[n] = vals self.df.loc[self.df[n].isna(), n] = 0.0 def discrete_nandler(self, nanval=0.0): diff --git a/spacekit/preprocessor/transform.py b/spacekit/preprocessor/transform.py index 13c26699..d9fc52df 100644 --- a/spacekit/preprocessor/transform.py +++ b/spacekit/preprocessor/transform.py @@ -888,16 +888,29 @@ def tensors_to_arrays(X_train, y_train, X_test, y_test): return X_train, y_train, X_test, y_test -def hypersonic_pliers(path_to_train, path_to_test, y_col=[0], skip=1, dlm=",", encoding="bytes", subtract_y=0.0, reshape=False): - """Extracts data into 1-dimensional arrays, using separate target classes (y) for training and test data. Assumes y (target) - is first column in dataframe. If the target (y) classes in the raw data are 0 and 2, but you'd like them to be binaries (0 - and 1), set subtract_y=1.0 +def ffill_array(arr): + """ + Forward-fills NaN values in a 1D NumPy array using pure NumPy operations. + """ + mask = np.isnan(arr) + # Get indices of non-NaN values + idx = np.where(~mask, np.arange(len(mask)), 0) + # Forward fill indices + np.maximum.accumulate(idx, out=idx) + # Use the indices to select the corresponding non-NaN values + return arr[idx] + + +def hypersonic_pliers( + path_to_train, path_to_test=None, y_col=[0], skip=1, dlm=",", encoding="bytes", subtract_y=0.0, reshape=False +): + """Extracts data into 1-dimensional arrays, using separate target classes (y) for training and test data. Assumes y (target) is first column in dataframe. 
If the target (y) classes in the raw data are 1 and 2, but you'd like them to be binaries (0 and 1), set subtract_y=1.0 Parameters ---------- path_to_train : string path to training data file (csv) - path_to_test : string + path_to_test : string, optional path to test data file (csv) y_col : list, optional axis index of target class, by default [0] @@ -920,20 +933,23 @@ def hypersonic_pliers(path_to_train, path_to_test, y_col=[0], skip=1, dlm=",", e xcols = [c for c in cols if c not in y_col] X_train = Train[:, xcols] y_train = Train[:, y_col, np.newaxis] - subtract_y + if reshape is True: + y_train = y_train.reshape(y_train.shape[0], 1) + del Train + print("X_train: ", X_train.shape) + print("y_train: ", y_train.shape) + + if path_to_test is None: + return X_train, y_train Test = np.loadtxt(path_to_test, skiprows=skip, delimiter=dlm, encoding=encoding) X_test = Test[:, xcols] y_test = Test[:, y_col, np.newaxis] - subtract_y if reshape is True: - y_train = y_train.reshape(y_train.shape[0], 1) y_test = y_test.reshape(y_test.shape[0], 1) - - del Train, Test - print("X_train: ", X_train.shape) - print("y_train: ", y_train.shape) + del Test print("X_test: ", X_test.shape) print("y_test: ", y_test.shape) - return X_train, X_test, y_train, y_test @@ -969,7 +985,7 @@ def thermo_fusion_chisel(matrix1, matrix2=None): return matrix1 -def babel_fish_dispenser(matrix1, matrix2=None, step_size=None, axis=2): +def babel_fish_dispenser(matrix1, matrix2=None, step_size=200, axis=2): """Adds an input corresponding to the running average over a set number of time steps. This helps the neural network to ignore high frequency noise by passing in a uniform 1-D filter and stacking the arrays. 
@@ -989,9 +1005,6 @@ def babel_fish_dispenser(matrix1, matrix2=None, step_size=None, axis=2): numpy array(s) 2D array (original input array with a uniform 1d-filter as noise) """ - if step_size is None: - step_size = 200 - # calc input for flux signal rolling avgs filter1 = uniform_filter1d(matrix1, axis=1, size=step_size) # store in array and stack on 2nd axis for each obs of X data @@ -1008,8 +1021,7 @@ def babel_fish_dispenser(matrix1, matrix2=None, step_size=None, axis=2): def fast_fourier(matrix, bins): - """Takes an array (e.g. signal input values) and rotates number of ``bins`` to the left as a fast Fourier transform. Returns - vector of length equal to ``matrix`` input array. + """Takes an array (e.g. signal input values) and rotates number of ``bins`` to the left as a fast Fourier transform. Returns vector of length equal to ``matrix`` input array. Parameters ---------- diff --git a/spacekit/skopes/kepler/light_curves.py b/spacekit/skopes/kepler/light_curves.py index 81dcbea4..f3c60d30 100644 --- a/spacekit/skopes/kepler/light_curves.py +++ b/spacekit/skopes/kepler/light_curves.py @@ -44,17 +44,19 @@ def split_data(self): train = fpath else: test = fpath - self.X_train, self.X_test, self.y_train, self.y_test = hypersonic_pliers(train, test, subtract_y=1.0, reshape=True) + self.X_train, self.X_test, self.y_train, self.y_test = hypersonic_pliers( + train, path_to_test=test, subtract_y=1.0, reshape=True + ) print("Data split successful") def scale_data(self): print("Scaling data to Zero Mean and Unit Variance...") - self.X_train, self.X_test = thermo_fusion_chisel(self.X_train, self.X_test) + self.X_train, self.X_test = thermo_fusion_chisel(self.X_train, matrix2=self.X_test) print("Data scaling successful.") def add_filter(self): print("Adding noise filter...") - self.X_train, self.X_test = babel_fish_dispenser(self.X_train, self.X_test) + self.X_train, self.X_test = babel_fish_dispenser(self.X_train, matrix2=self.X_test) print("Noise filter added 
successfully.") def deploy(self): diff --git a/tests/preprocessor/test_encode.py b/tests/preprocessor/test_encode.py index 95aaa3f6..a10b9f1e 100644 --- a/tests/preprocessor/test_encode.py +++ b/tests/preprocessor/test_encode.py @@ -34,13 +34,10 @@ def test_svm_encoder(scraped_mast_file): enc.encode_features() assert enc.df.shape == (1, 18) for col in ENCODED_COL_EXPECTED: - if col in list(enc.df.columns): - assert True - else: - assert False - assert enc.df.cat[0] == 3 - assert enc.df.det[0] == 1 - assert enc.df.wcs[0] == 0 + assert col in list(enc.df.columns) + assert enc.df.iloc[0]['cat'] == 3 + assert enc.df.iloc[0]['det'] == 1 + assert enc.df.iloc[0]['wcs'] == 0 @mark.svm diff --git a/tests/pytest.ini b/tests/pytest.ini index ff24e993..6143b567 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -3,25 +3,25 @@ python_files = test_* python_classes = *Tests python_functions = test_* markers = - jwst - hst + analyzer + architect + builder cal - svm: marks tests as svm (deselect with '-m "not svm"') - prep - predict - train - generator + compute draw - preprocessor - scrub - transform encode + explore extractor - scrape + generator + hst + jwst + predict + prep + preprocessor radio - analyzer scan - explore - compute - architect - builder \ No newline at end of file + scrape + scrub + svm: marks tests as svm (deselect with '-m "not svm"') + train + transform \ No newline at end of file diff --git a/tox.ini b/tox.ini index c229d522..fb8cd72a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,7 @@ [tox] isolated_build = True envlist = - py3{10,11,12} + py3{10,11,12,13} check-{style,security} pep517 #test{,-oldestdeps,-devdeps} @@ -113,25 +113,25 @@ python_classes = *Tests python_functions = test_* testpaths = tests markers = - jwst - hst + analyzer + architect + builder cal - svm: marks tests as svm (deselect with '-m "not svm"') - predict - train - prep - generator + compute draw - preprocessor - scrub encode - transform + explore extractor - scrape + generator + 
hst + jwst + predict + prep + preprocessor radio - analyzer scan - explore - compute - architect - builder + scrape + scrub + svm: marks tests as svm (deselect with '-m "not svm"') + train + transform \ No newline at end of file