From 1c793fc383b88573f832b073d9484dff6aae606f Mon Sep 17 00:00:00 2001 From: Leo Wilkinson Date: Fri, 13 Mar 2026 13:32:45 +0000 Subject: [PATCH 1/5] Add VastDB-native RNA maps script New standalone script that accepts VastDB EVENT ID lists with pre-assigned categories (enhanced, silenced, control, constitutive) instead of rMATS output, parsing coordinates directly from a VastDB EVENT_INFO annotation file. Co-Authored-By: Claude Sonnet 4.6 --- rna_maps_vastdb_only.py | 785 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 785 insertions(+) create mode 100644 rna_maps_vastdb_only.py diff --git a/rna_maps_vastdb_only.py b/rna_maps_vastdb_only.py new file mode 100644 index 0000000..aead0d0 --- /dev/null +++ b/rna_maps_vastdb_only.py @@ -0,0 +1,785 @@ +#!/usr/bin/env python3 +""" +RNA Maps from VastDB ID Lists + +Generates RNA maps showing RBP binding patterns around categorized exons. +Uses VastDB EVENT IDs with pre-assigned categories (enhanced, silenced, control, constitutive). + +Key features: +- 6 splice site regions (upstream_3ss, upstream_5ss, middle_3ss, middle_5ss, downstream_3ss, downstream_5ss) +- Coverage plots with statistical enrichment +- Heatmaps +- No rMATS dependency +""" + +import matplotlib +import matplotlib.ticker as mticker +from matplotlib.colors import LinearSegmentedColormap +from matplotlib.gridspec import GridSpec +from matplotlib import colormaps +matplotlib.use('Agg') +matplotlib.rcParams['pdf.fonttype'] = 42 +matplotlib.rcParams['ps.fonttype'] = 42 + +from scipy.ndimage import gaussian_filter1d +from matplotlib.lines import Line2D +import pandas as pd +import numpy as np +import pybedtools as pbt +import seaborn as sns +import matplotlib.pyplot as plt +import scipy.stats as stats +import sys +import os +import argparse +import random +import string +import logging +import datetime +import time +import re + +# =============================================================================== +# CONFIGURATION +# 
=============================================================================== + +sns.set_style("whitegrid", {'legend.frameon':True}) +colors_dict = { + 'all': '#D3D3D3', + 'ctrl': '#408F76', + 'enh': '#F30C08', + 'sil': '#005299', + 'enhrest': '#FFB122', + 'silrest': '#6DC2F5', + 'const': '#666666' +} +linewidth = 3 + +# =============================================================================== +# UTILITY FUNCTIONS +# =============================================================================== + +def setup_logging(output_path): + """Sets up logging to file and console.""" + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + log_filename = f"execution_{timestamp}.log" + log_path = os.path.join(output_path, log_filename) + + logger = logging.getLogger() + logger.handlers.clear() + logger.setLevel(logging.INFO) + + file_handler = logging.FileHandler(log_path, mode='w') + file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) + + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) + + logger.addHandler(file_handler) + logger.addHandler(stream_handler) + + file_handler.flush = file_handler.stream.flush + + start_time = time.time() + + logging.getLogger('matplotlib').setLevel(logging.WARNING) + logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) + logging.getLogger('matplotlib.backends').setLevel(logging.ERROR) + logging.getLogger('fontTools').setLevel(logging.ERROR) + logging.getLogger('fontTools.subset').setLevel(logging.ERROR) + + logger.info(f"Starting script execution at {timestamp}") + + return log_filename, start_time, logger + + +def log_runtime(start_time, logger): + """Logs the script execution time.""" + end_time = time.time() + total_seconds = int(end_time - start_time) + minutes, seconds = divmod(total_seconds, 60) + runtime_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s" + 
logger.info(f"Script completed in {runtime_str}.") + for handler in logger.handlers: + handler.flush() + + +def smooth_coverage(df, window_size=10, std=2): + """Smooth coverage data using rolling Gaussian window.""" + result = df.copy() + groups = [] + + for (exon_id, label), group_df in result.groupby(['exon_id', 'label']): + if len(group_df) < 3: + groups.append(group_df) + continue + + group_sorted = group_df.sort_values('position') + + if len(group_sorted) >= window_size: + values = group_sorted['coverage'].values + s = pd.Series(values) + + smoothed = s.rolling( + window=window_size, + center=True, + win_type='gaussian' + ).mean(std=std) + + smoothed = smoothed.fillna(s) + group_sorted['coverage'] = smoothed.values + + groups.append(group_sorted) + + return pd.concat(groups) + + +# =============================================================================== +# VASTDB PARSING FUNCTIONS +# =============================================================================== + +def parse_event_info_coordinates(event_info_df): + """ + Parse genomic coordinates from EVENT_INFO file. + Returns DataFrame with coordinates and flanking exon info. 
+ """ + coords_list = [] + + for idx, row in event_info_df.iterrows(): + # Parse main exon coordinates from COORD_o + coord_str = str(row['COORD_o']) + match = re.match(r'(chr[\w]+):(\d+)-(\d+)', coord_str) + if not match: + continue + + # Extract strand from REF_CO + ref_co = str(row.get('REF_CO', '')) + strand_match = re.search(r':([+-])$', ref_co) + strand = strand_match.group(1) if strand_match else '+' + + # Parse flanking exon coordinates from CO_C1 and CO_C2 + # CO_C1 format: "chr:start-end" (upstream constitutive exon) + # CO_C2 format: "chr:start-end" (downstream constitutive exon) + upstream_start, upstream_end = np.nan, np.nan + downstream_start, downstream_end = np.nan, np.nan + + if pd.notna(row.get('CO_C1', np.nan)): + co_c1_match = re.match(r'chr[\w]+:(\d+)-(\d+)', str(row['CO_C1'])) + if co_c1_match: + upstream_start = int(co_c1_match.group(1)) + upstream_end = int(co_c1_match.group(2)) + + if pd.notna(row.get('CO_C2', np.nan)): + co_c2_match = re.match(r'chr[\w]+:(\d+)-(\d+)', str(row['CO_C2'])) + if co_c2_match: + downstream_start = int(co_c2_match.group(1)) + downstream_end = int(co_c2_match.group(2)) + + coords_list.append({ + 'EVENT': row['EVENT'], + 'GENE': row['GENE'], + 'chr': match.group(1), + 'exonStart_1based': int(match.group(2)), + 'exonEnd': int(match.group(3)), + 'strand': strand, + 'upstreamES': upstream_start, + 'upstreamEE': upstream_end, + 'downstreamES': downstream_start, + 'downstreamEE': downstream_end, + }) + + coords_df = pd.DataFrame(coords_list) + + if len(coords_df) > 0: + logging.info(f"Parsed coordinates for {len(coords_df)} events") + + return coords_df + + +def load_vastdb_categories(enhanced_file, silenced_file, control_file, + constitutive_file, event_info_file): + """ + Load VastDB ID lists and look up coordinates. + Returns DataFrame ready for RNA map generation. 
+ """ + + logging.info("="*60) + logging.info("LOADING VASTDB ID LISTS") + logging.info("="*60) + + # Read ID lists + def read_id_list(filepath, category): + if filepath is None: + return [] + with open(filepath) as f: + ids = [line.strip() for line in f + if line.strip() and not line.startswith('#')] + logging.info(f"Loaded {len(ids)} {category} IDs") + return ids + + enhanced_ids = read_id_list(enhanced_file, 'enhanced') + silenced_ids = read_id_list(silenced_file, 'silenced') + control_ids = read_id_list(control_file, 'control') + constitutive_ids = read_id_list(constitutive_file, 'constitutive') + + all_ids = enhanced_ids + silenced_ids + control_ids + constitutive_ids + + if len(all_ids) == 0: + raise ValueError("No EVENT IDs provided in any file!") + + logging.info(f"Total EVENT IDs: {len(all_ids)}") + + # Create ID to category mapping + id_to_category = {} + for eid in enhanced_ids: + id_to_category[eid] = 'enhanced' + for eid in silenced_ids: + id_to_category[eid] = 'silenced' + for eid in control_ids: + id_to_category[eid] = 'control' + for eid in constitutive_ids: + id_to_category[eid] = 'constituitive' # Match original script spelling + + # Load EVENT_INFO + logging.info(f"Loading EVENT_INFO: {event_info_file}") + event_info = pd.read_csv(event_info_file, sep='\t') + logging.info(f"EVENT_INFO contains {len(event_info)} events") + + # Parse coordinates + event_coords = parse_event_info_coordinates(event_info) + + # Filter to our IDs + event_coords_subset = event_coords[event_coords['EVENT'].isin(all_ids)].copy() + + n_matched = len(event_coords_subset) + n_total = len(all_ids) + logging.info(f"Matched {n_matched}/{n_total} IDs to coordinates ({100*n_matched/n_total:.1f}%)") + + if n_matched == 0: + raise ValueError("No EVENT IDs matched to coordinates!") + + # Adjust coordinates from 1-based to 0-based + event_coords_subset['exonStart_0base'] = event_coords_subset['exonStart_1based'] - 1 + + # Also adjust flanking exon coordinates from 1-based to 0-based 
+ event_coords_subset['upstreamES'] = event_coords_subset['upstreamES'] - 1 + event_coords_subset['downstreamES'] = event_coords_subset['downstreamES'] - 1 + + # Assign categories + event_coords_subset['category'] = event_coords_subset['EVENT'].map(id_to_category) + + # Add fake FDR column (not used but needed for compatibility) + event_coords_subset['FDR'] = 0.001 + + # Create exon_id for tracking + event_coords_subset['exon_id'] = ( + event_coords_subset['category'] + "_" + + event_coords_subset['chr'].astype(str) + ":" + + event_coords_subset['exonStart_0base'].astype(str) + "-" + + event_coords_subset['exonEnd'].astype(str) + ";" + + event_coords_subset['strand'].astype(str) + ) + + logging.info("\nCategory distribution:") + logging.info(event_coords_subset['category'].value_counts()) + + return event_coords_subset + + +# =============================================================================== +# RNA MAP GENERATION FUNCTIONS (from original script) +# =============================================================================== + +def get_ss_bed(df, pos_col, neg_col): + """Create BED file for splice sites (handles strand orientation).""" + # Strip "chr" prefix to match CLIP file format + df = df.copy() + + ss_pos = df.loc[df['strand'] == "+", ['chr', pos_col, pos_col, 'exon_id', 'FDR', 'strand']] + ss_pos.columns = ['chr', 'start', 'end', 'name', 'score', 'strand'] + ss_pos.start = ss_pos.start.transform(lambda x: x-1) + + ss_n = df.loc[df['strand'] == "-", ['chr', neg_col, neg_col, 'exon_id', 'FDR', 'strand']] + ss_n.columns = ['chr', 'start', 'end', 'name', 'score', 'strand'] + ss_n.end = ss_n.end.transform(lambda x: x+1) + + ss = pd.concat([ss_pos, ss_n]) + + return ss + + +def get_coverage_plot(xl_bed, df, fai, window, exon_categories, label, smoothing=15): + """Calculate coverage of CLIP around splice sites with statistical enrichment.""" + df = df.loc[df.name != "."] + xl_bed = pbt.BedTool(xl_bed).sort() + pbt_df = pbt.BedTool.from_dataframe( + 
df[['chr', 'start', 'end', 'name', 'score', 'strand']] + ).sort().slop(l=window, r=window, s=True, g=fai) + + df_coverage = pbt_df.coverage( + b=xl_bed, **{'sorted': True, 's': True, 'd': True, 'nonamecheck': True} + ).to_dataframe()[['thickStart', 'thickEnd', 'strand', 'name']] + + df_coverage.rename(columns=({'thickStart':'position','thickEnd':'coverage'}), inplace=True) + + df_plot = df_coverage + + # Adjust positions based on strand + df_plot.loc[df_plot.strand=='+', 'position'] = df_plot['position'].astype('int32') + df_plot.loc[df_plot.strand=='-', 'position'] = abs(2 * window + 2 - df_plot['position']) + + df_plot = df_plot.loc[df_plot.name != "."] + + df_plot = df_plot.join( + df_plot.pop('name').str.split('_',n=1,expand=True).rename(columns={0:'name', 1:'exon_id'}) + ) + + heatmap_plot = df_plot.copy() + heatmap_plot['label'] = label + + # Aggregate coverage + df_plot = df_plot.groupby(['name','position'], as_index=False).agg({'coverage':'sum'}) + + exon_cat = pd.DataFrame({'name':exon_categories.index, 'number_exons':exon_categories.values}) + df_plot = df_plot.merge(exon_cat, how="left") + + df_plot['norm_coverage'] = np.where( + df_plot['coverage'] == 0, + df_plot['coverage'], + df_plot['coverage']/df_plot['number_exons'] + ) + + # Fisher test against control + df_plot_ctrl = df_plot.loc[df_plot.name == "control"][["position","coverage","number_exons"]] + df_plot_ctrl.columns = ["position","control_coverage","control_number_exons"] + df_plot = df_plot.merge(df_plot_ctrl, how="left") + df_plot['control_norm_coverage'] = df_plot["control_coverage"] / df_plot["control_number_exons"] + + # Pseudo-count for zero values + df_plot.loc[df_plot['control_norm_coverage'] == 0, ['control_norm_coverage']] = 0.000001 + df_plot['fold_change'] = df_plot["norm_coverage"] / df_plot["control_norm_coverage"] + + # Statistical test + contingency_table = list(zip( + df_plot['coverage'], + df_plot['number_exons']-df_plot['coverage'], + df_plot['control_coverage'], + 
df_plot['control_number_exons'] - df_plot['control_coverage'] + )) + contingency_table = [np.array(table).reshape(2,2) for table in contingency_table] + df_plot['pvalue'] = [stats.fisher_exact(table)[1] for table in contingency_table] + + df_plot['-log10pvalue'] = np.log10(1/df_plot['pvalue']) + df_plot['label'] = label + + df_plot.loc[df_plot['fold_change'] < 1, ['-log10pvalue']] = df_plot['-log10pvalue'] * -1 + df_plot['-log10pvalue_smoothed'] = df_plot['-log10pvalue'].rolling( + smoothing, center=True, win_type="gaussian" + ).mean(std=2) + + return df_plot, heatmap_plot + + +def set_legend_text(legend, exon_categories, original_counts=None): + """Set legend text with optional subset information.""" + exon_cat = pd.DataFrame({'name': exon_categories.index, 'number_exons': exon_categories.values}) + categories = exon_cat['name'].unique() + legend.set_bbox_to_anchor([1.08, 0.75]) + legend.set_title("") + + legend_idx = 0 + + for category in ['constituitive', 'control', 'enhanced', 'silenced']: + if category in categories: + count = exon_cat[exon_cat['name'] == category]['number_exons'].values[0] + text = f"{category.capitalize()} ({count}" + + if original_counts and category in original_counts: + orig_count = original_counts[category] + if orig_count > count: + text += f", subset from {orig_count}" + + text += ")" + legend.texts[legend_idx].set_text(text) + legend_idx += 1 + + +def add_enrichment_marker(fig, ax): + """Add enrichment/depletion marker with arrows.""" + y_min, y_max = ax.get_ylim() + y_zero_position = 0 + marker_height = 0.4 + normalized_bottom = (y_zero_position - y_min) / (y_max - y_min) - marker_height / 2 + + marker_ax = fig.add_axes([0.94, normalized_bottom, 0.06, marker_height], + label=f"enrichment_marker_{ax.get_title()}") + + marker_ax.set_xticks([]) + marker_ax.set_yticks([]) + for spine in marker_ax.spines.values(): + spine.set_visible(False) + + marker_ax.text(0.5, 0.5, "↑\nenriched\n\n\n↓\ndepleted", + ha='center', va='center', 
fontsize=11, + transform=marker_ax.transAxes) + + return marker_ax + + +# =============================================================================== +# MAIN RNA MAP FUNCTION +# =============================================================================== + +def run_rna_maps(enhanced_file, silenced_file, control_file, constitutive_file, + event_info_file, xl_bed, fai, genome_fasta, window, smoothing, + output_dir, no_constitutive, no_subset, all_sites, prefix): + """ + Main function to generate RNA maps from VastDB ID lists. + """ + + log_filename, start_time, logger = setup_logging(output_dir) + + FILEname = prefix + + # Load chromosome list + df_fai = pd.read_csv(fai, sep='\t', header=None) + chroms = set(df_fai[0].values) + + # Load VastDB categories and coordinates + df_rmats = load_vastdb_categories( + enhanced_file, silenced_file, control_file, + constitutive_file, event_info_file + ) + + # Filter to valid chromosomes + df_rmats = df_rmats[df_rmats['chr'].isin(chroms)] + + # Remove exons with missing flanking coordinates + logging.info("\nFiltering exons with complete flanking coordinates...") + before = len(df_rmats) + df_rmats = df_rmats.dropna(subset=['upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE']) + after = len(df_rmats) + logging.info(f"Removed {before - after} exons with missing flanking coordinates") + logging.info(f"Remaining: {after} exons") + + exon_categories = df_rmats.groupby('category').size() + logging.info("\nExons in each category:") + logging.info(exon_categories) + + # Check for required categories + if "control" not in exon_categories or exon_categories.loc["control"] == 0: + raise ValueError("No control exons found!") + + if "enhanced" not in exon_categories and "silenced" not in exon_categories: + raise ValueError("No regulated exons found!") + + # Apply subsetting + if not no_subset: + category_counts = df_rmats['category'].value_counts() + original_counts = {cat: count for cat, count in category_counts.items()} + + 
np.random.seed(42) + logging.info("Random seed set to 42 for reproducible subsetting") + + target_count = 0 + if 'enhanced' in category_counts and 'silenced' in category_counts: + target_count = max(category_counts['enhanced'], category_counts['silenced']) + elif 'enhanced' in category_counts: + target_count = category_counts['enhanced'] + elif 'silenced' in category_counts: + target_count = category_counts['silenced'] + + # Subset control + if 'control' in category_counts and category_counts['control'] > target_count > 0: + control_indices = df_rmats[df_rmats['category'] == 'control'].index + control_indices_to_keep = np.random.choice(control_indices, target_count, replace=False) + drop_mask = df_rmats.index.isin(control_indices) & ~df_rmats.index.isin(control_indices_to_keep) + df_rmats = df_rmats[~drop_mask] + logging.info(f"Subsetted control exons from {category_counts['control']} to {target_count}") + + # Subset constitutive + if not no_constitutive and 'constituitive' in category_counts and category_counts['constituitive'] > target_count > 0: + const_indices = df_rmats[df_rmats['category'] == 'constituitive'].index + const_indices_to_keep = np.random.choice(const_indices, target_count, replace=False) + drop_mask = df_rmats.index.isin(const_indices) & ~df_rmats.index.isin(const_indices_to_keep) + df_rmats = df_rmats[~drop_mask] + logging.info(f"Subsetted constitutive exons from {category_counts['constituitive']} to {target_count}") + else: + logging.info("Subsetting disabled") + category_counts = df_rmats['category'].value_counts() + original_counts = {cat: count for cat, count in category_counts.items()} + + exon_categories = df_rmats.groupby('category').size() + + # Save categorized exons + output_file = os.path.join(output_dir, f"{FILEname}_VastDB_with_categories.tsv") + df_rmats[['chr', 'exonStart_0base', 'exonEnd', 'strand', 'category', 'EVENT', 'GENE', + 'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE']].to_csv( + output_file, sep='\t', 
index=False + ) + logging.info(f"Saved categorized exons to {output_file}") + + # Create BED files for 6 regions + logging.info("\n" + "="*60) + logging.info("CREATING BED FILES FOR SPLICE SITES") + logging.info("="*60) + + middle_3ss_bed = get_ss_bed(df_rmats, 'exonStart_0base', 'exonEnd') + middle_5ss_bed = get_ss_bed(df_rmats, 'exonEnd', 'exonStart_0base') + downstream_3ss_bed = get_ss_bed(df_rmats, 'downstreamES', 'downstreamEE') + upstream_5ss_bed = get_ss_bed(df_rmats, 'upstreamEE', 'upstreamES') + + if all_sites: + downstream_5ss_bed = get_ss_bed(df_rmats, 'downstreamEE', 'downstreamES') + upstream_3ss_bed = get_ss_bed(df_rmats, 'upstreamES', 'upstreamEE') + + # Calculate coverage + if xl_bed is not None: + logging.info("\n" + "="*60) + logging.info("CALCULATING COVERAGE") + logging.info("="*60) + + middle_3ss = get_coverage_plot(xl_bed, middle_3ss_bed, fai, window, exon_categories, 'middle_3ss', smoothing) + middle_5ss = get_coverage_plot(xl_bed, middle_5ss_bed, fai, window, exon_categories, 'middle_5ss', smoothing) + downstream_3ss = get_coverage_plot(xl_bed, downstream_3ss_bed, fai, window, exon_categories, 'downstream_3ss', smoothing) + upstream_5ss = get_coverage_plot(xl_bed, upstream_5ss_bed, fai, window, exon_categories, 'upstream_5ss', smoothing) + + linegraph_middle_3ss = middle_3ss[0] + linegraph_middle_5ss = middle_5ss[0] + linegraph_downstream_3ss = downstream_3ss[0] + linegraph_upstream_5ss = upstream_5ss[0] + + heatmap_middle_3ss = middle_3ss[1] + heatmap_middle_5ss = middle_5ss[1] + heatmap_downstream_3ss = downstream_3ss[1] + heatmap_upstream_5ss = upstream_5ss[1] + + if not all_sites: + plotting_df = pd.concat([linegraph_upstream_5ss, linegraph_middle_3ss, + linegraph_middle_5ss, linegraph_downstream_3ss]) + heat_df = pd.concat([heatmap_upstream_5ss, heatmap_middle_3ss, + heatmap_middle_5ss, heatmap_downstream_3ss]) + else: + downstream_5ss = get_coverage_plot(xl_bed, downstream_5ss_bed, fai, window, exon_categories, 'downstream_5ss', 
smoothing)
+            upstream_3ss = get_coverage_plot(xl_bed, upstream_3ss_bed, fai, window, exon_categories, 'upstream_3ss', smoothing)
+
+            linegraph_downstream_5ss = downstream_5ss[0]
+            linegraph_upstream_3ss = upstream_3ss[0]
+            heatmap_downstream_5ss = downstream_5ss[1]
+            heatmap_upstream_3ss = upstream_3ss[1]
+
+            plotting_df = pd.concat([linegraph_middle_3ss, linegraph_middle_5ss,
+                                     linegraph_downstream_3ss, linegraph_downstream_5ss,
+                                     linegraph_upstream_3ss, linegraph_upstream_5ss])
+            heat_df = pd.concat([heatmap_middle_3ss, heatmap_middle_5ss,
+                                 heatmap_downstream_3ss, heatmap_downstream_5ss,
+                                 heatmap_upstream_3ss, heatmap_upstream_5ss])
+
+    # Save data
+    plotting_df.to_csv(f'{output_dir}/{FILEname}_RNAmap.tsv', sep="\t", index=False)
+
+    # Plot RNA maps
+    logging.info("\n" + "="*60)
+    logging.info("PLOTTING RNA MAPS")
+    logging.info("="*60)
+
+    sns.set(rc={'figure.figsize':(7, 5)})
+    sns.set_style("whitegrid")
+
+    if not all_sites:
+        col_order = ["upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss"]
+        titles = ["Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS"]
+        col_wrap = 4
+    else:
+        col_order = ["upstream_3ss", "upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss", "downstream_5ss"]
+        titles = ["Upstream 3'SS", "Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS", "Downstream 5'SS"]
+        col_wrap = 6
+
+    g = sns.relplot(
+        data=plotting_df,
+        x='position',
+        y='-log10pvalue_smoothed',
+        hue='name',
+        col='label',
+        facet_kws={"sharex":False},
+        kind='line',
+        col_wrap=col_wrap,
+        height=5,
+        aspect=4/5,
+        col_order=col_order
+    )
+
+    for ax, title in zip(g.axes.flat, titles):
+        ax.set_title(title)
+        ax.axhline(y=0, color='k', alpha=0.2, linewidth=0.5)
+        fig = plt.gcf()
+        marker_ax = add_enrichment_marker(fig, ax)
+
+    g.set(xlabel='')
+    g.axes[0].set_ylabel('-log10(p value) enrichment / control')
+
+    sns.move_legend(
+        g, "upper right",
+        bbox_to_anchor=(1, 2),
+        ncol=1,
+        title=None,
+        frameon=False
+    )
+    leg = g._legend
+    
set_legend_text(leg, exon_categories, original_counts) + + # Add exon-intron drawings + rect_fraction = 1 / ((window + 50) / 50) + + for i, ss_type in enumerate(col_order): + ax = g.axes[i] + is_middle = ss_type.startswith('middle_') + exon_color = "midnightblue" if is_middle else "slategrey" + + if ss_type.endswith('_3ss'): + # 3' splice site: exon on right + ax.set_xlim([0, window + 50]) + ticks = np.arange(0, window + 51, 50) + labels = ["" if t in (ticks[0], ticks[-1]) else str(int(t - window)) for t in ticks] + ax.set_xticks(ticks) + ax.set_xticklabels(labels) + + rect = matplotlib.patches.Rectangle( + xy=(1 - rect_fraction, -0.2), width=rect_fraction, height=.1, + color=exon_color, alpha=1, + transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + + rect = matplotlib.patches.Rectangle( + xy=(0, -0.15), width=1 - rect_fraction, height=.001, + color="slategrey", alpha=1, + transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + else: + # 5' splice site: exon on left + ax.set_xlim([window - 50, window * 2]) + ticks = np.arange(window - 50, window * 2 + 1, 50) + labels = ["" if t in (ticks[0], ticks[-1]) else str(int(t - window)) for t in ticks] + ax.set_xticks(ticks) + ax.set_xticklabels(labels) + + rect = matplotlib.patches.Rectangle( + xy=(0, -0.2), width=rect_fraction, height=.1, + color=exon_color, alpha=1, + transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + + rect = matplotlib.patches.Rectangle( + xy=(rect_fraction, -0.15), width=1 - rect_fraction, height=.001, + color="slategrey", alpha=1, + transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + + plt.subplots_adjust(wspace=0.05) + plt.savefig(f'{output_dir}/{FILEname}_RNAmap_-log10pvalue.pdf', + bbox_extra_artists=([leg, rect, marker_ax]), + bbox_inches='tight', + pad_inches=0.8) + + logging.info(f"Saved RNA map to {output_dir}/{FILEname}_RNAmap_-log10pvalue.pdf") + + pbt.helpers.cleanup() + + log_runtime(start_time, logger) + logging.info("="*60) + logging.info("SCRIPT 
COMPLETED SUCCESSFULLY") + logging.info("="*60) + + +# =============================================================================== +# COMMAND LINE INTERFACE +# =============================================================================== + +def main(): + parser = argparse.ArgumentParser( + description='Generate RNA maps from VastDB ID lists', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Example: + python rna_maps_vastdb_only.py \\ + --vastdb_enhanced enhanced_ids.txt \\ + --vastdb_silenced silenced_ids.txt \\ + --vastdb_control control_ids.txt \\ + --vastdb_constitutive constitutive_ids.txt \\ + --vastdb_annotation EVENT_INFO-hg38.tab \\ + -x CLIP.bed -f genome.fa -fi genome.fa.fai \\ + -o output -p PTBP1 + """ + ) + + # Required arguments + required = parser.add_argument_group('Required arguments') + required.add_argument('--vastdb_annotation', required=True, + help='VastDB EVENT_INFO file') + required.add_argument('-x', '--clip', required=True, + help='CLIP crosslinks BED file') + required.add_argument('-f', '--fasta', required=True, + help='Genome FASTA file') + required.add_argument('-fi', '--fasta_index', required=True, + help='Genome FASTA index (.fai)') + required.add_argument('-o', '--output_dir', required=True, + help='Output directory') + required.add_argument('-p', '--prefix', required=True, + help='Output file prefix') + + # VastDB ID lists + vastdb_group = parser.add_argument_group('VastDB ID lists (at least one required)') + vastdb_group.add_argument('--vastdb_enhanced', + help='Enhanced exon IDs (one per line)') + vastdb_group.add_argument('--vastdb_silenced', + help='Silenced exon IDs (one per line)') + vastdb_group.add_argument('--vastdb_control', + help='Control exon IDs (one per line)') + vastdb_group.add_argument('--vastdb_constitutive', + help='Constitutive exon IDs (one per line)') + + # Optional arguments + optional = parser.add_argument_group('Optional arguments') + optional.add_argument('-w', '--window', 
type=int, default=300, + help='Window around splice sites (default: 300)') + optional.add_argument('-s', '--smoothing', type=int, default=15, + help='Smoothing window (default: 15)') + optional.add_argument('-nc', '--no_constitutive', action='store_true', + help='Exclude constitutive exons') + optional.add_argument('-ns', '--no_subset', action='store_true', + help='Disable subsetting') + optional.add_argument('-ao', '--all_sites', action='store_true', + help='Include all 6 splice sites (default: 4 core sites)') + + args = parser.parse_args() + + # Validate at least one ID list provided + if not any([args.vastdb_enhanced, args.vastdb_silenced, + args.vastdb_control, args.vastdb_constitutive]): + parser.error("At least one VastDB ID list must be provided") + + # Create output directory + os.makedirs(args.output_dir, exist_ok=True) + + # Run RNA maps + run_rna_maps( + enhanced_file=args.vastdb_enhanced, + silenced_file=args.vastdb_silenced, + control_file=args.vastdb_control, + constitutive_file=args.vastdb_constitutive, + event_info_file=args.vastdb_annotation, + xl_bed=args.clip, + fai=args.fasta_index, + genome_fasta=args.fasta, + window=args.window, + smoothing=args.smoothing, + output_dir=args.output_dir, + no_constitutive=args.no_constitutive, + no_subset=args.no_subset, + all_sites=args.all_sites, + prefix=args.prefix + ) + + +if __name__ == '__main__': + main() From 021b2bd587162b424d01bc4a51fa8397ae8ecc24 Mon Sep 17 00:00:00 2001 From: Leo Wilkinson Date: Fri, 13 Mar 2026 13:36:40 +0000 Subject: [PATCH 2/5] Update README to highlight rna_maps_vastdb_only.py for VastDB_input branch Adds dedicated VastDB section at the top covering inputs, usage, outputs, and a comparison table vs the original rMATS-based script. Original rna_maps.py docs preserved below. 
Co-Authored-By: Claude Sonnet 4.6 --- README.md | 116 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 65010a2..a5568a2 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,99 @@ ## RNA maps Authors: charlotte.capitanchik@crick.ac.uk; aram.amalietti@gmail.com +--- + +> **Branch: VastDB_input** +> This branch introduces `rna_maps_vastdb_only.py`, a standalone script for generating RNA maps directly from VastDB EVENT IDs — no rMATS required. See [VastDB RNA Maps](#vastdb-rna-maps-rna_maps_vastdb_onlypy) below. + +--- + +## VastDB RNA Maps (`rna_maps_vastdb_only.py`) + +This script generates RNA maps from pre-categorised VastDB cassette exon EVENT IDs. Instead of running differential splicing analysis with rMATS, you supply lists of EVENT IDs already assigned to categories (enhanced, silenced, control, constitutive), along with a VastDB annotation file to look up genomic coordinates. + **Quick Start** -Create a conda environment with all dependencies and activate it: +Create a conda environment and activate it: ``` conda env create -f environment.yml conda activate rnamaps ``` -Then run the test dataset to check the code is working: +Run with VastDB inputs: + +``` +python rna_maps_vastdb_only.py \ + --vastdb_enhanced enhanced_ids.txt \ + --vastdb_silenced silenced_ids.txt \ + --vastdb_control control_ids.txt \ + --vastdb_constitutive constitutive_ids.txt \ + --vastdb_annotation EVENT_INFO-hg38.tab \ + -x CLIP_crosslinks.bed \ + -f genome.fa \ + -fi genome.fa.fai \ + -o output/ \ + -p PTBP1 +``` + +**Preparing inputs** + +- **EVENT ID lists**: Plain text files with one VastDB EVENT ID per line (e.g. `HsaEX0012345`). Lines beginning with `#` are ignored. +- **VastDB annotation file**: The `EVENT_INFO-*.tab` file from VastDB, used to look up genomic coordinates (chromosome, exon start/end, flanking exon coordinates, strand) for each EVENT ID. 
+- **CLIP crosslinks**: iCLIP or eCLIP crosslinks in BED format. +- **Genome FASTA + index**: Reference genome `.fa` and `.fa.fai` files. + +**Key differences from `rna_maps.py`** + +| Feature | `rna_maps.py` | `rna_maps_vastdb_only.py` | +|---|---|---| +| Splicing input | rMATS `SE.MATS.JCEC.txt` | VastDB EVENT ID lists | +| Category assignment | Computed from dPSI / FDR thresholds | Pre-assigned by user | +| Annotation source | rMATS coordinates | VastDB `EVENT_INFO` file | +| rMATS dependency | Required | Not required | + +**Usage** + +``` +python rna_maps_vastdb_only.py -h + +required arguments: + --vastdb_annotation VastDB EVENT_INFO file + -x, --clip CLIP crosslinks BED file + -f, --fasta Genome FASTA file + -fi, --fasta_index Genome FASTA index (.fai) + -o, --output_dir Output directory + -p, --prefix Output file prefix + +VastDB ID lists (at least one required): + --vastdb_enhanced Enhanced exon IDs (one per line) + --vastdb_silenced Silenced exon IDs (one per line) + --vastdb_control Control exon IDs (one per line) + --vastdb_constitutive Constitutive exon IDs (one per line) + +optional arguments: + -w, --window Window around splice sites [DEFAULT 300] + -s, --smoothing Smoothing window [DEFAULT 15] + -nc, --no_constitutive Exclude constitutive category + -ns, --no_subset Disable subsetting of control/constitutive to match enhanced/silenced counts + -ao, --all_sites Include all 6 splice sites (default: 4 core sites) +``` + +**Outputs** + +- `{prefix}_VastDB_with_categories.tsv` — categorised exons with coordinates +- `{prefix}_RNAmap.tsv` — per-position coverage and enrichment data +- `{prefix}_RNAmap_-log10pvalue.pdf` — RNA map plot +- `execution_*.log` — run log + +--- + +## Original RNA Maps (`rna_maps.py`) + +The original script accepts rMATS quantified splicing files and assigns categories based on dPSI and FDR thresholds. 
+ +**Quick Start** ``` python rna_maps.py \ @@ -19,6 +102,7 @@ python rna_maps.py \ -f test/homosapien-hg37-chr21.fa \ -fi test/homosapien-hg37-chr21.fa.fai ``` + **Preparing RNA-Seq data**: This code accepts rMATs quantified files for cassette exons (e.g. SE.MATS.JCEC.txt). @@ -43,18 +127,7 @@ python rna_maps.py \ ``` If you want to create a multivalency map alone (no CLIP data) simply run the above command with the `-x crosslinks.bed` excluded. -**Dependencies** (these are the versions the script was developped with, pandas >= 1 introduced breaking changes, please use these versions): -``` -python=3.7.7 -pandas=0.24.2 -numpy=1.19.2 -pybedtools=0.8.1 -matplotlib=3.3.2 -seaborn=0.11.0 -scipy=1.3.1 -``` - -**Usage**: +**Usage**: ``` python rna_maps.py -h usage: rna_maps.py [-h] -i INPUTSPLICE [-x [INPUTXLSITES]] -f GENOMEFASTA -fi @@ -112,8 +185,21 @@ options: directory for where to find germs.R for multivalency analysis eg. /Users/Bellinda/repos/germs [DEFAULT current directory] -p PREFIX, --prefix PREFIX - prefix for output files [DEFAULT inputsplice file name]``` + prefix for output files [DEFAULT inputsplice file name] +``` + +**Dependencies** (these are the versions the script was developed with, pandas >= 1 introduced breaking changes, please use these versions): ``` +python=3.7.7 +pandas=0.24.2 +numpy=1.19.2 +pybedtools=0.8.1 +matplotlib=3.3.2 +seaborn=0.11.0 +scipy=1.3.1 +``` + +--- ### Definitions From 6c161e097966d7eb5bbcfe80a6b6dbf4e5d33d9e Mon Sep 17 00:00:00 2001 From: Leo Wilkinson Date: Wed, 18 Mar 2026 19:26:43 +0000 Subject: [PATCH 3/5] merge orig rna_maps.py with Vast_DB input script --- .DS_Store | Bin 16388 -> 10244 bytes rna_maps.py | 2413 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 1408 insertions(+), 1005 deletions(-) diff --git a/.DS_Store b/.DS_Store index cfa1310aacb6e21a285b248e59109903fdfb0b4c..8166c9bda2c3f1cf099732065ea6a43700c18811 100644 GIT binary patch delta 170
zcmZo^U~CDHU|?WibSh0TWMEJLGC6=4L<{gtEEJolCppJAZrt4@ zV2Z^`#a6A4`k<|^+WM~4TJczmJz|fKR*xPvwnBT)B8S;lmhTQ%Mty2Z55lD@|7ik0pycxRV+$9-g6WOSrzd<)v-sv9l`@#{s zeLo0k=^2@mL`gQ8rdV5ATb8%2=vS2Cuy>W$?~3?Fyh~jXZ@6E{4sQ&OwhjaXogwdt z&pX;X5cCfZ2Cyb)z~vtp#>UTe3i|z>!LTpl3kKHoMZB9Mw2q!AuBvZs zI2ejFv6J=r@cg`0F1<#_5bYf9@rJ^fO47cpY$Yf6xT(|3dA8Xl3oC0DH}(w#2M4`@ zNO-m1<@Wk}M|j(H9=x5VkQa}h?l4|=JuPStdVHt*uw;`j;OP#y2EC`!;uWrNq+`e% z@Os9l=UZkJn2M~cv4MB<8drG08}Rr78&;3{JdusPBfhZD?e{8^q#P&}S?l9X#a1Ol zQan6+f@!W*$&{2fo}F(kvnqn9?B-d!)nQW7M8zeszE##TlL8{o)SBu{O1h+krSTd0 zmgy#wvq6$_1g}C^+a_zXRS`vHuP7F4##Zbs;XmV^uGnekrBk(bY6|$dFC=!-NS2dR z$q?B>E+AKu>&V??H@S!WmOM?KBQKJ_k$;f?KpKdU2R1kX>`)C$zzGeo6q;crbi&u5 z3%cPH=!et54FlkZ00iMo2*DU^g|lHBoCDv13*jR8E?foQhabSza1Go9H^VJ(JKO>H zz`gKG_!aDd`{7}D68;3Q@H-T!*P&ZGPqZT(#9tY>o6rxr$>}6WHWLTwCoarKNN6y) zbgA1df$HtUhYvI*T@(N(Qza zeP`0H$;<;sxMfDBKmaiP2f;KPb%2PGlkSQqbQ=^ zPi?6YNR2>h1mZ@3ejjYO)6}V^+jz=89eAMdf!0$9!8ZJjJb;zi)YPe_+jz>lVjtEO zQ{I*O9A3}5W4TSuJJoa>PkDC&ygLE)%Ahh7;N1z>SaSkSo~Dke5lD@|#1Wu8KPg)B zOhijbTMun4??Q|{L@xpVx|1eJ&O6c-3iyM84YVQd>4?m{$UGwM(J<-;44vJlVXoh6 zm}`<~5>M$uYS9&OMTWzTuFyJ~>kbC}Ztb_r-GgMHUzyqz3`AVMfH%ZCAn?H^>w6KZ z2S*!&!vRm2_3h#nq9jWFie+r9(ot?NwK=LCTT5+Y<@VaDQd>=}ed|_9oL0QBZdLy# z|6m|^A$>5sarHAqrtp09coF`JK9MIp;CJI4(%NXDe?x4b?}0<#=PR+PJu{!ol-D=?e&tW6YI@vPZqnGra}r8b;bBFl`x zAyeXA7c7*`oc<{uiFd20l+79!RNBw*)Z)c>X6B`LL01rcIbiia4KVRK4frVh`=y1 zxE;vfEe17Wg|C1eYVkRw=xIl^2z zClEhq^Qf2SwWEOAq;ASEMUg2I5u<5gX8og= zZn` z4Bv!v;ajLOTnv}OWpFvF4A;VS@FTbZbq3TJxYb2%u~oA|p2CBAThwa^IV(P@tH0=x zhWPrJ*EFiv`$_qbon9j98WKDstvi-mCO`Rf%Kw{_>;K2{7E(NhqsRNxcIl^;PKRTqvh| zzzc)u-bUdpl+(8}5&aT$Z`Z@Ea2xy-<@0-CFFXjpfoI`)cmZC7zrZ^}rXUKEAPXj8 zs*o>O*qexfus)vGj>3HWm4w9cATNVtLq?FSaM{UnYm`466Lw+<_@Zc7`s|<>hg^w3 zOao}jBu*hHMzY%ELM4N-51O4d8?%{=chKy-lEOkoz!;Fo0M&wpg|d>y_zbPBD8VcZ z0@2K(8d+A-8MC3;`o)+fDC_guhDJOQ3jxJQgR#XDY69uV8a5CgiRh{6*;j_2)z|59 zPtiM-m%nnUihI<)gTWZ?F{;<&p0jVdgxc67?YbCw942o4)$yj5Pi&#ThsZy7?6`FE zLk&i)Vt10fmHvKI%~B=D5-p+{8-@ffNS5~>pBu`!e+sw$Zx 
zQC03RQY6l&Ly6y0{#hjx6p3$RrQwt5$6Kg=?2e@EVxq($Z3N;(iSE@svf*U>Wrts7&z6M5jm@?U6gh*avum9s^B!YkEy@~viD9mKe4yQnr1tvPf(vO)Rs%|NqX*5636}Ega=EwhwOU%!k zZkDM!K_|2-PbBq;M_{nV!bBEgwIUQI=AtZ7%7lp;CYn(*JsD+* zdRKYX`@8M?ZpWn#N!!L8=KpG|cbmc@=Ee=LpCDyqvc(u7M8u9bmwLZ8VWKYeSPT7y zV`F8Nb_a_4WtBBt>aSdcjqkjo$mDkhTj5|ld;Oo|Q>CnvQ5@V^Tvg24gswxRC8Q`r0U{NvqRrFn*DnrKh*2Z_Qf6l{rVV+=sf^xRq?5(jvEA zS$PG=)75RYOPX3%bdEFG*)+|hi@?NI|8R1mQ4-DtaLt%Bo}HuQqM&SvhF>rgBaY`P zIoT$Q)rQjV?s%|6Yb+M)tOT&bJItMj9b6pjuo6?*0twL$2Rp2*$x$KVA~+T7@S;UE zSkx8;JG`)du}Bxs#lQ|Lwlp--wQY%Dhu1AT8S5U^gB`WJ%@FL+M{l=UEU^#_eexV7 z+p@~iEn>*UAP{qm*3>;joW zQH{xTu+?=mD&s=UI|xH8GP8xL-P_?jwyJI?uB*EmV>7!jHghMUAq@vTf$^D_-~h&F z4#MH6Brr|L6Q&FEg;JqPSR%9u-SpFkxwG*=@7z~SMNyjY`Ql=;NAb6fo~#N z%P2hfs;>W~tJ=8;Znyx$Sn8ls0E*^`QU@)LmpWKUoYcX(#!DTnC{gNQg%hO?R-8oY zV08vkhgxndb+F1rsUxNT)33F*rS$)g{}TvR-sw_|FR}hlm)+(Qqlne5= window_size: - # Get coverage values - values = group_sorted['coverage'].values - - # Create a Series with integer indices (not the DataFrame indices) - # This ensures clean math without index alignment issues - s = pd.Series(values) - - # Apply rolling window - smoothed = s.rolling( - window=window_size, - center=True, - win_type='gaussian' - ).mean(std=std) - - # Fill NaN values at the edges with original values - smoothed = smoothed.fillna(s) - - # Update the coverage in the original group - # This works because the sorted indices correspond to the smoothed series - group_sorted['coverage'] = smoothed.values - - # Add the processed group to our list - groups.append(group_sorted) - - # Combine all groups back into a single DataFrame - return pd.concat(groups) +sns.set_style("whitegrid", {'legend.frameon': True}) + +colors_dict = { + 'all': '#D3D3D3', + 'ctrl': '#408F76', + 'enh': '#F30C08', + 'sil': '#005299', + 'enhrest': '#FFB122', + 'silrest': '#6DC2F5', + 'const': '#666666' +} +linewidth = 3 +dashes = False + + +# =============================================================================== +# UTILITY FUNCTIONS +# 
=============================================================================== def setup_logging(output_path): - """Sets up logging to file and console, and returns the log filename + start time.""" + """Sets up logging to file and console.""" timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") log_filename = f"execution_{timestamp}.log" log_path = os.path.join(output_path, log_filename) - # Get root logger and reset handlers to avoid duplicates logger = logging.getLogger() logger.handlers.clear() logger.setLevel(logging.INFO) - # Create file handler file_handler = logging.FileHandler(log_path, mode='w') file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) - # Create stream handler (console output) stream_handler = logging.StreamHandler() stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) - # Add handlers logger.addHandler(file_handler) logger.addHandler(stream_handler) - # Ensure file writes are flushed automatically file_handler.flush = file_handler.stream.flush - # Capture start time start_time = time.time() - # Set higher logging level for matplotlib (only show WARNING and above) + logging.getLogger('matplotlib').setLevel(logging.WARNING) logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) logging.getLogger('matplotlib.backends').setLevel(logging.ERROR) - - # Also suppress fontTools logging (source of many of those messages) logging.getLogger('fontTools').setLevel(logging.ERROR) logging.getLogger('fontTools.subset').setLevel(logging.ERROR) @@ -111,447 +114,1263 @@ def setup_logging(output_path): return log_filename, start_time, logger + def log_runtime(start_time, logger): - """Logs the script execution time in minutes and seconds format.""" + """Logs the script execution time.""" end_time = time.time() total_seconds = int(end_time - start_time) - minutes, seconds = divmod(total_seconds, 60) runtime_str = f"{minutes}m {seconds}s" if minutes > 0 else 
f"{seconds}s" - - # Ensure log is written immediately logger.info(f"Script completed in {runtime_str}.") for handler in logger.handlers: handler.flush() -sns.set_style("whitegrid", {'legend.frameon':True}) -colors_dict = {'all': '#D3D3D3', 'ctrl': '#408F76', 'enh': '#F30C08', 'sil': '#005299', 'enhrest': '#FFB122', 'silrest': '#6DC2F5', 'const': '#666666'} -linewidth = 3 -dashes = False +def smooth_coverage(df, window_size=10, std=2): + """Smooth coverage data using rolling Gaussian window.""" + result = df.copy() + groups = [] -def cli(): - parser = argparse.ArgumentParser(description='Plot CLIP crosslinks around regulated exons to study position-dependent impact on pre-mRNA splicing.') - optional = parser._action_groups.pop() - required = parser.add_argument_group('required arguments') - required.add_argument('-i',"--inputsplice", type=str, required=True, - help='quantification of differential splicing produced by rMATS') - optional.add_argument('-x',"--inputxlsites", type=str, nargs='?', - help='CLIP crosslinks in BED file format') - required.add_argument('-f',"--genomefasta", type=str, required=True, - help='genome fasta file (.fa)') - required.add_argument('-fi',"--fastaindex", type=str, required=True, - help='genome fasta index file (.fai)') - optional.add_argument('-o',"--outputpath", type=str, default=os.getcwd(), nargs='?', - help='output folder [DEFAULT current directory]') - optional.add_argument('-w',"--window", type=int, default=300, nargs='?', - help='window around regulated splicing events to plot crosslinks [DEFAULT 300]') - optional.add_argument('-s',"--smoothing", type=int, default=15, nargs='?', - help='smoothing window for plotting crosslink signal [DEFAULT 15]') - optional.add_argument('-mc',"--minctrl", type=float, default=-0.05, nargs='?', - help='minimum dPSI for control events [DEFAULT -0.05]') - optional.add_argument('-xc',"--maxctrl", type=float, default=0.05, nargs='?', - help='maximum dPSI for control events [DEFAULT 0.05]') - 
optional.add_argument('-xi',"--maxincl", type=float, default=0.9, nargs='?', - help='maximum PSI for control exons, above this limit exons are considered constitutive [DEFAULT 0.9]') - optional.add_argument('-xf',"--maxfdr", type=float, default=0.1, nargs='?', - help='maximum FDR for regulated events, above this events fall in "rest" class, is used for rMATS [DEFAULT 0.1]') - optional.add_argument('-xe',"--maxenh", type=float, default=-0.05, nargs='?', - help='maximum inclusion for exons to be considered enhanced [DEFAULT -0.05]') - optional.add_argument('-ms',"--minsil", type=float, default=0.05, nargs='?', - help='minimum inclusion for exons to be considered silenced [DEFAULT 0.05]') - optional.add_argument('-v','--multivalency', action="store_true") - optional.add_argument('-nc','--no_constitutive', action="store_true", - help='Exclude constitutive category from the output') - optional.add_argument('-ns','--no_subset', action="store_true", - help='Disable subsetting of control/constitutive exons to match enhanced/silenced counts') - optional.add_argument('-ao','--all_sites', action="store_true", - help='Include all splice sites (upstream_3ss and downstream_5ss), default is core sites only') - optional.add_argument('-g',"--germsdir", type=str, default=os.getcwd(), nargs='?', - help='directory for where to find germs.R for multivalency analysis eg. 
/Users/Bellinda/repos/germs [DEFAULT current directory]') - optional.add_argument('-p',"--prefix", type=str, - help='prefix for output files [DEFAULT inputsplice file name]') - parser._action_groups.append(optional) - args = parser.parse_args() + for (exon_id, label), group_df in result.groupby(['exon_id', 'label']): + if len(group_df) < 3: + groups.append(group_df) + continue + + group_sorted = group_df.sort_values('position') + + if len(group_sorted) >= window_size: + values = group_sorted['coverage'].values + s = pd.Series(values) - logging.info(args) - - return( - args.inputsplice, - args.inputxlsites, - args.genomefasta, - args.fastaindex, - args.outputpath, - args.window, - args.smoothing, - args.minctrl, - args.maxctrl, - args.maxincl, - args.maxfdr, - args.maxenh, - args.minsil, - args.multivalency, - args.germsdir, - args.no_constitutive, - args.no_subset, - args.all_sites, - args.prefix + smoothed = s.rolling( + window=window_size, + center=True, + win_type='gaussian' + ).mean(std=std) + + smoothed = smoothed.fillna(s) + group_sorted['coverage'] = smoothed.values + + groups.append(group_sorted) + + return pd.concat(groups) + + +# =============================================================================== +# TRACK 1: rMATS DATA LOADING +# =============================================================================== + +def load_rmats_data(de_file, min_ctrl, max_ctrl, max_inclusion, + max_fdr, max_enh, min_sil, chroms, no_constitutive): + """ + Load and categorise exons from rMATS output. + + Reads rMATS SE file, computes maxPSI, deduplicates, assigns categories + from dPSI/FDR thresholds, and corrects upstream/downstream labels for + minus-strand genes (rMATS labels by genomic position, not transcript order). + + Returns DataFrame with canonical columns for the shared pipeline. 
+ """ + logging.info("=" * 60) + logging.info("INPUT MODE: rMATS") + logging.info("=" * 60) + + rmats = pd.read_csv(de_file, sep='\t') + + if 'exonStart_0base' not in rmats.columns: + raise ValueError( + "Input file does not appear to be rMATS format " + "(missing 'exonStart_0base' column)" ) -def df_apply(col_fn, *col_names): - def inner_fn(df): - cols = [df[col] for col in col_names] - return col_fn(*cols) - return inner_fn + rmats = rmats[rmats['chr'].isin(chroms)] + + # Compute max PSI across all samples + rmats['inclusion'] = ( + rmats.IncLevel1.str.split(',') + rmats.IncLevel2.str.split(',') + ) + rmats['inclusion'] = rmats['inclusion'].apply( + lambda x: max([float(y) for y in x if y != 'NA']) + ) + + df_rmats = rmats.loc[:, [ + 'chr', 'exonStart_0base', 'exonEnd', 'FDR', 'IncLevelDifference', + 'strand', 'inclusion', + 'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE' + ]].rename(columns={ + 'IncLevelDifference': 'dPSI', + 'inclusion': 'maxPSI' + }).reset_index() + + # Deduplicate: keep the most extreme dPSI per exon + mask = df_rmats.groupby( + ['chr', 'exonStart_0base', 'exonEnd', 'strand'] + )['dPSI'].transform(lambda x: abs(x).rank(ascending=False)) < 2 + df_rmats = df_rmats[mask] + + # Assign categories from thresholds + conditions = [ + (df_rmats["dPSI"].gt(min_sil) & df_rmats["FDR"].lt(max_fdr)), # silenced + (df_rmats["dPSI"].lt(max_enh) & df_rmats["FDR"].lt(max_fdr)), # enhanced + (df_rmats["dPSI"].gt(min_ctrl) & df_rmats["dPSI"].lt(max_ctrl) + & df_rmats["maxPSI"].gt(max_inclusion)), # constitutive + (df_rmats["dPSI"].gt(min_ctrl) & df_rmats["dPSI"].lt(max_ctrl)), # control + ] + choices = ["silenced", "enhanced", "constituitive", "control"] + df_rmats["category"] = np.select(conditions, choices, default=None) + + # Filter out constitutive if requested + if no_constitutive: + df_rmats = df_rmats[df_rmats['category'] != 'constituitive'] + + # --------------------------------------------------------------- + # FIX: Swap upstream/downstream 
for minus-strand genes. + # + # rMATS labels "upstream" and "downstream" by genomic coordinate + # (lower = upstream, higher = downstream). For minus-strand genes + # this is inverted relative to transcript order. We swap here so + # that after loading, upstream/downstream always mean transcript- + # relative direction, matching VastDB's CO_C1/CO_C2 convention. + # --------------------------------------------------------------- + minus_mask = df_rmats['strand'] == '-' + cols_to_swap = [('upstreamES', 'downstreamES'), ('upstreamEE', 'downstreamEE')] + for col_a, col_b in cols_to_swap: + df_rmats.loc[minus_mask, [col_a, col_b]] = \ + df_rmats.loc[minus_mask, [col_b, col_a]].values + + logging.info(f"Loaded {len(df_rmats)} categorised exons from rMATS") + logging.info(f"Category distribution:\n{df_rmats['category'].value_counts()}") + + return df_rmats + + +# =============================================================================== +# TRACK 2: VASTDB DATA LOADING +# =============================================================================== + +def parse_event_info_coordinates(event_info_df): + """ + Parse genomic coordinates from EVENT_INFO file. + Returns DataFrame with exon and flanking exon coordinates. 
+ """ + coords_list = [] + + for idx, row in event_info_df.iterrows(): + coord_str = str(row['COORD_o']) + match = re.match(r'(chr[\w]+):(\d+)-(\d+)', coord_str) + if not match: + continue + + ref_co = str(row.get('REF_CO', '')) + strand_match = re.search(r':([+-])$', ref_co) + strand = strand_match.group(1) if strand_match else '+' + + upstream_start, upstream_end = np.nan, np.nan + downstream_start, downstream_end = np.nan, np.nan + + if pd.notna(row.get('CO_C1', np.nan)): + co_c1_match = re.match(r'chr[\w]+:(\d+)-(\d+)', str(row['CO_C1'])) + if co_c1_match: + upstream_start = int(co_c1_match.group(1)) + upstream_end = int(co_c1_match.group(2)) + + if pd.notna(row.get('CO_C2', np.nan)): + co_c2_match = re.match(r'chr[\w]+:(\d+)-(\d+)', str(row['CO_C2'])) + if co_c2_match: + downstream_start = int(co_c2_match.group(1)) + downstream_end = int(co_c2_match.group(2)) + + coords_list.append({ + 'EVENT': row['EVENT'], + 'GENE': row['GENE'], + 'chr': match.group(1), + 'exonStart_1based': int(match.group(2)), + 'exonEnd': int(match.group(3)), + 'strand': strand, + 'upstreamES': upstream_start, + 'upstreamEE': upstream_end, + 'downstreamES': downstream_start, + 'downstreamEE': downstream_end, + }) + + coords_df = pd.DataFrame(coords_list) + + if len(coords_df) > 0: + logging.info(f"Parsed coordinates for {len(coords_df)} events") + + return coords_df + + +def load_vastdb_data(enhanced_file, silenced_file, control_file, + constitutive_file, event_info_file, chroms): + """ + Load VastDB ID lists and look up coordinates from EVENT_INFO. + + Categories are assigned by list membership. Coordinates are converted + from 1-based (VastDB) to 0-based (BED). CO_C1/CO_C2 are already in + transcript order, so no strand correction is needed. + + Returns DataFrame with canonical columns for the shared pipeline. 
+ """ + logging.info("=" * 60) + logging.info("INPUT MODE: VastDB ID Lists") + logging.info("=" * 60) + + # Read ID lists + def read_id_list(filepath, category): + if filepath is None: + return [] + with open(filepath) as f: + ids = [line.strip() for line in f + if line.strip() and not line.startswith('#')] + logging.info(f"Loaded {len(ids)} {category} IDs") + return ids + + enhanced_ids = read_id_list(enhanced_file, 'enhanced') + silenced_ids = read_id_list(silenced_file, 'silenced') + control_ids = read_id_list(control_file, 'control') + constitutive_ids = read_id_list(constitutive_file, 'constitutive') + + all_ids = enhanced_ids + silenced_ids + control_ids + constitutive_ids + + if len(all_ids) == 0: + raise ValueError("No EVENT IDs provided in any file!") + + logging.info(f"Total EVENT IDs: {len(all_ids)}") + + # Create ID to category mapping + id_to_category = {} + for eid in enhanced_ids: + id_to_category[eid] = 'enhanced' + for eid in silenced_ids: + id_to_category[eid] = 'silenced' + for eid in control_ids: + id_to_category[eid] = 'control' + for eid in constitutive_ids: + id_to_category[eid] = 'constituitive' # Match original spelling + + # Load EVENT_INFO + logging.info(f"Loading EVENT_INFO: {event_info_file}") + event_info = pd.read_csv(event_info_file, sep='\t') + logging.info(f"EVENT_INFO contains {len(event_info)} events") + + # Parse coordinates + event_coords = parse_event_info_coordinates(event_info) + + # Filter to our IDs + event_coords_subset = event_coords[event_coords['EVENT'].isin(all_ids)].copy() + + n_matched = len(event_coords_subset) + n_total = len(all_ids) + logging.info(f"Matched {n_matched}/{n_total} IDs to coordinates " + f"({100 * n_matched / n_total:.1f}%)") + + if n_matched == 0: + raise ValueError("No EVENT IDs matched to coordinates!") + + # Convert 1-based to 0-based + event_coords_subset['exonStart_0base'] = event_coords_subset['exonStart_1based'] - 1 + event_coords_subset['upstreamES'] = event_coords_subset['upstreamES'] - 1 
+ event_coords_subset['downstreamES'] = event_coords_subset['downstreamES'] - 1 + + # Assign categories + event_coords_subset['category'] = event_coords_subset['EVENT'].map(id_to_category) + + # Add FDR placeholder (not used but needed for column contract) + event_coords_subset['FDR'] = 0.001 + + # Filter to valid chromosomes + event_coords_subset = event_coords_subset[event_coords_subset['chr'].isin(chroms)] + + logging.info(f"Category distribution:\n{event_coords_subset['category'].value_counts()}") + + return event_coords_subset + + +# =============================================================================== +# SHARED PIPELINE FUNCTIONS +# =============================================================================== + +def apply_subsetting(df_rmats, no_constitutive): + """ + Subset control and constitutive exons to match the largest regulated + category count. Returns (subsetted df, original_counts dict). + """ + category_counts = df_rmats['category'].value_counts() + original_counts = {cat: count for cat, count in category_counts.items()} + + target_count = 0 + if 'enhanced' in category_counts and 'silenced' in category_counts: + target_count = max(category_counts['enhanced'], category_counts['silenced']) + elif 'enhanced' in category_counts: + target_count = category_counts['enhanced'] + elif 'silenced' in category_counts: + target_count = category_counts['silenced'] + + # Subset control + if 'control' in category_counts and category_counts['control'] > target_count > 0: + control_indices = df_rmats[df_rmats['category'] == 'control'].index + control_keep = np.random.choice(control_indices, target_count, replace=False) + drop_mask = df_rmats.index.isin(control_indices) & ~df_rmats.index.isin(control_keep) + df_rmats = df_rmats[~drop_mask] + logging.info(f"Subsetted control exons from {category_counts['control']} to {target_count}") + + # Subset constitutive + if (not no_constitutive + and 'constituitive' in category_counts + and 
category_counts['constituitive'] > target_count > 0): + const_indices = df_rmats[df_rmats['category'] == 'constituitive'].index + const_keep = np.random.choice(const_indices, target_count, replace=False) + drop_mask = df_rmats.index.isin(const_indices) & ~df_rmats.index.isin(const_keep) + df_rmats = df_rmats[~drop_mask] + logging.info(f"Subsetted constitutive exons from " + f"{category_counts['constituitive']} to {target_count}") + + return df_rmats, original_counts + def get_ss_bed(df, pos_col, neg_col): - df['exon_id'] = df['category'] + "_" + df['chr'].astype(str) + ":" + df['exonStart_0base'].astype(str) + "-" + df['exonEnd'].astype(str) + ";" + df['strand'].astype(str) + """ + Create BED file for splice sites (handles strand orientation). + + pos_col: column to use for + strand (one end of the exon) + neg_col: column to use for - strand (other end of same exon) - ss_pos = df.loc[df['strand'] == "+", ['chr', pos_col, pos_col, 'exon_id', 'FDR', 'strand']] + Both columns should reference the SAME exon. The upstream/downstream + strand correction happens at load time, not here. 
+ """ + df = df.copy() + + df['exon_id'] = ( + df['category'] + "_" + + df['chr'].astype(str) + ":" + + df['exonStart_0base'].astype(str) + "-" + + df['exonEnd'].astype(str) + ";" + + df['strand'].astype(str) + ) + + ss_pos = df.loc[df['strand'] == "+", + ['chr', pos_col, pos_col, 'exon_id', 'FDR', 'strand']] ss_pos.columns = ['chr', 'start', 'end', 'name', 'score', 'strand'] - ss_pos.start = ss_pos.start.transform(lambda x: x-1) + ss_pos.start = ss_pos.start.transform(lambda x: x - 1) - ss_n = df.loc[df['strand'] == "-", ['chr', neg_col, neg_col, 'exon_id', 'FDR', 'strand']] + ss_n = df.loc[df['strand'] == "-", + ['chr', neg_col, neg_col, 'exon_id', 'FDR', 'strand']] ss_n.columns = ['chr', 'start', 'end', 'name', 'score', 'strand'] - ss_n.end = ss_n.end.transform(lambda x: x+1) + ss_n.end = ss_n.end.transform(lambda x: x + 1) ss = pd.concat([ss_pos, ss_n]) return ss -def get_coverage_plot(xl_bed, df, fai, window, exon_categories, label): - """Return coverage of xl_bed items around df features extended by windows""" + +def get_coverage_plot(xl_bed, df, fai, window, exon_categories, label, + smoothing=15): + """ + Calculate coverage of CLIP crosslinks around splice sites with + Fisher's exact test enrichment against control exons. 
+ """ df = df.loc[df.name != "."] xl_bed = pbt.BedTool(xl_bed).sort() - pbt_df = pbt.BedTool.from_dataframe(df[['chr', 'start', 'end', 'name', 'score', 'strand']]).sort().slop(l=window, r=window, s=True, g=fai) - df_coverage = pbt_df.coverage(b=xl_bed, **{'sorted': True, 's': True, 'd': True, 'nonamecheck': True}).to_dataframe()[['thickStart', 'thickEnd', 'strand', 'name']] - df_coverage.rename(columns=({'thickStart':'position','thickEnd':'coverage'}), inplace=True) + pbt_df = pbt.BedTool.from_dataframe( + df[['chr', 'start', 'end', 'name', 'score', 'strand']] + ).sort().slop(l=window, r=window, s=True, g=fai) + + df_coverage = pbt_df.coverage( + b=xl_bed, **{'sorted': True, 's': True, 'd': True, 'nonamecheck': True} + ).to_dataframe()[['thickStart', 'thickEnd', 'strand', 'name']] + + df_coverage.rename( + columns={'thickStart': 'position', 'thickEnd': 'coverage'}, inplace=True + ) df_plot = df_coverage - - df_plot.loc[df_plot.strand=='+', 'position'] = df_plot['position'].astype('int32') - df_plot.loc[df_plot.strand=='-', 'position'] = abs(2 * window + 2 - df_plot['position']) + + # Adjust positions based on strand + df_plot.loc[df_plot.strand == '+', 'position'] = \ + df_plot['position'].astype('int32') + df_plot.loc[df_plot.strand == '-', 'position'] = \ + abs(2 * window + 2 - df_plot['position']) df_plot = df_plot.loc[df_plot.name != "."] - df_plot =df_plot.join( df_plot.pop('name').str.split('_',n=1,expand=True).rename(columns={0:'name', 1:'exon_id'}) ) + df_plot = df_plot.join( + df_plot.pop('name').str.split('_', n=1, expand=True).rename( + columns={0: 'name', 1: 'exon_id'} + ) + ) - heatmap_plot = df_plot + heatmap_plot = df_plot.copy() heatmap_plot['label'] = label - df_plot = df_plot.groupby(['name','position'], as_index=False).agg({'coverage':'sum'}) - - exon_cat = pd.DataFrame({'name':exon_categories.index, 'number_exons':exon_categories.values}) - df_plot = df_plot.merge(exon_cat, how="left") + # Aggregate coverage + df_plot = df_plot.groupby( + 
['name', 'position'], as_index=False + ).agg({'coverage': 'sum'}) - df_plot['norm_coverage'] = np.where(df_plot['coverage'] == 0, df_plot['coverage'], df_plot['coverage']/df_plot['number_exons']) + exon_cat = pd.DataFrame({ + 'name': exon_categories.index, + 'number_exons': exon_categories.values + }) + df_plot = df_plot.merge(exon_cat, how="left") - # fisher test [covered exons, non-covered exon, control covered exons, control non-covered exons] - df_plot_ctrl = df_plot.loc[df_plot.name == "control"][["position","coverage","number_exons"]] - df_plot_ctrl.columns = ["position","control_coverage","control_number_exons"] + df_plot['norm_coverage'] = np.where( + df_plot['coverage'] == 0, + df_plot['coverage'], + df_plot['coverage'] / df_plot['number_exons'] + ) + + # Fisher test against control + df_plot_ctrl = df_plot.loc[ + df_plot.name == "control" + ][["position", "coverage", "number_exons"]] + df_plot_ctrl.columns = ["position", "control_coverage", "control_number_exons"] df_plot = df_plot.merge(df_plot_ctrl, how="left") - df_plot['control_norm_coverage'] = df_plot["control_coverage"] / df_plot["control_number_exons"] - # give 0 values a small pseudo-count so that fold change can be calculated - df_plot.loc[df_plot['control_norm_coverage'] == 0, ['control_norm_coverage']] = 0.000001 - df_plot['fold_change'] = df_plot["norm_coverage"] / df_plot["control_norm_coverage"] - - contingency_table = list(zip(df_plot['coverage'], df_plot['number_exons']-df_plot['coverage'], df_plot['control_coverage'], df_plot['control_number_exons'] - df_plot['control_coverage'])) - contingency_table = [ np.array(table).reshape(2,2) for table in contingency_table ] - df_plot['pvalue'] = [ stats.fisher_exact(table)[1] for table in contingency_table ] - - df_plot['-log10pvalue'] = np.log10(1/df_plot['pvalue']) + df_plot['control_norm_coverage'] = ( + df_plot["control_coverage"] / df_plot["control_number_exons"] + ) + + # Pseudo-count for zero values + df_plot.loc[ + 
df_plot['control_norm_coverage'] == 0, ['control_norm_coverage'] + ] = 0.000001 + df_plot['fold_change'] = ( + df_plot["norm_coverage"] / df_plot["control_norm_coverage"] + ) + + # Statistical test + contingency_table = list(zip( + df_plot['coverage'], + df_plot['number_exons'] - df_plot['coverage'], + df_plot['control_coverage'], + df_plot['control_number_exons'] - df_plot['control_coverage'] + )) + contingency_table = [np.array(table).reshape(2, 2) + for table in contingency_table] + df_plot['pvalue'] = [stats.fisher_exact(table)[1] + for table in contingency_table] + + df_plot['-log10pvalue'] = np.log10(1 / df_plot['pvalue']) df_plot['label'] = label - df_plot.loc[df_plot['fold_change'] < 1, ['-log10pvalue']] = df_plot['-log10pvalue'] * -1 - df_plot['-log10pvalue_smoothed'] = df_plot['-log10pvalue'].rolling(smoothing, center=True, win_type="gaussian").mean(std=2) + df_plot.loc[ + df_plot['fold_change'] < 1, ['-log10pvalue'] + ] = df_plot['-log10pvalue'] * -1 + df_plot['-log10pvalue_smoothed'] = df_plot['-log10pvalue'].rolling( + smoothing, center=True, win_type="gaussian" + ).mean(std=2) return df_plot, heatmap_plot + def set_legend_text(legend, exon_categories, original_counts=None): - """ - Set the legend text with optional subset information. 
- - Parameters: - ---------- - legend : matplotlib.legend.Legend - The legend object to modify - exon_categories : pandas.Series - Series containing category counts - original_counts : dict, optional - Dictionary with original counts before subsetting - """ - # Convert exon_categories to a DataFrame for easier handling - exon_cat = pd.DataFrame({'name': exon_categories.index, 'number_exons': exon_categories.values}) - - # Determine which categories are present + """Set legend text with optional subset information.""" + exon_cat = pd.DataFrame({ + 'name': exon_categories.index, + 'number_exons': exon_categories.values + }) categories = exon_cat['name'].unique() legend.set_bbox_to_anchor([1.08, 0.75]) legend.set_title("") - - # Track index in legend texts + legend_idx = 0 - - # Set text for each category in a specific order for category in ['constituitive', 'control', 'enhanced', 'silenced']: if category in categories: count = exon_cat[exon_cat['name'] == category]['number_exons'].values[0] text = f"{category.capitalize()} ({count}" - - # Add subset information if applicable if original_counts and category in original_counts: orig_count = original_counts[category] if orig_count > count: text += f", subset from {orig_count}" - text += ")" legend.texts[legend_idx].set_text(text) - legend_idx += 1 + legend_idx += 1 + def add_enrichment_marker(fig, ax): - """ - Add text-based enrichment/depletion marker with arrows on the right side of the figure, - centered around y=0, dynamically accounting for the position of y=0. 
- """ - # Get the y-axis limits of the current plot (ax) + """Add enrichment/depletion marker with arrows.""" y_min, y_max = ax.get_ylim() - - # Position of y=0 relative to the y-axis - y_zero_position = 0 # We're centering around y=0, no matter where it is on the y-axis - - # Calculate the height and position of the marker axis - marker_height = 0.4 # Adjust this based on your plot, it defines how tall the marker is - - # Calculate the normalized position of the marker to be centered around y=0 - normalized_bottom = (y_zero_position - y_min) / (y_max - y_min) - marker_height / 2 - - # Create a new axis on the right side of the figure - marker_ax = fig.add_axes([0.94, normalized_bottom, 0.06, marker_height], label=f"enrichment_marker_{ax.get_title()}") - - # Remove all axis elements + marker_height = 0.4 + normalized_bottom = (0 - y_min) / (y_max - y_min) - marker_height / 2 + + marker_ax = fig.add_axes( + [0.94, normalized_bottom, 0.06, marker_height], + label=f"enrichment_marker_{ax.get_title()}" + ) marker_ax.set_xticks([]) marker_ax.set_yticks([]) for spine in marker_ax.spines.values(): spine.set_visible(False) - - # Position the arrows with text centered at the middle of the marker - marker_ax.text(0.5, 0.5, "↑\nenriched\n\n\n↓\ndepleted", - ha='center', va='center', fontsize=11, - transform=marker_ax.transAxes) - - return marker_ax + marker_ax.text( + 0.5, 0.5, "↑\nenriched\n\n\n↓\ndepleted", + ha='center', va='center', fontsize=11, + transform=marker_ax.transAxes + ) + return marker_ax -def get_multivalency_scores(df, fai, window, genome_fasta, output_dir, name, type, germsdir): - """Return multivalency scores around df features extended by windows""" - df = df.loc[(df.name != ".") & (pd.notnull(df.name)) & (df.name != "None")] +def get_multivalency_scores(df, fai, window, genome_fasta, output_dir, + name, type, germsdir): + """Return multivalency scores around df features extended by windows.""" + df = df.loc[ + (df.name != ".") & (pd.notnull(df.name)) & 
(df.name != "None") + ] df = df[['chr', 'start', 'end', 'name', 'score', 'strand']] df.columns = ['chr', 'start', 'stop', 'name', 'score', 'strand'] - df['name'] = df['name'].apply(lambda x: (str(x) + "XX" + random.choice(list(string.ascii_lowercase)) + random.choice(list(string.ascii_lowercase)) + random.choice(list(string.ascii_lowercase)) + random.choice(list(string.ascii_lowercase)) + random.choice(list(string.ascii_lowercase)) + random.choice(list(string.ascii_lowercase)))) - pbt_df = pbt.BedTool.from_dataframe(df[['chr', 'start', 'stop', 'name', 'score', 'strand']]).sort().slop(l=2*window, r=2*window, s=True, g=fai) + df['name'] = df['name'].apply( + lambda x: (str(x) + "XX" + + ''.join(random.choice(string.ascii_lowercase) for _ in range(6))) + ) + pbt_df = pbt.BedTool.from_dataframe( + df[['chr', 'start', 'stop', 'name', 'score', 'strand']] + ).sort().slop(l=2 * window, r=2 * window, s=True, g=fai) logging.info("Number of sites: " + str(len(pbt_df))) - pbts = pbt.BedTool.filter(pbt_df, lambda x: len(x) == (4*window) + 1).saveas() - logging.info("Number of seqs considered after filtering those that run off the end of chroms: " + str(len(pbts))) - pbts.sequence(fi=genome_fasta,name=True).save_seqs(f'{output_dir}/{name}_{type}_temp.fa') + pbts = pbt.BedTool.filter( + pbt_df, lambda x: len(x) == (4 * window) + 1 + ).saveas() + logging.info( + "Number of seqs after filtering off-chrom: " + str(len(pbts)) + ) + pbts.sequence(fi=genome_fasta, name=True).save_seqs( + f'{output_dir}/{name}_{type}_temp.fa' + ) logging.info("Running germs to calculate multivalency scores...") - os.system("RScript --vanilla " + germsdir + "/germs.R -f " + f'{output_dir}/{name}_{type}_temp.fa' + " -w 100 -s 20") + os.system( + "RScript --vanilla " + germsdir + "/germs.R -f " + + f'{output_dir}/{name}_{type}_temp.fa' + " -w 100 -s 20" + ) os.system("gunzip -f *multivalency.tsv.gz") - mdf = pd.read_csv(f'{output_dir}/{name}_{type}_temp_5_101_21.multivalency.tsv', sep='\t', header=0) + mdf 
= pd.read_csv( + f'{output_dir}/{name}_{type}_temp_5_101_21.multivalency.tsv', + sep='\t', header=0 + ) os.system(f'rm {output_dir}/{name}_{type}_temp_5_101_21.multivalency.tsv') os.system(f'rm {output_dir}/{name}_{type}_temp.fa') - mdf['position'] = np.tile(np.arange(0, 4*window - 3), len(pbts)) - - mdf[['exon_type','label','roname']] = mdf['sequence_name'].str.split(r'XX|_',expand=True) + mdf['position'] = np.tile(np.arange(0, 4 * window - 3), len(pbts)) + mdf[['exon_type', 'label', 'roname']] = mdf['sequence_name'].str.split( + r'XX|_', expand=True + ) - # GET MOST CONTRIBUTING KMERS - # Filter for 'exon_type' being 'enhanced' or 'silenced' + # Top contributing kmers filtered_mdf = mdf[mdf['exon_type'].isin(['enhanced', 'silenced'])] - # Separate dataframes for enhanced and silenced exon_types enhanced_df = filtered_mdf[filtered_mdf['exon_type'] == 'enhanced'] silenced_df = filtered_mdf[filtered_mdf['exon_type'] == 'silenced'] - # Function to calculate top 10 kmers for a given DataFrame + def calculate_top_kmers(df): kmer_scores = df.groupby('kmer')['smoothed_kmer_multivalency'].sum() top_kmers = kmer_scores.nlargest(5).index return df[df['kmer'].isin(top_kmers)] - # Calculate top 10 kmers for enhanced and silenced exon_types separately + top_kmers_enhanced = calculate_top_kmers(enhanced_df) top_kmers_silenced = calculate_top_kmers(silenced_df) - - # Combine the top kmers for enhanced and silenced into one table top_kmers_df = pd.concat([top_kmers_enhanced, top_kmers_silenced]) - mdf = mdf.groupby(['exon_type','position'], as_index=False).agg({'smoothed_kmer_multivalency':'mean'}).reset_index() - top_kmers_df = top_kmers_df.groupby(['position','exon_type','kmer'], as_index=False).agg({'smoothed_kmer_multivalency':'mean'}).reset_index() + mdf = mdf.groupby( + ['exon_type', 'position'], as_index=False + ).agg({'smoothed_kmer_multivalency': 'mean'}).reset_index() + top_kmers_df = top_kmers_df.groupby( + ['position', 'exon_type', 'kmer'], as_index=False + 
).agg({'smoothed_kmer_multivalency': 'mean'}).reset_index() mdf['type'] = type top_kmers_df['type'] = type - print(mdf.head()) - print(top_kmers_df.head()) + return mdf, top_kmers_df + + +# =============================================================================== +# SHARED PLOTTING FUNCTIONS +# =============================================================================== + +def plot_exon_lengths(df_rmats, output_dir, FILEname): + """Generate exon length box plots.""" + df_rmats["regulated_exon_length"] = ( + df_rmats['exonEnd'] - df_rmats['exonStart_0base'] + ) + df_rmats["first_exon_length"] = ( + df_rmats['upstreamEE'] - df_rmats['upstreamES'] + ) + df_rmats["second_exon_length"] = ( + df_rmats['downstreamEE'] - df_rmats['downstreamES'] + ) + df_rmats.loc[df_rmats.strand == '+', 'upstream_exon_length'] = \ + df_rmats["first_exon_length"] + df_rmats.loc[df_rmats.strand == '-', 'upstream_exon_length'] = \ + df_rmats["second_exon_length"] + df_rmats.loc[df_rmats.strand == '+', 'downstream_exon_length'] = \ + df_rmats["second_exon_length"] + df_rmats.loc[df_rmats.strand == '-', 'downstream_exon_length'] = \ + df_rmats["first_exon_length"] + + exon_length_df = df_rmats[[ + "regulated_exon_length", "upstream_exon_length", + "downstream_exon_length", "category" + ]] + exon_length_df = exon_length_df.melt( + id_vars=["category"], var_name="exon_type", value_name="exon_length" + ) + + palette_exon_len = [ + colors_dict['ctrl'], colors_dict['const'], colors_dict['enh'], + colors_dict['enhrest'], colors_dict['sil'], colors_dict['silrest'], + colors_dict['all'] + ] + + sns.set(rc={'figure.figsize': (15, 5)}) + sns.set_style("whitegrid") + g = sns.catplot( + data=exon_length_df, x='category', y='exon_length', col='exon_type', + kind='box', col_wrap=3, showfliers=False, + col_order=["upstream_exon_length", "regulated_exon_length", + "downstream_exon_length"], + order=["control", "constituitive", "enhanced", "enhanced_rest", + "silenced", "silenced_rest"], + 
palette=palette_exon_len, hue='category', legend=False + ) + titles = ["Upstream Exon", "Middle Exon", "Downstream exon"] + for ax, title in zip(g.axes.flat, titles): + ax.set_title(title) + g.set_xticklabels(rotation=45) + g.set(xlabel=None) + g.axes[0].set_ylabel('Exon length (bp)') + plt.tight_layout() + plt.savefig(f'{output_dir}/{FILEname}_exon_length.pdf') + pbt.helpers.cleanup() + logging.info(f"Saved exon length plot to {output_dir}/{FILEname}_exon_length.pdf") + + +def plot_heatmap(heat_df, exon_categories, window, all_sites, + output_dir, FILEname): + """Generate per-exon heatmap from binary coverage data.""" + # Total exons covered table + grouped_heat_df = heat_df.groupby( + ['exon_id', 'label', 'name'] + )['coverage'].sum().reset_index() + filtered_heat_df = grouped_heat_df[grouped_heat_df['coverage'] > 0] + count_heat_df = filtered_heat_df.groupby( + ['label', 'name'] + )['exon_id'].nunique().reset_index() + count_heat_df.rename(columns={'exon_id': 'exon_count'}, inplace=True) + final_heat_df = count_heat_df.pivot( + index='name', columns='label', values='exon_count' + ).fillna(0).reset_index() + exon_categories_df = exon_categories.reset_index() + exon_categories_df.columns = ['name', 'total_exons_after_subsetting'] + final_heat_df = final_heat_df.merge(exon_categories_df, on='name', how='left') + final_heat_df.to_csv( + f'{output_dir}/{FILEname}_totalExonsCovered.tsv', sep="\t", index=False + ) + + # Binarise and smooth + df = heat_df.copy() + df['coverage'] = (df['coverage'] > 0).astype(int) + df = smooth_coverage(df) + + if not all_sites: + labels = ['upstream_5ss', 'middle_3ss', 'middle_5ss', 'downstream_3ss'] + else: + labels = ['upstream_3ss', 'upstream_5ss', 'middle_3ss', 'middle_5ss', + 'downstream_3ss', 'downstream_5ss'] + + # Total signal per exon + exon_totals = df.groupby('exon_id')['coverage'].sum().reset_index() + exon_totals.columns = ['exon_id', 'total_signal'] + + # Remove exons with no signal + exons_with_signal = 
exon_totals[exon_totals['total_signal'] > 0]['exon_id'] + df = df[df['exon_id'].isin(exons_with_signal)] + + exon_names = df[['exon_id', 'name']].drop_duplicates( + subset=['exon_id'] + ).set_index('exon_id')['name'] + + label_data = {} + for label in labels: + label_df = df[df['label'] == label] + if len(label_df) == 0: + continue + pivot = label_df.pivot_table( + index='exon_id', columns='position', + values='coverage', fill_value=0 + ) + label_data[label] = pivot + + # Common exons + common_exons = set() + first = True + for label, pivot in label_data.items(): + if first: + common_exons = set(pivot.index) + first = False + else: + common_exons = common_exons.union(set(pivot.index)) + + if len(common_exons) == 0: + logging.info("No exons with signal for heatmap — skipping") + return + + exon_info = exon_totals.set_index('exon_id').loc[list(common_exons)] + exon_info['name'] = exon_names.loc[exon_info.index] + exon_info = exon_info.sort_values(['name', 'total_signal'], ascending=[True, False]) + sorted_exon_ids = exon_info.index.tolist() + + # Set up figure + width = max(15, len(labels) * 4) + height = max(3, len(sorted_exon_ids) * 0.002) + fig = plt.figure(figsize=(width, height)) + fig.patch.set_alpha(0.0) + + gs = GridSpec(1, len(labels) + 1, + width_ratios=[1] + [3] * len(labels), figure=fig) + + # Name labels column + ax_names = fig.add_subplot(gs[0, 0]) + ax_names.patch.set_alpha(0.0) + + names = exon_info['name'].values + unique_names = sorted(set(names)) + color_palette = plt.cm.tab10.colors[:len(unique_names)] + name_colors = {n: color_palette[i] for i, n in enumerate(unique_names)} + + name_matrix = np.zeros((len(sorted_exon_ids), 1)) + name_cmap = LinearSegmentedColormap.from_list( + 'name_cmap', [(1, 1, 1)] + list(color_palette), N=len(unique_names) + 1 + ) + for i, name in enumerate(names): + name_matrix[i, 0] = unique_names.index(name) + 1 + + sns.heatmap(name_matrix, ax=ax_names, cmap=name_cmap, cbar=False, + linewidths=0, rasterized=True) + + # 
Name group labels + name_groups = {} + current_name = None + start_idx = 0 + for i, name in enumerate(names): + if name != current_name: + if current_name is not None: + name_groups[current_name] = (start_idx, i - 1) + current_name = name + start_idx = i + if current_name is not None: + name_groups[current_name] = (start_idx, len(names) - 1) + + for name, (start, end) in name_groups.items(): + middle = (start + end) / 2 + ax_names.text(0.5, middle, name, + fontsize=10, fontweight='bold', ha='center', va='center', + color='black') + + ax_names.set_title('Name') + ax_names.set_xticks([]) + ax_names.set_yticks([]) + + # Plot each region + for i, label in enumerate(labels): + if label not in label_data: + ax = fig.add_subplot(gs[0, i + 1]) + ax.set_facecolor('none') + ax.text(0.5, 0.5, f"No data for {label}", ha='center', va='center') + ax.set_xticks([]) + ax.set_yticks([]) + ax.set_title(label) + continue - # mdf = mdf.loc[(mdf.label != ".") & (pd.notnull(mdf.label)) & (mdf.label != "None")] - # top_kmers_df = top_kmers_df.loc[(top_kmers_df.label != ".") & (pd.notnull(top_kmers_df.label)) & (top_kmers_df.label != "None")] + pivot = label_data[label] + position_cols = sorted(pivot.columns) - return mdf,top_kmers_df + if '3ss' in label: + min_pos, max_pos = 0, window + 50 + else: + min_pos, max_pos = window - 50, window * 2 + + position_cols = [pos for pos in position_cols if min_pos <= pos <= max_pos] + + display_matrix = np.zeros((len(sorted_exon_ids), len(position_cols))) + for j, exon_id in enumerate(sorted_exon_ids): + if exon_id in pivot.index: + for k, pos in enumerate(position_cols): + if pos in pivot.columns: + display_matrix[j, k] = pivot.loc[exon_id, pos] + + ax = fig.add_subplot(gs[0, i + 1]) + ax.set_facecolor('none') + sns.heatmap(display_matrix, ax=ax, cmap=colormaps['viridis'], + cbar=False, linewidths=0, rasterized=True) + + ax.set_title(label) + ax.set_xticks([]) + ax.set_xticklabels([]) + ax.xaxis.set_visible(False) + ax.set_yticks([]) + + for name, 
(start, end) in name_groups.items(): + if end < len(sorted_exon_ids) - 1: + ax.axhline(y=end + 1, color='white', linewidth=2, alpha=1) + + plt.tight_layout(rect=[0, 0, 0.95, 0.95]) + plt.savefig(f'{output_dir}/{FILEname}_heatmap.pdf', + dpi=300, bbox_inches='tight') + logging.info(f"Saved heatmap to {output_dir}/{FILEname}_heatmap.pdf") + + +def plot_rna_map(plotting_df, exon_categories, original_counts, + window, all_sites, output_dir, FILEname): + """Generate the main RNA map -log10(pvalue) line plots.""" + sns.set(rc={'figure.figsize': (7, 5)}) + sns.set_style("whitegrid") + + if not all_sites: + col_order = ["upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss"] + titles = ["Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS"] + col_wrap = 4 + else: + col_order = ["upstream_3ss", "upstream_5ss", "middle_3ss", "middle_5ss", + "downstream_3ss", "downstream_5ss"] + titles = ["Upstream 3'SS", "Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", + "Downstream 3'SS", "Downstream 5'SS"] + col_wrap = 6 + + g = sns.relplot( + data=plotting_df, x='position', y='-log10pvalue_smoothed', + hue='name', col='label', facet_kws={"sharex": False}, + kind='line', col_wrap=col_wrap, height=5, aspect=4 / 5, + col_order=col_order + ) + + for ax, title in zip(g.axes.flat, titles): + ax.set_title(title) + ax.axhline(y=0, color='k', alpha=0.2, linewidth=0.5) + fig = plt.gcf() + marker_ax = add_enrichment_marker(fig, ax) + + g.set(xlabel='') + g.axes[0].set_ylabel('-log10(p value) enrichment / control') + + sns.move_legend( + g, "upper right", + bbox_to_anchor=(1, 2), + ncol=1, title=None, frameon=False + ) + leg = g._legend + set_legend_text(leg, exon_categories, original_counts) + + # Exon-intron drawings + rect_fraction = 1 / ((window + 50) / 50) + + for i, ss_type in enumerate(col_order): + ax = g.axes[i] + is_middle = ss_type.startswith('middle_') + exon_color = "midnightblue" if is_middle else "slategrey" + + if ss_type.endswith('_3ss'): + ax.set_xlim([0, window + 
50]) + ticks = np.arange(0, window + 51, 50) + labels = ["" if t in (ticks[0], ticks[-1]) + else str(int(t - window)) for t in ticks] + ax.set_xticks(ticks) + ax.set_xticklabels(labels) + + rect = matplotlib.patches.Rectangle( + xy=(1 - rect_fraction, -0.2), width=rect_fraction, height=.1, + color=exon_color, alpha=1, + transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + rect = matplotlib.patches.Rectangle( + xy=(0, -0.15), width=1 - rect_fraction, height=.001, + color="slategrey", alpha=1, + transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + else: + ax.set_xlim([window - 50, window * 2]) + ticks = np.arange(window - 50, window * 2 + 1, 50) + labels = ["" if t in (ticks[0], ticks[-1]) + else str(int(t - window)) for t in ticks] + ax.set_xticks(ticks) + ax.set_xticklabels(labels) + + rect = matplotlib.patches.Rectangle( + xy=(0, -0.2), width=rect_fraction, height=.1, + color=exon_color, alpha=1, + transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + rect = matplotlib.patches.Rectangle( + xy=(rect_fraction, -0.15), width=1 - rect_fraction, height=.001, + color="slategrey", alpha=1, + transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + + plt.subplots_adjust(wspace=0.05) + plt.savefig( + f'{output_dir}/{FILEname}_RNAmap_-log10pvalue.pdf', + bbox_extra_artists=([leg, rect, marker_ax]), + bbox_inches='tight', pad_inches=0.8 + ) + logging.info(f"Saved RNA map to {output_dir}/{FILEname}_RNAmap_-log10pvalue.pdf") + pbt.helpers.cleanup() + + +def plot_multivalency(middle_3ss_bed, middle_5ss_bed, downstream_3ss_bed, + upstream_5ss_bed, downstream_5ss_bed, upstream_3ss_bed, + fai, window, genome_fasta, output_dir, FILEname, + germsdir, all_sites, exon_categories, original_counts): + """Run multivalency analysis and generate plots.""" + rect_fraction = 1 / ((window + 50) / 50) + + middle_3ss_mdf = get_multivalency_scores( + middle_3ss_bed, fai, window, genome_fasta, output_dir, + FILEname, 'middle_3ss', germsdir) + middle_5ss_mdf = 
get_multivalency_scores( + middle_5ss_bed, fai, window, genome_fasta, output_dir, + FILEname, 'middle_5ss', germsdir) + downstream_3ss_mdf = get_multivalency_scores( + downstream_3ss_bed, fai, window, genome_fasta, output_dir, + FILEname, 'downstream_3ss', germsdir) + upstream_5ss_mdf = get_multivalency_scores( + upstream_5ss_bed, fai, window, genome_fasta, output_dir, + FILEname, 'upstream_5ss', germsdir) + + if all_sites: + downstream_5ss_mdf = get_multivalency_scores( + downstream_5ss_bed, fai, window, genome_fasta, output_dir, + FILEname, 'downstream_5ss', germsdir) + upstream_3ss_mdf = get_multivalency_scores( + upstream_3ss_bed, fai, window, genome_fasta, output_dir, + FILEname, 'upstream_3ss', germsdir) + + a = middle_3ss_mdf[0] + b = middle_5ss_mdf[0] + c = downstream_3ss_mdf[0] + f = upstream_5ss_mdf[0] + + if not all_sites: + plotting_df = pd.concat([f, a, b, c]) + mv_col_order = ["upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss"] + mv_titles = ["Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS"] + mv_col_wrap = 4 + else: + d = downstream_5ss_mdf[0] + e = upstream_3ss_mdf[0] + plotting_df = pd.concat([a, b, c, d, e, f]) + mv_col_order = ["upstream_3ss", "upstream_5ss", "middle_3ss", "middle_5ss", + "downstream_3ss", "downstream_5ss"] + mv_titles = ["Upstream 3'SS", "Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", + "Downstream 3'SS", "Downstream 5'SS"] + mv_col_wrap = 6 + + plotting_df.to_csv( + f'{output_dir}/{FILEname}_RNAmap_multivalency.tsv', sep="\t" + ) + logging.info(f"Saved multivalency TSV") + + # --- Main multivalency plot --- + plt.figure() + sns.set_style("whitegrid") + + g = sns.relplot( + data=plotting_df, x='position', y='smoothed_kmer_multivalency', + hue='exon_type', col='type', facet_kws={"sharex": False}, + kind='line', col_wrap=mv_col_wrap, height=5, aspect=3.5 / 5, + errorbar=None, col_order=mv_col_order + ) + + for ax, title in zip(g.axes.flat, mv_titles): + ax.set_title(title) + g.set(xlabel='') + 
g.axes[0].set_ylabel('mean smoothed kmer multivalency') + leg = g._legend + set_legend_text(leg, exon_categories, original_counts) + + _apply_mv_axis_formatting(g, mv_col_order, window, rect_fraction) + + plt.subplots_adjust(wspace=0.01) + plt.savefig( + f'{output_dir}/{FILEname}_RNAmap_multivalency.pdf', + bbox_extra_artists=([leg]), + bbox_inches='tight', pad_inches=0.5 + ) + pbt.helpers.cleanup() + logging.info(f"Saved multivalency plot") + + # --- Kmer plots --- + a = middle_3ss_mdf[1] + b = middle_5ss_mdf[1] + c = downstream_3ss_mdf[1] + f = upstream_5ss_mdf[1] + + if not all_sites: + plotting_df = pd.concat([f, a, b, c]) + else: + d = downstream_5ss_mdf[1] + e = upstream_3ss_mdf[1] + plotting_df = pd.concat([a, b, c, d, e, f]) + + plotting_df.to_csv( + f'{output_dir}/{FILEname}_RNAmap_TOP10KMER_multivalency.tsv', sep="\t" + ) + + # Silenced kmer plot + _plot_kmer_multivalency( + plotting_df[plotting_df['exon_type'] == 'silenced'], + mv_col_order, mv_titles, mv_col_wrap, window, rect_fraction, + output_dir, FILEname, 'silenced' + ) + + # Enhanced kmer plot + _plot_kmer_multivalency( + plotting_df[plotting_df['exon_type'] == 'enhanced'], + mv_col_order, mv_titles, mv_col_wrap, window, rect_fraction, + output_dir, FILEname, 'enhanced' + ) + + +def _apply_mv_axis_formatting(g, mv_col_order, window, rect_fraction): + """Apply exon-intron axis formatting for multivalency plots.""" + for i, ss_type in enumerate(mv_col_order): + ax = g.axes[i] + if i == 0: + ax.set_ylim(ymin=1) + + is_middle = ss_type.startswith('middle_') + exon_color = "midnightblue" if is_middle else "slategrey" + + if ss_type.endswith('_3ss'): + ax.set_xlim([window, (2 * window) + 50]) + ticks = np.arange(window, 2 * window + 50, 50) + labels = ["" if t == ticks[0] else str(int(t - window)) for t in ticks] + ax.set_xticks(ticks) + ax.set_xticklabels(labels) + rect = matplotlib.patches.Rectangle( + xy=(1 - rect_fraction, -0.2), width=rect_fraction, height=.1, + color=exon_color, 
alpha=1, transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + rect = matplotlib.patches.Rectangle( + xy=(0, -0.15), width=1 - rect_fraction, height=.001, + color="slategrey", alpha=1, transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + else: + ax.set_xlim([2 * window - 50, 3 * window]) + ticks = np.arange(2 * window - 50, 3 * window, 50) + labels = ["" if t == ticks[0] else str(int(t - 2 * window)) for t in ticks] + ax.set_xticks(ticks) + ax.set_xticklabels(labels) + rect = matplotlib.patches.Rectangle( + xy=(0, -0.2), width=rect_fraction, height=.1, + color=exon_color, alpha=1, transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + rect = matplotlib.patches.Rectangle( + xy=(rect_fraction, -0.15), width=1 - rect_fraction, height=.001, + color="slategrey", alpha=1, transform=ax.transAxes, clip_on=False) + ax.add_artist(rect) + + +def _plot_kmer_multivalency(filtered_df, mv_col_order, mv_titles, + mv_col_wrap, window, rect_fraction, + output_dir, FILEname, exon_type): + """Plot kmer multivalency for a given exon type.""" + plt.figure() + sns.set_style("whitegrid") + g = sns.relplot( + data=filtered_df, x='position', y='smoothed_kmer_multivalency', + hue='kmer', col='type', facet_kws={"sharex": False}, + kind='line', col_wrap=mv_col_wrap, height=5, errorbar=None, + aspect=3.5 / 5, col_order=mv_col_order + ) + for ax, title in zip(g.axes.flat, mv_titles): + ax.set_title(title) + g.set(xlabel='') + g.axes[0].set_ylabel('mean smoothed kmer multivalency') + leg = g._legend + + _apply_mv_axis_formatting(g, mv_col_order, window, rect_fraction) + + plt.subplots_adjust(wspace=0.01) + plt.savefig( + f'{output_dir}/{FILEname}_RNAmap_{exon_type}KMER_multivalency.pdf', + bbox_extra_artists=([leg]), + bbox_inches='tight', pad_inches=0.5 + ) + pbt.helpers.cleanup() + logging.info(f"Saved {exon_type} kmer multivalency plot") + + +# =============================================================================== +# MAIN PIPELINE +# 
=============================================================================== + +def run_rna_map(args): + """ + Main RNA map pipeline. Handles both input modes with shared downstream logic. + """ + output_dir = args.outputpath + os.makedirs(output_dir, exist_ok=True) -def run_rna_map(de_file, xl_bed, genome_fasta, fai, window, smoothing, - min_ctrl, max_ctrl, max_inclusion, max_fdr, max_enh, min_sil, output_dir, multivalency, germsdir, no_constitutive, no_subset, all_sites, prefix - #n_exons = 150, n_samples = 300, z_test=False - ): - FILEname = prefix + "_" + de_file.split('/')[-1].replace('.txt', '').replace('.gz', '') - df_fai = pd.read_csv(fai, sep='\t', header=None) - chroms = set(df_fai[0].values) - rmats = pd.read_csv(de_file, sep='\t') + log_filename, start_time, logger = setup_logging(output_dir) + logging.info(f"Log file created: {log_filename}") + logging.info(f"Arguments: {args}") + + try: + # Set random seed for reproducibility + np.random.seed(args.seed) + logging.info(f"Random seed set to {args.seed}") + + # Load chromosome list + df_fai = pd.read_csv(args.fastaindex, sep='\t', header=None) + chroms = set(df_fai[0].values) + + # ============================================================== + # MODE SELECTION: Two tracks, one output format + # ============================================================== + + if args.inputsplice: + # ----- TRACK 1: rMATS ----- + input_mode = 'rmats' + df_rmats = load_rmats_data( + args.inputsplice, + args.minctrl, args.maxctrl, args.maxincl, + args.maxfdr, args.maxenh, args.minsil, + chroms, args.no_constitutive + ) + + if args.prefix: + FILEname = (args.prefix + "_" + + args.inputsplice.split('/')[-1] + .replace('.txt', '').replace('.gz', '')) + else: + FILEname = (args.inputsplice.split('/')[-1] + .replace('.txt', '').replace('.gz', '')) - if 'exonStart_0base' in rmats.columns: - rmats = rmats[rmats['chr'].isin(chroms)] - rmats['inclusion'] = (rmats.IncLevel1.str.split(',') + rmats.IncLevel2.str.split(',')) - # 
code below for mean inclusion - # rmats['inclusion'] = rmats['inclusion'].apply(lambda x: sum([float(y) for y in x if y != 'NA']) / len(x)) - # replaces with max inclusion - rmats['inclusion'] = rmats['inclusion'].apply(lambda x: max([float(y) for y in x if y != 'NA'])) - df_rmats = rmats.loc[ : ,['chr', 'exonStart_0base', 'exonEnd', 'FDR', 'IncLevelDifference', 'strand', 'inclusion', - 'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE']].rename( - columns={'IncLevelDifference': 'dPSI', 'inclusion':'maxPSI'}).reset_index() - - - # to deduplicate, first select the most extreme dPSI value for every exon (keep ties, they will be resolved by the hierarchy) - mask = df_rmats.groupby(['chr', 'exonStart_0base', 'exonEnd', 'strand'])['dPSI'] \ - .transform(lambda x: abs(x).rank(ascending=False)) < 2 - df_rmats = df_rmats[mask] - - - # then apply hierarchy to decide which category exons belong to - if no_constitutive: - conditions = [ - (df_rmats["dPSI"].gt(min_sil) & df_rmats["FDR"].lt(max_fdr)), # silenced - (df_rmats["dPSI"].lt(max_enh) & df_rmats["FDR"].lt(max_fdr)), # enhanced - (df_rmats["dPSI"].gt(min_ctrl) & df_rmats["dPSI"].lt(max_ctrl))# control - ] - choices = ["silenced", "enhanced", "control"] else: - conditions = [ - (df_rmats["dPSI"].gt(min_sil) & df_rmats["FDR"].lt(max_fdr)), # silenced - (df_rmats["dPSI"].lt(max_enh) & df_rmats["FDR"].lt(max_fdr)), # enhanced - (df_rmats["dPSI"].gt(min_ctrl) & df_rmats["dPSI"].lt(max_ctrl) & df_rmats["maxPSI"].gt(max_inclusion)), # constituitive - (df_rmats["dPSI"].gt(min_ctrl) & df_rmats["dPSI"].lt(max_ctrl))# control - ] - choices = ["silenced", "enhanced", "constituitive", "control"] + # ----- TRACK 2: VastDB ----- + input_mode = 'vastdb' + df_rmats = load_vastdb_data( + args.vastdb_enhanced, + args.vastdb_silenced, + args.vastdb_control, + args.vastdb_constitutive, + args.vastdb_annotation, + chroms + ) - df_rmats["category"] = np.select(conditions, choices, default=None) + if args.prefix: + FILEname = 
args.prefix + else: + FILEname = "VastDB_RNAmap" + + # ============================================================== + # SHARED PIPELINE: Same for both modes from here on + # ============================================================== + + # Filter to valid chromosomes + df_rmats = df_rmats[df_rmats['chr'].isin(chroms)] - df_rmats.to_csv(f'{output_dir}/{FILEname}_RMATS_with_categories.tsv', sep="\t") + # Remove exons with missing flanking coordinates + logging.info("\nFiltering exons with complete flanking coordinates...") + before = len(df_rmats) + df_rmats = df_rmats.dropna( + subset=['upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE'] + ) + after = len(df_rmats) + logging.info(f"Removed {before - after} exons with missing flanking coordinates") + logging.info(f"Remaining: {after} exons") exon_categories = df_rmats.groupby('category').size() - #exon_categories.columns = ["name", "exon_number"] - logging.info("Exons in each category:") + logging.info("\nExons in each category:") logging.info(exon_categories) - logging.info("Total categorised deduplicated exons: " + str(df_rmats.shape[0])) - - ### Some warning messages ### - if exon_categories.loc["control"] == 0: - logging.info("Warning! There are no control exons. Try changing thresholds or input file and run again.") - sys.exit() - - if exon_categories.loc["enhanced"] == 0 and exon_categories.loc["silenced"] == 0: - logging.info('Warning! 
There are no regulated exons, try changing filtering parameters or file and run again.') - sys.exit() - - # Apply subsetting for the plotting (unless disabled with --no_subset) - if not no_subset: - # Get the count of each category - category_counts = df_rmats['category'].value_counts() - # Store original counts - original_counts = {cat: count for cat, count in category_counts.items()} - - # Find the maximum count of enhanced or silenced - target_count = 0 - if 'enhanced' in category_counts and 'silenced' in category_counts: - target_count = max(category_counts['enhanced'], category_counts['silenced']) - elif 'enhanced' in category_counts: - target_count = category_counts['enhanced'] - elif 'silenced' in category_counts: - target_count = category_counts['silenced'] - - # Subset control exons - if 'control' in category_counts and category_counts['control'] > target_count > 0: - control_indices = df_rmats[df_rmats['category'] == 'control'].index - # Randomly select indices to keep - control_indices_to_keep = np.random.choice(control_indices, target_count, replace=False) - # Create a mask for rows to drop - drop_mask = df_rmats.index.isin(control_indices) & ~df_rmats.index.isin(control_indices_to_keep) - # Drop the rows - df_rmats = df_rmats[~drop_mask] - logging.info(f"Randomly subsetted control exons from {category_counts['control']} to {target_count}") - - # Subset constitutive exons if they exist and not excluded - if not no_constitutive and 'constituitive' in category_counts and category_counts['constituitive'] > target_count > 0: - const_indices = df_rmats[df_rmats['category'] == 'constituitive'].index - # Randomly select indices to keep - const_indices_to_keep = np.random.choice(const_indices, target_count, replace=False) - # Create a mask for rows to drop - drop_mask = df_rmats.index.isin(const_indices) & ~df_rmats.index.isin(const_indices_to_keep) - # Drop the rows - df_rmats = df_rmats[~drop_mask] - logging.info(f"Randomly subsetted constitutive exons from 
{category_counts['constituitive']} to {target_count}") + + # Validate categories + if "control" not in exon_categories or exon_categories.loc["control"] == 0: + logging.error("No control exons found!") + sys.exit(1) + + if ("enhanced" not in exon_categories + and "silenced" not in exon_categories): + logging.error("No regulated exons found!") + sys.exit(1) + + # Apply subsetting + if not args.no_subset: + df_rmats, original_counts = apply_subsetting( + df_rmats, args.no_constitutive + ) else: - logging.info("Subsetting disabled (--no_subset flag provided)") - # Still need original_counts for legend (no subsetting means original = current) + logging.info("Subsetting disabled (--no_subset flag)") category_counts = df_rmats['category'].value_counts() original_counts = {cat: count for cat, count in category_counts.items()} exon_categories = df_rmats.groupby('category').size() - ####### Exon lengths ####### - df_rmats["regulated_exon_length"] = df_rmats['exonEnd'] - df_rmats['exonStart_0base'] - df_rmats["upstream_exon_length"] = df_rmats['upstreamEE'] - df_rmats['upstreamES'] - df_rmats["downstream_exon_length"] = df_rmats['downstreamEE'] - df_rmats['downstreamES'] - - exon_length_df = df_rmats[["regulated_exon_length","upstream_exon_length","downstream_exon_length","category"]] - - exon_length_df = exon_length_df .melt(id_vars=["category"], var_name="exon_type", value_name="exon_length") - - palette_exon_len = [colors_dict['ctrl'], colors_dict['const'], colors_dict['enh'], - colors_dict['enhrest'], colors_dict['sil'], colors_dict['silrest'], colors_dict['all']] - - sns.set(rc={'figure.figsize':(15, 5)}) - sns.set_style("whitegrid") - g = sns.catplot(data=exon_length_df, x='category', y='exon_length',col='exon_type', - kind='box', col_wrap=3, showfliers=False, - col_order=["upstream_exon_length","regulated_exon_length","downstream_exon_length"], - order=["control","constituitive","enhanced","enhanced_rest","silenced","silenced_rest"], - palette = palette_exon_len, 
hue='category', legend=False) - titles = ["Upstream Exon", "Middle Exon", "Downstream exon"] - for ax, title in zip(g.axes.flat, titles): - ax.set_title(title) - g.set_xticklabels(rotation=45) - g.set(xlabel=None) - g.axes[0].set_ylabel('Exon length (bp)') - plt.tight_layout() - plt.savefig(f'{output_dir}/{FILEname}_exon_length.pdf') - pbt.helpers.cleanup() - - - ### The coverage plot ### - - middle_3ss_bed = get_ss_bed(df_rmats,'exonStart_0base','exonEnd') - middle_5ss_bed = get_ss_bed(df_rmats,'exonEnd','exonStart_0base') - downstream_3ss_bed = get_ss_bed(df_rmats,'downstreamES','downstreamEE') - upstream_5ss_bed = get_ss_bed(df_rmats,'upstreamEE','upstreamES') - if all_sites: - downstream_5ss_bed = get_ss_bed(df_rmats,'downstreamEE','downstreamES') - upstream_3ss_bed = get_ss_bed(df_rmats,'upstreamES','upstreamEE') - - if xl_bed is not None: - middle_3ss = get_coverage_plot(xl_bed, middle_3ss_bed, fai, window, exon_categories, 'middle_3ss') - middle_5ss = get_coverage_plot(xl_bed, middle_5ss_bed, fai, window, exon_categories, 'middle_5ss') - downstream_3ss = get_coverage_plot(xl_bed, downstream_3ss_bed, fai, window, exon_categories, 'downstream_3ss') - upstream_5ss = get_coverage_plot(xl_bed, upstream_5ss_bed, fai, window, exon_categories, 'upstream_5ss') - if all_sites: - downstream_5ss = get_coverage_plot(xl_bed, downstream_5ss_bed, fai, window, exon_categories, 'downstream_5ss') - upstream_3ss = get_coverage_plot(xl_bed, upstream_3ss_bed, fai, window, exon_categories, 'upstream_3ss') + # Save categorised exons + if input_mode == 'rmats': + save_cols = ['chr', 'exonStart_0base', 'exonEnd', 'strand', 'category', + 'FDR', 'dPSI', 'maxPSI', + 'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE'] + save_cols = [c for c in save_cols if c in df_rmats.columns] + suffix = '_RMATS_with_categories.tsv' + else: + save_cols = ['chr', 'exonStart_0base', 'exonEnd', 'strand', 'category', + 'EVENT', 'GENE', + 'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE'] + 
save_cols = [c for c in save_cols if c in df_rmats.columns] + suffix = '_VastDB_with_categories.tsv' + + df_rmats[save_cols].to_csv( + f'{output_dir}/{FILEname}{suffix}', sep="\t", index=False + ) + logging.info(f"Saved categorised exons to {output_dir}/{FILEname}{suffix}") + + # Exon length plots + plot_exon_lengths(df_rmats.copy(), output_dir, FILEname) + + # ============================================================== + # CREATE BED FILES FOR 6 SPLICE SITE REGIONS + # ============================================================== + # After strand correction at load time, upstream/downstream are + # always in transcript order. Same-exon pairing is correct for + # both modes. + # ============================================================== + logging.info("\n" + "=" * 60) + logging.info("CREATING BED FILES FOR SPLICE SITES") + logging.info("=" * 60) + + middle_3ss_bed = get_ss_bed(df_rmats, 'exonStart_0base', 'exonEnd') + middle_5ss_bed = get_ss_bed(df_rmats, 'exonEnd', 'exonStart_0base') + downstream_3ss_bed = get_ss_bed(df_rmats, 'downstreamES', 'downstreamEE') + upstream_5ss_bed = get_ss_bed(df_rmats, 'upstreamEE', 'upstreamES') + + downstream_5ss_bed = None + upstream_3ss_bed = None + if args.all_sites: + downstream_5ss_bed = get_ss_bed(df_rmats, 'downstreamEE', 'downstreamES') + upstream_3ss_bed = get_ss_bed(df_rmats, 'upstreamES', 'upstreamEE') + + # ============================================================== + # CALCULATE COVERAGE + # ============================================================== + if args.inputxlsites is not None: + logging.info("\n" + "=" * 60) + logging.info("CALCULATING COVERAGE") + logging.info("=" * 60) + + fai = args.fastaindex + xl_bed = args.inputxlsites + window = args.window + smoothing = args.smoothing + + middle_3ss = get_coverage_plot( + xl_bed, middle_3ss_bed, fai, window, exon_categories, + 'middle_3ss', smoothing) + middle_5ss = get_coverage_plot( + xl_bed, middle_5ss_bed, fai, window, exon_categories, + 'middle_5ss', 
smoothing) + downstream_3ss = get_coverage_plot( + xl_bed, downstream_3ss_bed, fai, window, exon_categories, + 'downstream_3ss', smoothing) + upstream_5ss = get_coverage_plot( + xl_bed, upstream_5ss_bed, fai, window, exon_categories, + 'upstream_5ss', smoothing) linegraph_middle_3ss = middle_3ss[0] linegraph_middle_5ss = middle_5ss[0] @@ -563,636 +1382,220 @@ def run_rna_map(de_file, xl_bed, genome_fasta, fai, window, smoothing, heatmap_downstream_3ss = downstream_3ss[1] heatmap_upstream_5ss = upstream_5ss[1] - if not all_sites: - plotting_df = pd.concat([linegraph_upstream_5ss, linegraph_middle_3ss, linegraph_middle_5ss, linegraph_downstream_3ss]) - heat_df = pd.concat([heatmap_upstream_5ss, heatmap_middle_3ss, heatmap_middle_5ss, heatmap_downstream_3ss]) + if not args.all_sites: + plotting_df = pd.concat([ + linegraph_upstream_5ss, linegraph_middle_3ss, + linegraph_middle_5ss, linegraph_downstream_3ss + ]) + heat_df = pd.concat([ + heatmap_upstream_5ss, heatmap_middle_3ss, + heatmap_middle_5ss, heatmap_downstream_3ss + ]) else: + downstream_5ss = get_coverage_plot( + xl_bed, downstream_5ss_bed, fai, window, + exon_categories, 'downstream_5ss', smoothing) + upstream_3ss = get_coverage_plot( + xl_bed, upstream_3ss_bed, fai, window, + exon_categories, 'upstream_3ss', smoothing) + linegraph_downstream_5ss = downstream_5ss[0] linegraph_upstream_3ss = upstream_3ss[0] heatmap_downstream_5ss = downstream_5ss[1] heatmap_upstream_3ss = upstream_3ss[1] - plotting_df = pd.concat([linegraph_middle_3ss, linegraph_middle_5ss, linegraph_downstream_3ss, linegraph_downstream_5ss, linegraph_upstream_3ss, linegraph_upstream_5ss]) - heat_df = pd.concat([heatmap_middle_3ss, heatmap_middle_5ss, heatmap_downstream_3ss, heatmap_downstream_5ss, heatmap_upstream_3ss, heatmap_upstream_5ss]) - - plotting_df.to_csv(f'{output_dir}/{FILEname}_RNAmap.tsv', sep="\t") - - # Making an output table with total exons covered in each region and category - # Step 1: Group by exon_id, label, and name, 
then sum the coverage - grouped_heat_df = heat_df.groupby(['exon_id', 'label', 'name'])['coverage'].sum().reset_index() - # Step 2: Filter for exons with >0 total coverage - filtered_heat_df = grouped_heat_df[grouped_heat_df['coverage'] > 0] - # Step 3: Count unique exon_ids for each label and name combination - count_heat_df = filtered_heat_df.groupby(['label', 'name'])['exon_id'].nunique().reset_index() - count_heat_df.rename(columns={'exon_id': 'exon_count'}, inplace=True) - # Step 4: Pivot to wide format with label as columns - final_heat_df = count_heat_df.pivot(index='name', columns='label', values='exon_count').fillna(0) - final_heat_df = final_heat_df.reset_index() - exon_categories_df = exon_categories.reset_index() - exon_categories_df.columns = ['name', 'total_exons_after_subsetting'] - final_heat_df = final_heat_df.merge(exon_categories_df, on='name', how='left') - final_heat_df.to_csv(f'{output_dir}/{FILEname}_totalExonsCovered.tsv', sep="\t", index=False) - - # Step 1: Ensure coverage is binary (0 or 1) - df = heat_df.copy() - df['coverage'] = (df['coverage'] > 0).astype(int) - df = smooth_coverage(df) - - if not all_sites: - labels = ['upstream_5ss', 'middle_3ss', 'middle_5ss', 'downstream_3ss'] - else: - labels = ['upstream_3ss', 'upstream_5ss', 'middle_3ss', 'middle_5ss', 'downstream_3ss', 'downstream_5ss'] - - # Step 2: Calculate total signal for each exon across ALL regions - exon_totals = df.groupby('exon_id')['coverage'].sum().reset_index() - exon_totals.columns = ['exon_id', 'total_signal'] - - # Step 3: Remove exons with no signal across all regions - exons_with_signal = exon_totals[exon_totals['total_signal'] > 0]['exon_id'] - df = df[df['exon_id'].isin(exons_with_signal)] - - # Step 4: Get name for each exon - keep as Series with exon_id as index - exon_names = df[['exon_id', 'name']].drop_duplicates(subset=['exon_id']).set_index('exon_id')['name'] - - # Step 5: Create dictionary to store data for each label - label_data = {} - - # Process 
each label - for label in labels: - label_df = df[df['label'] == label] - # Skip if no data for this label - if len(label_df) == 0: - print(f"No data for label: {label}") - continue - - # Create pivot table for this label - keep exon_id as index - pivot = label_df.pivot_table( - index='exon_id', - columns='position', - values='coverage', - fill_value=0 - ) - - # Add to dictionary - label_data[label] = pivot - - # Step 6: Get common exon_ids across all labels with data - common_exons = set() - first = True - - for label, pivot in label_data.items(): - if first: - common_exons = set(pivot.index) - first = False - else: - common_exons = common_exons.union(set(pivot.index)) - - # Step 7: Get names and total signal for common exons - # Create DataFrame with exon_id and total_signal - keep exon_id as index - exon_info = exon_totals.set_index('exon_id').loc[list(common_exons)] - - # Add name information - exon_info['name'] = exon_names.loc[exon_info.index] - - # Step 8: Sort exons - first by name, then by total signal (descending) - exon_info = exon_info.sort_values(['name', 'total_signal'], ascending=[True, False]) - - # Get the sorted exon IDs - sorted_exon_ids = exon_info.index.tolist() - - # Step 9: Set up the figure - width = max(15, len(labels) * 4) - height = max(3, len(sorted_exon_ids) * 0.002) - figsize = (width, height) - - fig = plt.figure(figsize=figsize) - fig.patch.set_alpha(0.0) # Make figure background fully transparent - - # Create grid for the subplots - gs = GridSpec(1, len(labels) + 1, width_ratios=[1] + [3] * len(labels), figure=fig) - - # Create the name labels column - ax_names = fig.add_subplot(gs[0, 0]) - ax_names.patch.set_alpha(0.0) # Make axis background transparent - - # Step 10: Plot name groups and labels - # Get names in the sorted order - names = exon_info['name'].values - - # Create a color-coded name column - name_colors = {} - unique_names = sorted(set(names)) - color_palette = plt.cm.tab10.colors[:len(unique_names)] - for i, name in 
enumerate(unique_names): - name_colors[name] = color_palette[i] - - # Create name matrix for display - name_matrix = np.zeros((len(sorted_exon_ids), 1)) - name_cmap = LinearSegmentedColormap.from_list('name_cmap', - [(1, 1, 1)] + list(color_palette), - N=len(unique_names) + 1) - - for i, name in enumerate(names): - name_matrix[i, 0] = unique_names.index(name) + 1 - - # Plot name matrix - sns.heatmap(name_matrix, ax=ax_names, cmap=name_cmap, cbar=False, linewidths=0, rasterized=True) - - # Add name labels - name_groups = {} - current_name = None - start_idx = 0 - - for i, name in enumerate(names): - if name != current_name: - if current_name is not None: - name_groups[current_name] = (start_idx, i - 1) - current_name = name - start_idx = i - - # Add the last group - if current_name is not None: - name_groups[current_name] = (start_idx, len(names) - 1) - - # Add text labels for each name group - for name, (start, end) in name_groups.items(): - middle = (start + end) / 2 - ax_names.text(0.5, middle, name, - fontsize=10, fontweight='bold', ha='center', va='center', - color='black') - - # Format the name axis - ax_names.set_title('Name') - ax_names.set_xticks([]) - ax_names.set_yticks([]) - ax.yaxis.set_visible(False) - - # Step 11: Plot each region in a separate subplot - for i, label in enumerate(labels): - if label not in label_data: - # Create empty subplot if no data - ax = fig.add_subplot(gs[0, i + 1]) - ax.set_facecolor('none') - ax.text(0.5, 0.5, f"No data for {label}", ha='center', va='center') - ax.set_xticks([]) - ax.set_yticks([]) - ax.set_title(label) - continue - - # Get data for this label - pivot = label_data[label] - - # Get position columns - position_cols = sorted(pivot.columns) - - # Set different position limits based on label - if '3ss' in label: - min_pos = 0 - max_pos = window+50 - else: - min_pos = window-50 - max_pos = window*2 - - # Filter position columns based on limits - position_cols = [pos for pos in position_cols if min_pos <= pos <= 
max_pos] - - # Create matrix for display (with rows in the correct order) - display_matrix = np.zeros((len(sorted_exon_ids), len(position_cols))) - - # Fill the matrix for exons that have data for this label - for j, exon_id in enumerate(sorted_exon_ids): - if exon_id in pivot.index: - for k, pos in enumerate(position_cols): - if pos in pivot.columns: - display_matrix[j, k] = pivot.loc[exon_id, pos] - - # Create the heatmap for this label - ax = fig.add_subplot(gs[0, i + 1]) - ax.set_facecolor('none') - sns.heatmap(display_matrix, ax=ax, cmap=colormaps['viridis'], cbar=False, linewidths=0, rasterized=True) - # Apply rasterization to the specific artist - # Find the right collection (typically the first one) - # if len(ax.collections) > 0: - # ax.collections[0].set_rasterized(True) - - # Format the axis - ax.set_title(label) - - # REMOVE X-AXIS LABELS COMPLETELY - ax.set_xticks([]) # Remove tick marks - ax.set_xticklabels([]) # Remove tick labels - ax.xaxis.set_visible(False) # Completely hide x-axis - - # Only show y-ticks for the first label to avoid redundancy - ax.set_yticks([]) - - # Add horizontal lines to separate different name groups - for name, (start, end) in name_groups.items(): - if end < len(sorted_exon_ids) - 1: # Don't add a line after the last group - ax.axhline(y=end + 1, color='white', linewidth=2, alpha=1) - - plt.tight_layout(rect=[0, 0, 0.95, 0.95]) - - # Save the figure if requested - plt.savefig(f'{output_dir}/{FILEname}_heatmap.pdf', dpi=300, bbox_inches='tight') - - - # LINE GRAPH - #sns.set(rc={'figure.figsize':(7, 5)}) - sns.set_style("whitegrid") - - if not all_sites: - col_order = ["upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss"] - titles = ["Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS"] - col_wrap = 4 - else: - col_order = ["upstream_3ss", "upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss", "downstream_5ss"] - titles = ["Upstream 3'SS", "Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 
3'SS", "Downstream 5'SS"] - col_wrap = 6 - g = sns.relplot(data=plotting_df, x='position', y='-log10pvalue_smoothed', hue='name', col='label', facet_kws={"sharex":False}, - kind='line', col_wrap=col_wrap, height=5, aspect=4/5, - col_order=col_order) - for ax, title in zip(g.axes.flat, titles): - ax.set_title(title) - ax.axhline(y=0, color='k', alpha=0.2, linewidth=0.5) - fig = plt.gcf() - marker_ax = add_enrichment_marker(fig, ax) - - g.set(xlabel='') - g.axes[0].set_ylabel('-log10(p value) enrichment / control') - - sns.move_legend( - g, "upper right", # Position: 'upper left' relative to bbox - bbox_to_anchor=(1, 2), # Outside the plot area to the right - ncol=1, # Number of columns in the legend - title=None, # Set title if needed - frameon=False # Remove frame around the legend - ) - leg = g._legend - set_legend_text(leg, exon_categories, original_counts) - - - ### Add exon-intron drawing below line plots ### - # for calculating rectangle size - rect_fraction = 1 / ((window + 50) / 50) - - # Configure each axis based on splice site type in col_order - for i, ss_type in enumerate(col_order): - ax = g.axes[i] - - # Determine if this is a middle exon (use different color) - is_middle = ss_type.startswith('middle_') - exon_color = "midnightblue" if is_middle else "slategrey" - - if ss_type.endswith('_3ss'): - # 3' splice site: exon on right side - ax.set_xlim([0, window + 50]) - ticks = np.arange(0, window + 51, 50) - labels = ["" if t in (ticks[0], ticks[-1]) else str(int(t - window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - - # Exon rectangle on right - rect = matplotlib.patches.Rectangle( - xy=(1 - rect_fraction, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - # Intron line on left - rect = matplotlib.patches.Rectangle( - xy=(0, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, - transform=ax.transAxes, clip_on=False) - 
ax.add_artist(rect) - else: - # 5' splice site: exon on left side - ax.set_xlim([window - 50, window * 2]) - ticks = np.arange(window - 50, window * 2 + 1, 50) - labels = ["" if t in (ticks[0], ticks[-1]) else str(int(t - window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - - # Exon rectangle on left - rect = matplotlib.patches.Rectangle( - xy=(0, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - # Intron line on right - rect = matplotlib.patches.Rectangle( - xy=(rect_fraction, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - - # Adjust subplot spacing and save with enough room for legend and arrows - plt.subplots_adjust(wspace=0.05) - plt.savefig(f'{output_dir}/{FILEname}_RNAmap_-log10pvalue.pdf', - bbox_extra_artists=([leg, rect, marker_ax]), - bbox_inches='tight', - pad_inches=0.8) - pbt.helpers.cleanup() - - if multivalency: - ### Get multivalency scores ### - rect_fraction = 1 / ((window + 50) / 50) - - middle_3ss_mdf = get_multivalency_scores(middle_3ss_bed, fai, window, genome_fasta, output_dir, FILEname, 'middle_3ss',germsdir) - middle_5ss_mdf = get_multivalency_scores(middle_5ss_bed, fai, window, genome_fasta, output_dir, FILEname, 'middle_5ss',germsdir) - downstream_3ss_mdf = get_multivalency_scores(downstream_3ss_bed, fai, window, genome_fasta, output_dir, FILEname, 'downstream_3ss',germsdir) - upstream_5ss_mdf = get_multivalency_scores(upstream_5ss_bed, fai, window, genome_fasta, output_dir, FILEname, 'upstream_5ss',germsdir) - if all_sites: - downstream_5ss_mdf = get_multivalency_scores(downstream_5ss_bed, fai, window, genome_fasta, output_dir, FILEname, 'downstream_5ss',germsdir) - upstream_3ss_mdf = get_multivalency_scores(upstream_3ss_bed, fai, window, genome_fasta, output_dir, FILEname, 'upstream_3ss',germsdir) - - a = middle_3ss_mdf[0] - b = 
middle_5ss_mdf[0] - c = downstream_3ss_mdf[0] - f = upstream_5ss_mdf[0] - - if not all_sites: - plotting_df = pd.concat([f, a, b, c]) - else: - d = downstream_5ss_mdf[0] - e = upstream_3ss_mdf[0] - plotting_df = pd.concat([a, b, c, d, e, f]) - plotting_df.to_csv(f'{output_dir}/{FILEname}_RNAmap_multivalency.tsv', sep="\t") - logging.info(f'{output_dir}/{FILEname}_RNAmap_multivalency.tsv written to file successfully') - - plt.figure() - sns.set_style("whitegrid") + plotting_df = pd.concat([ + linegraph_middle_3ss, linegraph_middle_5ss, + linegraph_downstream_3ss, linegraph_downstream_5ss, + linegraph_upstream_3ss, linegraph_upstream_5ss + ]) + heat_df = pd.concat([ + heatmap_middle_3ss, heatmap_middle_5ss, + heatmap_downstream_3ss, heatmap_downstream_5ss, + heatmap_upstream_3ss, heatmap_upstream_5ss + ]) + + # Save coverage data + plotting_df.to_csv( + f'{output_dir}/{FILEname}_RNAmap.tsv', sep="\t", index=False + ) - if not all_sites: - mv_col_order = ["upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss"] - mv_titles = ["Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS"] - mv_col_wrap = 4 - else: - mv_col_order = ["upstream_3ss", "upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss", "downstream_5ss"] - mv_titles = ["Upstream 3'SS", "Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS", "Downstream 5'SS"] - mv_col_wrap = 6 - - g = sns.relplot(data=plotting_df, x='position', y='smoothed_kmer_multivalency', hue='exon_type', - col='type', facet_kws={"sharex": False}, - kind='line', col_wrap=mv_col_wrap, height=5, aspect=3.5/5, errorbar=None, - col_order=mv_col_order) - - logging.info("sns.relplot returned") - - titles = mv_titles - - for ax, title in zip(g.axes.flat, titles): - ax.set_title(title) - g.set(xlabel='') - g.axes[0].set_ylabel('mean smoothed kmer multivalency') - leg = g._legend - set_legend_text(leg, exon_categories, original_counts) - - # Configure each axis based on splice site type in mv_col_order - for i, 
ss_type in enumerate(mv_col_order): - ax = g.axes[i] - if i == 0: - ax.set_ylim(ymin=1) - - # Determine if this is a middle exon (use different color) - is_middle = ss_type.startswith('middle_') - exon_color = "midnightblue" if is_middle else "slategrey" - - if ss_type.endswith('_3ss'): - # 3' splice site: exon on right side - ax.set_xlim([window, (2 * window) + 50]) - ticks = np.arange(window, 2 * window + 50, 50) - labels = ["" if t == ticks[0] else str(int(t - window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - - # Exon rectangle on right - rect = matplotlib.patches.Rectangle( - xy=(1 - rect_fraction, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - # Intron line on left - rect = matplotlib.patches.Rectangle( - xy=(0, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - else: - # 5' splice site: exon on left side - ax.set_xlim([2*window - 50, 3*window]) - ticks = np.arange(2*window - 50, 3*window, 50) - labels = ["" if t == ticks[0] else str(int(t - 2*window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - - # Exon rectangle on left - rect = matplotlib.patches.Rectangle( - xy=(0, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - # Intron line on right - rect = matplotlib.patches.Rectangle( - xy=(rect_fraction, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - - plt.subplots_adjust(wspace=0.01) - plt.savefig(f'{output_dir}/{FILEname}_RNAmap_multivalency.pdf', - bbox_extra_artists=([leg,rect]), - bbox_inches='tight', - pad_inches=0.5) - pbt.helpers.cleanup() - - logging.info(f'RNAmap multivalency plot saved successfully as {output_dir}/{FILEname}_RNAmap_multivalency.pdf') - - 
# DO THE SAME NOW FOR THE KMERS - a = middle_3ss_mdf[1] - b = middle_5ss_mdf[1] - c = downstream_3ss_mdf[1] - f = upstream_5ss_mdf[1] - - if not all_sites: - plotting_df = pd.concat([f, a, b, c]) - else: - d = downstream_5ss_mdf[1] - e = upstream_3ss_mdf[1] - plotting_df = pd.concat([a, b, c, d, e, f]) - plotting_df.to_csv(f'{output_dir}/{FILEname}_RNAmap_TOP10KMER_multivalency.tsv', sep="\t") - - logging.info(f'{output_dir}/{FILEname}_RNAmap_TOP10KMER_multivalency.tsv written to file successfully') - - # Filter the data for exon_type 'silenced' - filtered_df = plotting_df[plotting_df['exon_type'] == 'silenced'].copy() - - - # Now plot using the summarized DataFrame - plt.figure() - sns.set_style("whitegrid") - g = sns.relplot( - data=filtered_df, - x='position', - y='smoothed_kmer_multivalency', - hue='kmer', - col='type', - facet_kws={"sharex": False}, - kind='line', - col_wrap=mv_col_wrap, - height=5, errorbar=None, - aspect=3.5/5, - col_order=mv_col_order + # Heatmap + plot_heatmap(heat_df, exon_categories, window, args.all_sites, + output_dir, FILEname) + + # Main RNA map plot + logging.info("\n" + "=" * 60) + logging.info("PLOTTING RNA MAPS") + logging.info("=" * 60) + + plot_rna_map(plotting_df, exon_categories, original_counts, + window, args.all_sites, output_dir, FILEname) + + # ============================================================== + # MULTIVALENCY (optional, requires germs.R) + # ============================================================== + if hasattr(args, 'multivalency') and args.multivalency: + logging.info("\n" + "=" * 60) + logging.info("MULTIVALENCY ANALYSIS") + logging.info("=" * 60) + + plot_multivalency( + middle_3ss_bed, middle_5ss_bed, + downstream_3ss_bed, upstream_5ss_bed, + downstream_5ss_bed, upstream_3ss_bed, + args.fastaindex, args.window, args.genomefasta, + output_dir, FILEname, args.germsdir, + args.all_sites, exon_categories, original_counts ) - titles = mv_titles - for ax, title in zip(g.axes.flat, titles): - 
ax.set_title(title) - g.set(xlabel='') - g.axes[0].set_ylabel('mean smoothed kmer multivalency') - leg = g._legend - - # Configure each axis based on splice site type in mv_col_order - for i, ss_type in enumerate(mv_col_order): - ax = g.axes[i] - if i == 0: - ax.set_ylim(ymin=1) - - is_middle = ss_type.startswith('middle_') - exon_color = "midnightblue" if is_middle else "slategrey" - - if ss_type.endswith('_3ss'): - ax.set_xlim([window, (2 * window) + 50]) - ticks = np.arange(window, 2 * window + 50, 50) - labels = ["" if t == ticks[0] else str(int(t - window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - rect = matplotlib.patches.Rectangle( - xy=(1 - rect_fraction, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - rect = matplotlib.patches.Rectangle( - xy=(0, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - else: - ax.set_xlim([2*window - 50, 3*window]) - ticks = np.arange(2*window - 50, 3*window, 50) - labels = ["" if t == ticks[0] else str(int(t - 2*window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - rect = matplotlib.patches.Rectangle( - xy=(0, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - rect = matplotlib.patches.Rectangle( - xy=(rect_fraction, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - - plt.subplots_adjust(wspace=0.01) - plt.savefig(f'{output_dir}/{FILEname}_RNAmap_silencedKMER_multivalency.pdf', - bbox_extra_artists=([leg,rect]), - bbox_inches='tight', - pad_inches=0.5) - pbt.helpers.cleanup() - - logging.info(f'{output_dir}/{FILEname}_RNAmap_silencedKMER_multivalency.pdf written to file successfully') - - plt.figure() - sns.set_style("whitegrid") - g = 
sns.relplot(data=plotting_df[plotting_df['exon_type'] == 'enhanced'], x='position', y='smoothed_kmer_multivalency', hue='kmer', col='type', facet_kws={"sharex":False}, - kind='line', col_wrap=mv_col_wrap, height=5, aspect=3.5/5, errorbar=None, - col_order=mv_col_order) - titles = mv_titles - for ax, title in zip(g.axes.flat, titles): - ax.set_title(title) - g.set(xlabel='') - g.axes[0].set_ylabel('mean smoothed kmer multivalency') - leg = g._legend - - # Configure each axis based on splice site type in mv_col_order - for i, ss_type in enumerate(mv_col_order): - ax = g.axes[i] - if i == 0: - ax.set_ylim(ymin=1) - - is_middle = ss_type.startswith('middle_') - exon_color = "midnightblue" if is_middle else "slategrey" - - if ss_type.endswith('_3ss'): - ax.set_xlim([window, (2 * window) + 50]) - ticks = np.arange(window, 2 * window + 50, 50) - labels = ["" if t == ticks[0] else str(int(t - window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - rect = matplotlib.patches.Rectangle( - xy=(1 - rect_fraction, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - rect = matplotlib.patches.Rectangle( - xy=(0, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - else: - ax.set_xlim([2*window - 50, 3*window]) - ticks = np.arange(2*window - 50, 3*window, 50) - labels = ["" if t == ticks[0] else str(int(t - 2*window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - rect = matplotlib.patches.Rectangle( - xy=(0, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - rect = matplotlib.patches.Rectangle( - xy=(rect_fraction, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - - plt.subplots_adjust(wspace=0.01) - 
plt.savefig(f'{output_dir}/{FILEname}_RNAmap_enhancedKMER_multivalency.pdf', - bbox_extra_artists=([leg,rect]), - bbox_inches='tight', - pad_inches=0.5) - pbt.helpers.cleanup() - sys.exit() - - - -if __name__=='__main__': - ( - de_file, - xl_bed, - genome_fasta, - fai, - output_folder, - window, - smoothing, - min_ctrl, - max_ctrl, - max_inclusion, - max_fdr, - max_enh, - min_sil, - multivalency, - germsdir, - no_constitutive, - no_subset, - all_sites, - prefix - ) = cli() - - log_filename, start_time, logger = setup_logging(output_folder) - logging.info(f"Log file created: {log_filename}") - try: - run_rna_map(de_file, xl_bed, genome_fasta, fai, window, smoothing, - min_ctrl, max_ctrl, max_inclusion, max_fdr, max_enh, min_sil, output_folder, multivalency, germsdir, no_constitutive, no_subset, all_sites, prefix) + logging.info("\n" + "=" * 60) + logging.info("SCRIPT COMPLETED SUCCESSFULLY") + logging.info("=" * 60) + finally: - # Log runtime at the end log_runtime(start_time, logger) - - # Explicitly flush logs before shutting down for handler in logger.handlers: handler.flush() + logging.shutdown() + + +# =============================================================================== +# COMMAND LINE INTERFACE +# =============================================================================== + +def cli(): + parser = argparse.ArgumentParser( + description='Plot CLIP crosslinks around regulated exons to study ' + 'position-dependent impact on pre-mRNA splicing.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Input modes: + + rMATS mode (original): + python rna_maps.py -i rMATS.SE.MATS.JC.txt -x CLIP.bed -f hg38.fa -fi hg38.fa.fai -o output -p PTBP1 + + VastDB mode (ID lists): + python rna_maps.py --vastdb_mode \\ + --vastdb_enhanced enhanced.txt --vastdb_silenced silenced.txt \\ + --vastdb_control control.txt --vastdb_constitutive constitutive.txt \\ + --vastdb_annotation EVENT_INFO-hg38.tab \\ + -x CLIP.bed -f hg38.fa -fi hg38.fa.fai -o 
output -p AQR_K562 + """ + ) + + # INPUT MODE - mutually exclusive + input_group = parser.add_mutually_exclusive_group(required=True) + input_group.add_argument( + '-i', '--inputsplice', type=str, + help='rMATS differential splicing file (rMATS mode)' + ) + input_group.add_argument( + '--vastdb_mode', action='store_true', + help='Use VastDB ID lists mode (requires --vastdb_* arguments)' + ) + + # VASTDB-SPECIFIC ARGUMENTS + vastdb_group = parser.add_argument_group('VastDB mode options') + vastdb_group.add_argument( + '--vastdb_enhanced', help='Enhanced exon IDs (one per line)') + vastdb_group.add_argument( + '--vastdb_silenced', help='Silenced exon IDs (one per line)') + vastdb_group.add_argument( + '--vastdb_control', help='Control exon IDs (one per line)') + vastdb_group.add_argument( + '--vastdb_constitutive', help='Constitutive exon IDs (one per line)') + vastdb_group.add_argument( + '--vastdb_annotation', help='VastDB EVENT_INFO file (e.g. EVENT_INFO-hg38.tab)') + + # SHARED REQUIRED ARGUMENTS + required = parser.add_argument_group('Required arguments (both modes)') + required.add_argument( + '-x', '--inputxlsites', type=str, nargs='?', + help='CLIP crosslinks in BED file format') + required.add_argument( + '-f', '--genomefasta', type=str, required=True, + help='Genome FASTA file (.fa)') + required.add_argument( + '-fi', '--fastaindex', type=str, required=True, + help='Genome FASTA index (.fai)') + + # SHARED OPTIONAL ARGUMENTS + optional = parser.add_argument_group('Optional arguments') + optional.add_argument( + '-o', '--outputpath', type=str, default=os.getcwd(), nargs='?', + help='Output folder [DEFAULT: current directory]') + optional.add_argument( + '-w', '--window', type=int, default=300, nargs='?', + help='Window around splice sites [DEFAULT: 300]') + optional.add_argument( + '-s', '--smoothing', type=int, default=15, nargs='?', + help='Smoothing window [DEFAULT: 15]') + optional.add_argument( + '--seed', type=int, default=42, + help='Random seed for 
reproducible subsetting [DEFAULT: 42]') + optional.add_argument( + '-nc', '--no_constitutive', action='store_true', + help='Exclude constitutive category') + optional.add_argument( + '-ns', '--no_subset', action='store_true', + help='Disable subsetting of control/constitutive exons') + optional.add_argument( + '-ao', '--all_sites', action='store_true', + help='Include all 6 splice sites (default: 4 core sites)') + optional.add_argument( + '-p', '--prefix', type=str, + help='Prefix for output files') + + # rMATS-SPECIFIC THRESHOLDS + rmats_group = parser.add_argument_group('rMATS mode thresholds') + rmats_group.add_argument( + '-mc', '--minctrl', type=float, default=-0.05, nargs='?', + help='Minimum dPSI for control events [DEFAULT: -0.05]') + rmats_group.add_argument( + '-xc', '--maxctrl', type=float, default=0.05, nargs='?', + help='Maximum dPSI for control events [DEFAULT: 0.05]') + rmats_group.add_argument( + '-xi', '--maxincl', type=float, default=0.9, nargs='?', + help='Maximum PSI for control (above = constitutive) [DEFAULT: 0.9]') + rmats_group.add_argument( + '-xf', '--maxfdr', type=float, default=0.1, nargs='?', + help='Maximum FDR for regulated events [DEFAULT: 0.1]') + rmats_group.add_argument( + '-xe', '--maxenh', type=float, default=-0.05, nargs='?', + help='Maximum dPSI for enhanced exons [DEFAULT: -0.05]') + rmats_group.add_argument( + '-ms', '--minsil', type=float, default=0.05, nargs='?', + help='Minimum dPSI for silenced exons [DEFAULT: 0.05]') + + # MULTIVALENCY (rMATS mode) + mv_group = parser.add_argument_group('Multivalency analysis') + mv_group.add_argument( + '-v', '--multivalency', action='store_true', + help='Run multivalency analysis (requires germs.R)') + mv_group.add_argument( + '-g', '--germsdir', type=str, default=os.getcwd(), nargs='?', + help='Directory containing germs.R [DEFAULT: current directory]') + + args = parser.parse_args() + + # Validate VastDB mode requirements + if args.vastdb_mode: + if not args.vastdb_annotation: + 
parser.error("--vastdb_mode requires --vastdb_annotation " + "(EVENT_INFO file)") + if not any([args.vastdb_enhanced, args.vastdb_silenced, + args.vastdb_control, args.vastdb_constitutive]): + parser.error("--vastdb_mode requires at least one " + "--vastdb_* ID list file") + + return args + + +# =============================================================================== +# ENTRY POINT +# =============================================================================== - # Now safe to shut down logging - logging.shutdown() \ No newline at end of file +if __name__ == '__main__': + args = cli() + run_rna_map(args) \ No newline at end of file From 1221fd6ba53fcce690822ba172eb12252bdfbba6 Mon Sep 17 00:00:00 2001 From: Leo Wilkinson Date: Wed, 18 Mar 2026 19:35:01 +0000 Subject: [PATCH 4/5] Updated readme for merged rna_maps and VastDB script --- README.md | 325 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 179 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index a5568a2..62e359a 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,20 @@ ## RNA maps -Authors: charlotte.capitanchik@crick.ac.uk; aram.amalietti@gmail.com +Authors: charlotte.capitanchik@crick.ac.uk; aram.amalietti@gmail.com; leomwilkinson@gmail.com --- -> **Branch: VastDB_input** -> This branch introduces `rna_maps_vastdb_only.py`, a standalone script for generating RNA maps directly from VastDB EVENT IDs — no rMATS required. See [VastDB RNA Maps](#vastdb-rna-maps-rna_maps_vastdb_onlypy) below. +## Overview ---- +`rna_maps.py` generates RNA maps showing positional enrichment of RBP binding (from CLIP data) around regulated exons. It supports two input modes: + +1. **rMATS mode** — takes rMATS differential splicing output and auto-categorises exons from dPSI/FDR thresholds +2. 
**VastDB mode** — takes pre-curated VastDB EVENT ID lists with categories already assigned -## VastDB RNA Maps (`rna_maps_vastdb_only.py`) +Both modes feed into the same analysis pipeline: splice site BED creation, CLIP coverage calculation, Fisher's exact test enrichment, RNA map plotting, per-exon heatmaps, and exon length distributions. -This script generates RNA maps from pre-categorised VastDB cassette exon EVENT IDs. Instead of running differential splicing analysis with rMATS, you supply lists of EVENT IDs already assigned to categories (enhanced, silenced, control, constitutive), along with a VastDB annotation file to look up genomic coordinates. +--- -**Quick Start** +## Quick Start Create a conda environment and activate it: @@ -21,10 +23,23 @@ conda env create -f environment.yml conda activate rnamaps ``` -Run with VastDB inputs: +### rMATS mode ``` -python rna_maps_vastdb_only.py \ +python rna_maps.py \ + -i SE.MATS.JCEC.txt \ + -x CLIP_crosslinks.bed \ + -f genome.fa \ + -fi genome.fa.fai \ + -o output/ \ + -p PTBP1 +``` + +### VastDB mode + +``` +python rna_maps.py \ + --vastdb_mode \ --vastdb_enhanced enhanced_ids.txt \ --vastdb_silenced silenced_ids.txt \ --vastdb_control control_ids.txt \ @@ -37,175 +52,160 @@ python rna_maps_vastdb_only.py \ -p PTBP1 ``` -**Preparing inputs** +--- + +## Input modes + +### rMATS mode (`-i`) + +Accepts rMATS quantified files for cassette exons (e.g. `SE.MATS.JCEC.txt`). Categories are assigned automatically based on dPSI and FDR thresholds (see [Definitions](#definitions) below). The script deduplicates exons involved in multiple events by keeping the most extreme dPSI value per exon. + +If your condition is RBP knockdown, run your comparison as condition − control, such that definitions of enhanced and silenced are correct. If your condition is RBP overexpression, run as control − condition. In the generic comparison group1 − group2, "enhanced" and "silenced" are defined in reference to group2 relative to group1. 
+ +**Minus-strand correction:** rMATS labels flanking exons as "upstream" and "downstream" by genomic coordinate (lower = upstream), which is inverted for minus-strand genes. The script automatically swaps these at load time so that upstream/downstream always refer to transcript order throughout the pipeline. + +### VastDB mode (`--vastdb_mode`) -- **EVENT ID lists**: Plain text files with one VastDB EVENT ID per line (e.g. `HsaEX0012345`). Lines beginning with `#` are ignored. -- **VastDB annotation file**: The `EVENT_INFO-*.tab` file from VastDB, used to look up genomic coordinates (chromosome, exon start/end, flanking exon coordinates, strand) for each EVENT ID. -- **CLIP crosslinks**: iCLIP or eCLIP crosslinks in BED format. -- **Genome FASTA + index**: Reference genome `.fa` and `.fa.fai` files. +Accepts four plain text files of VastDB EVENT IDs (one per line, `#` comments allowed), each representing a pre-assigned category. Genomic coordinates are looked up from the VastDB `EVENT_INFO` annotation file using the `COORD_o`, `CO_C1`, and `CO_C2` columns. -**Key differences from `rna_maps.py`** +This mode is useful when categories come from VAST-TOOLS `vast diff` output (or any other source of splicing quantification), where you have already applied your own thresholds to define enhanced, silenced, control, and constitutive exons. No rMATS dependency is required. 
-| Feature | `rna_maps.py` | `rna_maps_vastdb_only.py` | +| Feature | rMATS mode | VastDB mode | |---|---|---| | Splicing input | rMATS `SE.MATS.JCEC.txt` | VastDB EVENT ID lists | | Category assignment | Computed from dPSI / FDR thresholds | Pre-assigned by user | -| Annotation source | rMATS coordinates | VastDB `EVENT_INFO` file | +| Coordinate source | rMATS columns | VastDB `EVENT_INFO` file | +| Coordinate system | 0-based (BED) | 1-based (converted automatically) | | rMATS dependency | Required | Not required | +| Expression filtering | From junction counts | Not applicable | -**Usage** +--- + +## Preparing inputs + +### CLIP crosslinks + +iCLIP or eCLIP crosslink sites in BED format. Single-nucleotide resolution crosslink files (`.genome.xl.bed`) are preferred over peak files for RNA maps. + +### Genome reference + +Genome FASTA (`.fa`) and FASTA index (`.fa.fai`) files. The genome build must match both the splicing data and the CLIP data — e.g. hg19 for older rMATS datasets, hg38 for VastDB. + +### rMATS input + +Standard rMATS output for skipped exons, e.g. `SE.MATS.JCEC.txt`. Must contain columns: `chr`, `exonStart_0base`, `exonEnd`, `strand`, `FDR`, `IncLevelDifference`, `IncLevel1`, `IncLevel2`, `upstreamES`, `upstreamEE`, `downstreamES`, `downstreamEE`. + +### VastDB inputs + +- **EVENT ID lists**: Plain text files with one VastDB EVENT ID per line (e.g. `HsaEX0012345`). Lines beginning with `#` are ignored. At least one list file must be provided; typically all four categories are supplied. +- **VastDB annotation file**: The `EVENT_INFO-*.tab` file from VastDB. Must contain `EVENT`, `GENE`, `COORD_o`, `REF_CO`, `CO_C1`, and `CO_C2` columns. The version in your `event_lists/` directory (with the full `CO_C1`/`CO_C2` columns) is required — not the minimal version. 
+ +--- + +## Usage ``` -python rna_maps_vastdb_only.py -h +python rna_maps.py -h -required arguments: - --vastdb_annotation VastDB EVENT_INFO file - -x, --clip CLIP crosslinks BED file - -f, --fasta Genome FASTA file - -fi, --fasta_index Genome FASTA index (.fai) - -o, --output_dir Output directory - -p, --prefix Output file prefix +Input modes (mutually exclusive, one required): + -i, --inputsplice rMATS differential splicing file (rMATS mode) + --vastdb_mode Use VastDB ID lists mode -VastDB ID lists (at least one required): +VastDB mode options: --vastdb_enhanced Enhanced exon IDs (one per line) --vastdb_silenced Silenced exon IDs (one per line) --vastdb_control Control exon IDs (one per line) --vastdb_constitutive Constitutive exon IDs (one per line) + --vastdb_annotation VastDB EVENT_INFO file + +Required arguments (both modes): + -x, --inputxlsites CLIP crosslinks in BED file format + -f, --genomefasta Genome FASTA file (.fa) + -fi, --fastaindex Genome FASTA index (.fai) -optional arguments: - -w, --window Window around splice sites [DEFAULT 300] - -s, --smoothing Smoothing window [DEFAULT 15] +Optional arguments: + -o, --outputpath Output folder [DEFAULT: current directory] + -w, --window Window around splice sites [DEFAULT: 300] + -s, --smoothing Smoothing window [DEFAULT: 15] + --seed Random seed for reproducible subsetting [DEFAULT: 42] -nc, --no_constitutive Exclude constitutive category - -ns, --no_subset Disable subsetting of control/constitutive to match enhanced/silenced counts + -ns, --no_subset Disable subsetting of control/constitutive exons -ao, --all_sites Include all 6 splice sites (default: 4 core sites) + -p, --prefix Prefix for output files + +rMATS mode thresholds: + -mc, --minctrl Minimum dPSI for control events [DEFAULT: -0.05] + -xc, --maxctrl Maximum dPSI for control events [DEFAULT: 0.05] + -xi, --maxincl Maximum PSI for control (above = constitutive) [DEFAULT: 0.9] + -xf, --maxfdr Maximum FDR for regulated events [DEFAULT: 0.1] + -xe, 
--maxenh Maximum dPSI for enhanced exons [DEFAULT: -0.05] + -ms, --minsil Minimum dPSI for silenced exons [DEFAULT: 0.05] + +Multivalency analysis: + -v, --multivalency Run multivalency analysis (requires germs.R) + -g, --germsdir Directory containing germs.R [DEFAULT: current directory] ``` -**Outputs** - -- `{prefix}_VastDB_with_categories.tsv` — categorised exons with coordinates -- `{prefix}_RNAmap.tsv` — per-position coverage and enrichment data -- `{prefix}_RNAmap_-log10pvalue.pdf` — RNA map plot -- `execution_*.log` — run log - --- -## Original RNA Maps (`rna_maps.py`) +## Outputs -The original script accepts rMATS quantified splicing files and assigns categories based on dPSI and FDR thresholds. +Both modes produce the same set of output files: -**Quick Start** +| File | Description | +|---|---| +| `{prefix}_RMATS_with_categories.tsv` or `{prefix}_VastDB_with_categories.tsv` | Categorised exons with coordinates | +| `{prefix}_RNAmap.tsv` | Per-position coverage and enrichment data | +| `{prefix}_RNAmap_-log10pvalue.pdf` | RNA map line plot | +| `{prefix}_heatmap.pdf` | Per-exon binary coverage heatmap | +| `{prefix}_totalExonsCovered.tsv` | Count of exons with CLIP signal per region and category | +| `{prefix}_exon_length.pdf` | Exon length distributions by category | +| `execution_*.log` | Run log with timing and category counts | -``` -python rna_maps.py \ --i test/chr21_PTBP1_2_Gueroussov2015_SE.MATS.JCEC.txt \ --x test/chr21_hela_ptbp1_iclip_sorted_merged.bed \ --f test/homosapien-hg37-chr21.fa \ --fi test/homosapien-hg37-chr21.fa.fai -``` +With `--multivalency` (rMATS mode): -**Preparing RNA-Seq data**: +| File | Description | +|---|---| +| `{prefix}_RNAmap_multivalency.tsv` | Multivalency scores per position | +| `{prefix}_RNAmap_multivalency.pdf` | Multivalency plot | +| `{prefix}_RNAmap_TOP10KMER_multivalency.tsv` | Top kmer multivalency scores | +| `{prefix}_RNAmap_silencedKMER_multivalency.pdf` | Silenced exon kmer plot | +| 
`{prefix}_RNAmap_enhancedKMER_multivalency.pdf` | Enhanced exon kmer plot | -This code accepts rMATs quantified files for cassette exons (e.g. SE.MATS.JCEC.txt). +--- -If your condition is RBP knockdown be sure to run your comparison as condition - control, such that definitions of enhanced and repressed are correct. If your condition is RBP overexpression you will need to run the comparison as control - condition. In the generic example group1 - group2 consider that the definition of "enhanced" or "repressed" are in reference to group2. ie. an exon is enhanced in group2 vs. group1. +## Multivalency analysis -**Multivalency analysis**: +Multivalency analysis adds run time and requires the Ule lab's GeRMs package. It is optional and enabled with the `-v` flag. -Multivalency analysis adds on run time & involves installing the Ule lab's GeRMs package which is still in development, so it is optional and enabled with the flag `-v`. -Currently to run the analysis you will need to install the GeRMs package. To do this clone the repository to your computer somewhere and run the following command from within the repository (you will need to have R devtools installed): -`R -e 'devtools::install()'` +To install GeRMs, clone the repository and run from within it (requires R devtools): -You will need to ensure you have the GeRMs requirements installed too, which are: biostrings, parallel, logger and optparse. 
-Finally, when you run RNA maps you will need to provide the location of your "germs" repo, so that the script can find germs.R to run the multivalency calculations using the flag `-g`, so our test command for running multivalency will look like: ``` -python rna_maps.py \ --i test/chr21_PTBP1_2_Gueroussov2015_SE.MATS.JCEC.txt \ --x test/chr21_hela_ptbp1_iclip_sorted_merged.bed \ --f test/homosapien-hg37-chr21.fa \ --fi test/homosapien-hg37-chr21.fa.fai \ --v -g ../germs +R -e 'devtools::install()' ``` -If you want to create a multivalency map alone (no CLIP data) simply run the above command with the `-x crosslinks.bed` excluded. -**Usage**: -``` -python rna_maps.py -h -usage: rna_maps.py [-h] -i INPUTSPLICE [-x [INPUTXLSITES]] -f GENOMEFASTA -fi - FASTAINDEX [-o [OUTPUTPATH]] [-w [WINDOW]] [-s [SMOOTHING]] - [-mc [MINCTRL]] [-xc [MAXCTRL]] [-xi [MAXINCL]] [-xf [MAXFDR]] - [-xe [MAXENH]] [-ms [MINSIL]] [-v] [-nc] [-ns] [-ao] [-g [GERMSDIR]] - [-p PREFIX] - -Plot CLIP crosslinks around regulated exons to study position-dependent impact on pre- -mRNA splicing. 
- -required arguments: - -i INPUTSPLICE, --inputsplice INPUTSPLICE - quantification of differential splicing produced by rMATS - -f GENOMEFASTA, --genomefasta GENOMEFASTA - genome fasta file (.fa) - -fi FASTAINDEX, --fastaindex FASTAINDEX - genome fasta index file (.fai) - -options: - -h, --help show this help message and exit - -x [INPUTXLSITES], --inputxlsites [INPUTXLSITES] - CLIP crosslinks in BED file format - -o [OUTPUTPATH], --outputpath [OUTPUTPATH] - output folder [DEFAULT current directory] - -w [WINDOW], --window [WINDOW] - window around regulated splicing events to plot crosslinks - [DEFAULT 300] - -s [SMOOTHING], --smoothing [SMOOTHING] - smoothing window for plotting crosslink signal [DEFAULT 15] - -mc [MINCTRL], --minctrl [MINCTRL] - minimum dPSI for control events [DEFAULT -0.05] - -xc [MAXCTRL], --maxctrl [MAXCTRL] - maximum dPSI for control events [DEFAULT 0.05] - -xi [MAXINCL], --maxincl [MAXINCL] - maximum PSI for control exons, above this limit exons are - considered constitutive [DEFAULT 0.9] - -xf [MAXFDR], --maxfdr [MAXFDR] - maximum FDR for regulated events, above this events fall in - "rest" class, is used for rMATS [DEFAULT 0.1] - -xe [MAXENH], --maxenh [MAXENH] - maximum inclusion for exons to be considered enhanced [DEFAULT - -0.05] - -ms [MINSIL], --minsil [MINSIL] - minimum inclusion for exons to be considered silenced [DEFAULT - 0.05] - -v, --multivalency - -nc, --no_constitutive - Exclude constitutive category from the output - -ns, --no_subset Disable subsetting of control/constitutive exons to match - enhanced/silenced counts - -ao, --all_sites Include all splice sites (upstream_3ss and downstream_5ss), - default is core sites only - -g [GERMSDIR], --germsdir [GERMSDIR] - directory for where to find germs.R for multivalency analysis - eg. 
/Users/Bellinda/repos/germs [DEFAULT current directory] - -p PREFIX, --prefix PREFIX - prefix for output files [DEFAULT inputsplice file name] -``` - -**Dependencies** (these are the versions the script was developped with, pandas >= 1 introduced breaking changes, please use these versions): +GeRMs requires: Biostrings, parallel, logger, and optparse. + +When running RNA maps with multivalency, provide the location of the germs repo with `-g`: + ``` -python=3.7.7 -pandas=0.24.2 -numpy=1.19.2 -pybedtools=0.8.1 -matplotlib=3.3.2 -seaborn=0.11.0 -scipy=1.3.1 +python rna_maps.py \ + -i SE.MATS.JCEC.txt \ + -x CLIP_crosslinks.bed \ + -f genome.fa \ + -fi genome.fa.fai \ + -v -g ../germs ``` +To create a multivalency map without CLIP data, run the above command without the `-x` flag. + --- -### Definitions +## Definitions -**Event types** +### Event types -Control: An event that doesn't change in inclusion (PSI) in this RBP knockdown, but might in another circumstance. Typical definition: +**Control**: An event that doesn't change in inclusion (PSI) in this RBP knockdown, but might in another circumstance. ``` dPSI ( -1 <---------- - 0.05xxxxx0xxxxx0.05----------> 1 ) @@ -213,7 +213,7 @@ maxPSI ( 0 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx0.9------> 1 ) FDR ( 0 xxxxx0.1xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx> 1 ) ``` -Constituitive: An event that doesn't change in inclusion (PSI) in this RBP knockdown, but is unlikely to change in another circumstance. Typically defined as a control event *plus* have a maximum inclusion (PSI) of > 0.9-0.99. +**Constitutive**: An event that doesn't change in inclusion in this knockdown and is unlikely to change in another circumstance. Defined as a control event with maximum inclusion (PSI) > 0.9. 
``` dPSI ( -1 <---------- - 0.05xxxxx0xxxxx0.05----------> 1 ) @@ -221,7 +221,7 @@ maxPSI ( 0 ----------------------------------0.9xxxxxx> 1 ) FDR ( 0 xxxxx0.1xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx> 1 ) ``` -Enhanced: An event that is *less* included in RBP knockdown, suggesting the RBP *promotes/enhances* inclusion of the event. +**Enhanced**: An event that is *less* included in RBP knockdown, suggesting the RBP *promotes/enhances* inclusion of the event. ``` dPSI ( -1 1 ) @@ -229,7 +229,7 @@ maxPSI ( 0 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx0.9xxxxxx> 1 ) FDR ( 0 xxxxx0.1-----------------------------------> 1 ) ``` -Silenced: An event that is *more* included in RBP knockdown, suggesting the RBP *represses/silences* inclusion of the event. +**Silenced**: An event that is *more* included in RBP knockdown, suggesting the RBP *represses/silences* inclusion of the event. ``` dPSI ( -1 <---------- - 0.05-----0-----0.05xxxxxxxxxx> 1 ) @@ -237,7 +237,7 @@ maxPSI ( 0 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx0.9xxxxxx> 1 ) FDR ( 0 xxxxx0.1-----------------------------------> 1 ) ``` -Enhanced/Silenced rest: A silenced or enhanced event where the FDR does not fall below the threshold. +**Enhanced/Silenced rest** (rMATS mode only): A silenced or enhanced event where the FDR does not fall below the threshold. ``` dPSI ( As in silenced or enhanced ) @@ -245,11 +245,44 @@ maxPSI ( 0 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx0.9xxxxxx> 1 ) FDR ( 0 -----0.1xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx> 1 ) ``` -**Hierarchy** +### Hierarchy (rMATS mode) + +An exon may be involved in multiple events. To avoid plotting it multiple times, a hierarchy is applied: + +1. If an exon meets criteria for silenced or enhanced, this is designated. If criteria for both are met, the most extreme dPSI value is preferred. +2. Of remaining exons, if they meet criteria for enhanced/silenced rest, this is designated. +3. Of remaining exons, if they meet criteria for constitutive, this is designated. +4. 
Of remaining exons, if they meet criteria for control, this is designated.
+
+### VastDB mode categories
+
+In VastDB mode, categories are assigned by the user before running the script. Typical thresholds when using VAST-TOOLS `vast-tools diff` output:
+
+| Category | E[dPsi] | MV[dPsi]_at_0.95 | maxPSI |
+|---|---|---|---|
+| Enhanced | < −0.10 | > 0.05 | — |
+| Silenced | > 0.10 | > 0.05 | — |
+| Control | \|E[dPsi]\| < 0.05 | — | < 0.9 |
+| Constitutive | \|E[dPsi]\| < 0.05 | — | ≥ 0.9 |
+
+---
+
+## Dependencies
+
+These are the versions the script was developed with (pandas >= 1 introduced breaking changes):
+
+```
+python=3.7.7
+pandas=0.24.2
+numpy=1.19.2
+pybedtools=0.8.1
+matplotlib=3.3.2
+seaborn=0.11.0
+scipy=1.3.1
+```
+
+---
+

-When it comes to alternative exons, an exon may be involved in multiple events, but we want to avoid plotting it many times, so we implement a hierarchy:
+## Reproducibility

-1. If an exon meets criteria for silenced or enhanced this is designated, if criteria for both is met the most extreme dPSI value is preferred.
-2. Of remaining exons, if they meet criteria for enhanced/silenced rest this is designated, if criteria for both is met the most extreme dPSI value is preferred.
-3. Of remaining exons, if they meet critera for constituitive, this is designated.
-4. Of remaining exons, if they meet critera for control, this is designated.
+The `--seed` flag (default: 42) controls the random seed used when subsetting control and constitutive exons to match the size of the largest regulated category. Setting the same seed produces identical subsets across runs. Disable subsetting entirely with `--no_subset`. 
\ No newline at end of file From 0f40fcef567352d1fc7925c12bffdcb7d07e155d Mon Sep 17 00:00:00 2001 From: Leo Wilkinson Date: Wed, 18 Mar 2026 19:52:58 +0000 Subject: [PATCH 5/5] Delete rna_maps_vastdb_only.py --- rna_maps_vastdb_only.py | 785 ---------------------------------------- 1 file changed, 785 deletions(-) delete mode 100644 rna_maps_vastdb_only.py diff --git a/rna_maps_vastdb_only.py b/rna_maps_vastdb_only.py deleted file mode 100644 index aead0d0..0000000 --- a/rna_maps_vastdb_only.py +++ /dev/null @@ -1,785 +0,0 @@ -#!/usr/bin/env python3 -""" -RNA Maps from VastDB ID Lists - -Generates RNA maps showing RBP binding patterns around categorized exons. -Uses VastDB EVENT IDs with pre-assigned categories (enhanced, silenced, control, constitutive). - -Key features: -- 6 splice site regions (upstream_3ss, upstream_5ss, middle_3ss, middle_5ss, downstream_3ss, downstream_5ss) -- Coverage plots with statistical enrichment -- Heatmaps -- No rMATS dependency -""" - -import matplotlib -import matplotlib.ticker as mticker -from matplotlib.colors import LinearSegmentedColormap -from matplotlib.gridspec import GridSpec -from matplotlib import colormaps -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['ps.fonttype'] = 42 - -from scipy.ndimage import gaussian_filter1d -from matplotlib.lines import Line2D -import pandas as pd -import numpy as np -import pybedtools as pbt -import seaborn as sns -import matplotlib.pyplot as plt -import scipy.stats as stats -import sys -import os -import argparse -import random -import string -import logging -import datetime -import time -import re - -# =============================================================================== -# CONFIGURATION -# =============================================================================== - -sns.set_style("whitegrid", {'legend.frameon':True}) -colors_dict = { - 'all': '#D3D3D3', - 'ctrl': '#408F76', - 'enh': '#F30C08', - 'sil': '#005299', - 'enhrest': '#FFB122', 
- 'silrest': '#6DC2F5', - 'const': '#666666' -} -linewidth = 3 - -# =============================================================================== -# UTILITY FUNCTIONS -# =============================================================================== - -def setup_logging(output_path): - """Sets up logging to file and console.""" - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - log_filename = f"execution_{timestamp}.log" - log_path = os.path.join(output_path, log_filename) - - logger = logging.getLogger() - logger.handlers.clear() - logger.setLevel(logging.INFO) - - file_handler = logging.FileHandler(log_path, mode='w') - file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) - - stream_handler = logging.StreamHandler() - stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) - - logger.addHandler(file_handler) - logger.addHandler(stream_handler) - - file_handler.flush = file_handler.stream.flush - - start_time = time.time() - - logging.getLogger('matplotlib').setLevel(logging.WARNING) - logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) - logging.getLogger('matplotlib.backends').setLevel(logging.ERROR) - logging.getLogger('fontTools').setLevel(logging.ERROR) - logging.getLogger('fontTools.subset').setLevel(logging.ERROR) - - logger.info(f"Starting script execution at {timestamp}") - - return log_filename, start_time, logger - - -def log_runtime(start_time, logger): - """Logs the script execution time.""" - end_time = time.time() - total_seconds = int(end_time - start_time) - minutes, seconds = divmod(total_seconds, 60) - runtime_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s" - logger.info(f"Script completed in {runtime_str}.") - for handler in logger.handlers: - handler.flush() - - -def smooth_coverage(df, window_size=10, std=2): - """Smooth coverage data using rolling Gaussian window.""" - result = df.copy() - groups = [] - - for (exon_id, 
label), group_df in result.groupby(['exon_id', 'label']): - if len(group_df) < 3: - groups.append(group_df) - continue - - group_sorted = group_df.sort_values('position') - - if len(group_sorted) >= window_size: - values = group_sorted['coverage'].values - s = pd.Series(values) - - smoothed = s.rolling( - window=window_size, - center=True, - win_type='gaussian' - ).mean(std=std) - - smoothed = smoothed.fillna(s) - group_sorted['coverage'] = smoothed.values - - groups.append(group_sorted) - - return pd.concat(groups) - - -# =============================================================================== -# VASTDB PARSING FUNCTIONS -# =============================================================================== - -def parse_event_info_coordinates(event_info_df): - """ - Parse genomic coordinates from EVENT_INFO file. - Returns DataFrame with coordinates and flanking exon info. - """ - coords_list = [] - - for idx, row in event_info_df.iterrows(): - # Parse main exon coordinates from COORD_o - coord_str = str(row['COORD_o']) - match = re.match(r'(chr[\w]+):(\d+)-(\d+)', coord_str) - if not match: - continue - - # Extract strand from REF_CO - ref_co = str(row.get('REF_CO', '')) - strand_match = re.search(r':([+-])$', ref_co) - strand = strand_match.group(1) if strand_match else '+' - - # Parse flanking exon coordinates from CO_C1 and CO_C2 - # CO_C1 format: "chr:start-end" (upstream constitutive exon) - # CO_C2 format: "chr:start-end" (downstream constitutive exon) - upstream_start, upstream_end = np.nan, np.nan - downstream_start, downstream_end = np.nan, np.nan - - if pd.notna(row.get('CO_C1', np.nan)): - co_c1_match = re.match(r'chr[\w]+:(\d+)-(\d+)', str(row['CO_C1'])) - if co_c1_match: - upstream_start = int(co_c1_match.group(1)) - upstream_end = int(co_c1_match.group(2)) - - if pd.notna(row.get('CO_C2', np.nan)): - co_c2_match = re.match(r'chr[\w]+:(\d+)-(\d+)', str(row['CO_C2'])) - if co_c2_match: - downstream_start = int(co_c2_match.group(1)) - downstream_end 
= int(co_c2_match.group(2)) - - coords_list.append({ - 'EVENT': row['EVENT'], - 'GENE': row['GENE'], - 'chr': match.group(1), - 'exonStart_1based': int(match.group(2)), - 'exonEnd': int(match.group(3)), - 'strand': strand, - 'upstreamES': upstream_start, - 'upstreamEE': upstream_end, - 'downstreamES': downstream_start, - 'downstreamEE': downstream_end, - }) - - coords_df = pd.DataFrame(coords_list) - - if len(coords_df) > 0: - logging.info(f"Parsed coordinates for {len(coords_df)} events") - - return coords_df - - -def load_vastdb_categories(enhanced_file, silenced_file, control_file, - constitutive_file, event_info_file): - """ - Load VastDB ID lists and look up coordinates. - Returns DataFrame ready for RNA map generation. - """ - - logging.info("="*60) - logging.info("LOADING VASTDB ID LISTS") - logging.info("="*60) - - # Read ID lists - def read_id_list(filepath, category): - if filepath is None: - return [] - with open(filepath) as f: - ids = [line.strip() for line in f - if line.strip() and not line.startswith('#')] - logging.info(f"Loaded {len(ids)} {category} IDs") - return ids - - enhanced_ids = read_id_list(enhanced_file, 'enhanced') - silenced_ids = read_id_list(silenced_file, 'silenced') - control_ids = read_id_list(control_file, 'control') - constitutive_ids = read_id_list(constitutive_file, 'constitutive') - - all_ids = enhanced_ids + silenced_ids + control_ids + constitutive_ids - - if len(all_ids) == 0: - raise ValueError("No EVENT IDs provided in any file!") - - logging.info(f"Total EVENT IDs: {len(all_ids)}") - - # Create ID to category mapping - id_to_category = {} - for eid in enhanced_ids: - id_to_category[eid] = 'enhanced' - for eid in silenced_ids: - id_to_category[eid] = 'silenced' - for eid in control_ids: - id_to_category[eid] = 'control' - for eid in constitutive_ids: - id_to_category[eid] = 'constituitive' # Match original script spelling - - # Load EVENT_INFO - logging.info(f"Loading EVENT_INFO: {event_info_file}") - event_info = 
pd.read_csv(event_info_file, sep='\t') - logging.info(f"EVENT_INFO contains {len(event_info)} events") - - # Parse coordinates - event_coords = parse_event_info_coordinates(event_info) - - # Filter to our IDs - event_coords_subset = event_coords[event_coords['EVENT'].isin(all_ids)].copy() - - n_matched = len(event_coords_subset) - n_total = len(all_ids) - logging.info(f"Matched {n_matched}/{n_total} IDs to coordinates ({100*n_matched/n_total:.1f}%)") - - if n_matched == 0: - raise ValueError("No EVENT IDs matched to coordinates!") - - # Adjust coordinates from 1-based to 0-based - event_coords_subset['exonStart_0base'] = event_coords_subset['exonStart_1based'] - 1 - - # Also adjust flanking exon coordinates from 1-based to 0-based - event_coords_subset['upstreamES'] = event_coords_subset['upstreamES'] - 1 - event_coords_subset['downstreamES'] = event_coords_subset['downstreamES'] - 1 - - # Assign categories - event_coords_subset['category'] = event_coords_subset['EVENT'].map(id_to_category) - - # Add fake FDR column (not used but needed for compatibility) - event_coords_subset['FDR'] = 0.001 - - # Create exon_id for tracking - event_coords_subset['exon_id'] = ( - event_coords_subset['category'] + "_" + - event_coords_subset['chr'].astype(str) + ":" + - event_coords_subset['exonStart_0base'].astype(str) + "-" + - event_coords_subset['exonEnd'].astype(str) + ";" + - event_coords_subset['strand'].astype(str) - ) - - logging.info("\nCategory distribution:") - logging.info(event_coords_subset['category'].value_counts()) - - return event_coords_subset - - -# =============================================================================== -# RNA MAP GENERATION FUNCTIONS (from original script) -# =============================================================================== - -def get_ss_bed(df, pos_col, neg_col): - """Create BED file for splice sites (handles strand orientation).""" - # Strip "chr" prefix to match CLIP file format - df = df.copy() - - ss_pos = 
df.loc[df['strand'] == "+", ['chr', pos_col, pos_col, 'exon_id', 'FDR', 'strand']] - ss_pos.columns = ['chr', 'start', 'end', 'name', 'score', 'strand'] - ss_pos.start = ss_pos.start.transform(lambda x: x-1) - - ss_n = df.loc[df['strand'] == "-", ['chr', neg_col, neg_col, 'exon_id', 'FDR', 'strand']] - ss_n.columns = ['chr', 'start', 'end', 'name', 'score', 'strand'] - ss_n.end = ss_n.end.transform(lambda x: x+1) - - ss = pd.concat([ss_pos, ss_n]) - - return ss - - -def get_coverage_plot(xl_bed, df, fai, window, exon_categories, label, smoothing=15): - """Calculate coverage of CLIP around splice sites with statistical enrichment.""" - df = df.loc[df.name != "."] - xl_bed = pbt.BedTool(xl_bed).sort() - pbt_df = pbt.BedTool.from_dataframe( - df[['chr', 'start', 'end', 'name', 'score', 'strand']] - ).sort().slop(l=window, r=window, s=True, g=fai) - - df_coverage = pbt_df.coverage( - b=xl_bed, **{'sorted': True, 's': True, 'd': True, 'nonamecheck': True} - ).to_dataframe()[['thickStart', 'thickEnd', 'strand', 'name']] - - df_coverage.rename(columns=({'thickStart':'position','thickEnd':'coverage'}), inplace=True) - - df_plot = df_coverage - - # Adjust positions based on strand - df_plot.loc[df_plot.strand=='+', 'position'] = df_plot['position'].astype('int32') - df_plot.loc[df_plot.strand=='-', 'position'] = abs(2 * window + 2 - df_plot['position']) - - df_plot = df_plot.loc[df_plot.name != "."] - - df_plot = df_plot.join( - df_plot.pop('name').str.split('_',n=1,expand=True).rename(columns={0:'name', 1:'exon_id'}) - ) - - heatmap_plot = df_plot.copy() - heatmap_plot['label'] = label - - # Aggregate coverage - df_plot = df_plot.groupby(['name','position'], as_index=False).agg({'coverage':'sum'}) - - exon_cat = pd.DataFrame({'name':exon_categories.index, 'number_exons':exon_categories.values}) - df_plot = df_plot.merge(exon_cat, how="left") - - df_plot['norm_coverage'] = np.where( - df_plot['coverage'] == 0, - df_plot['coverage'], - 
df_plot['coverage']/df_plot['number_exons'] - ) - - # Fisher test against control - df_plot_ctrl = df_plot.loc[df_plot.name == "control"][["position","coverage","number_exons"]] - df_plot_ctrl.columns = ["position","control_coverage","control_number_exons"] - df_plot = df_plot.merge(df_plot_ctrl, how="left") - df_plot['control_norm_coverage'] = df_plot["control_coverage"] / df_plot["control_number_exons"] - - # Pseudo-count for zero values - df_plot.loc[df_plot['control_norm_coverage'] == 0, ['control_norm_coverage']] = 0.000001 - df_plot['fold_change'] = df_plot["norm_coverage"] / df_plot["control_norm_coverage"] - - # Statistical test - contingency_table = list(zip( - df_plot['coverage'], - df_plot['number_exons']-df_plot['coverage'], - df_plot['control_coverage'], - df_plot['control_number_exons'] - df_plot['control_coverage'] - )) - contingency_table = [np.array(table).reshape(2,2) for table in contingency_table] - df_plot['pvalue'] = [stats.fisher_exact(table)[1] for table in contingency_table] - - df_plot['-log10pvalue'] = np.log10(1/df_plot['pvalue']) - df_plot['label'] = label - - df_plot.loc[df_plot['fold_change'] < 1, ['-log10pvalue']] = df_plot['-log10pvalue'] * -1 - df_plot['-log10pvalue_smoothed'] = df_plot['-log10pvalue'].rolling( - smoothing, center=True, win_type="gaussian" - ).mean(std=2) - - return df_plot, heatmap_plot - - -def set_legend_text(legend, exon_categories, original_counts=None): - """Set legend text with optional subset information.""" - exon_cat = pd.DataFrame({'name': exon_categories.index, 'number_exons': exon_categories.values}) - categories = exon_cat['name'].unique() - legend.set_bbox_to_anchor([1.08, 0.75]) - legend.set_title("") - - legend_idx = 0 - - for category in ['constituitive', 'control', 'enhanced', 'silenced']: - if category in categories: - count = exon_cat[exon_cat['name'] == category]['number_exons'].values[0] - text = f"{category.capitalize()} ({count}" - - if original_counts and category in original_counts: - 
orig_count = original_counts[category] - if orig_count > count: - text += f", subset from {orig_count}" - - text += ")" - legend.texts[legend_idx].set_text(text) - legend_idx += 1 - - -def add_enrichment_marker(fig, ax): - """Add enrichment/depletion marker with arrows.""" - y_min, y_max = ax.get_ylim() - y_zero_position = 0 - marker_height = 0.4 - normalized_bottom = (y_zero_position - y_min) / (y_max - y_min) - marker_height / 2 - - marker_ax = fig.add_axes([0.94, normalized_bottom, 0.06, marker_height], - label=f"enrichment_marker_{ax.get_title()}") - - marker_ax.set_xticks([]) - marker_ax.set_yticks([]) - for spine in marker_ax.spines.values(): - spine.set_visible(False) - - marker_ax.text(0.5, 0.5, "↑\nenriched\n\n\n↓\ndepleted", - ha='center', va='center', fontsize=11, - transform=marker_ax.transAxes) - - return marker_ax - - -# =============================================================================== -# MAIN RNA MAP FUNCTION -# =============================================================================== - -def run_rna_maps(enhanced_file, silenced_file, control_file, constitutive_file, - event_info_file, xl_bed, fai, genome_fasta, window, smoothing, - output_dir, no_constitutive, no_subset, all_sites, prefix): - """ - Main function to generate RNA maps from VastDB ID lists. 
- """ - - log_filename, start_time, logger = setup_logging(output_dir) - - FILEname = prefix - - # Load chromosome list - df_fai = pd.read_csv(fai, sep='\t', header=None) - chroms = set(df_fai[0].values) - - # Load VastDB categories and coordinates - df_rmats = load_vastdb_categories( - enhanced_file, silenced_file, control_file, - constitutive_file, event_info_file - ) - - # Filter to valid chromosomes - df_rmats = df_rmats[df_rmats['chr'].isin(chroms)] - - # Remove exons with missing flanking coordinates - logging.info("\nFiltering exons with complete flanking coordinates...") - before = len(df_rmats) - df_rmats = df_rmats.dropna(subset=['upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE']) - after = len(df_rmats) - logging.info(f"Removed {before - after} exons with missing flanking coordinates") - logging.info(f"Remaining: {after} exons") - - exon_categories = df_rmats.groupby('category').size() - logging.info("\nExons in each category:") - logging.info(exon_categories) - - # Check for required categories - if "control" not in exon_categories or exon_categories.loc["control"] == 0: - raise ValueError("No control exons found!") - - if "enhanced" not in exon_categories and "silenced" not in exon_categories: - raise ValueError("No regulated exons found!") - - # Apply subsetting - if not no_subset: - category_counts = df_rmats['category'].value_counts() - original_counts = {cat: count for cat, count in category_counts.items()} - - np.random.seed(42) - logging.info("Random seed set to 42 for reproducible subsetting") - - target_count = 0 - if 'enhanced' in category_counts and 'silenced' in category_counts: - target_count = max(category_counts['enhanced'], category_counts['silenced']) - elif 'enhanced' in category_counts: - target_count = category_counts['enhanced'] - elif 'silenced' in category_counts: - target_count = category_counts['silenced'] - - # Subset control - if 'control' in category_counts and category_counts['control'] > target_count > 0: - 
control_indices = df_rmats[df_rmats['category'] == 'control'].index - control_indices_to_keep = np.random.choice(control_indices, target_count, replace=False) - drop_mask = df_rmats.index.isin(control_indices) & ~df_rmats.index.isin(control_indices_to_keep) - df_rmats = df_rmats[~drop_mask] - logging.info(f"Subsetted control exons from {category_counts['control']} to {target_count}") - - # Subset constitutive - if not no_constitutive and 'constituitive' in category_counts and category_counts['constituitive'] > target_count > 0: - const_indices = df_rmats[df_rmats['category'] == 'constituitive'].index - const_indices_to_keep = np.random.choice(const_indices, target_count, replace=False) - drop_mask = df_rmats.index.isin(const_indices) & ~df_rmats.index.isin(const_indices_to_keep) - df_rmats = df_rmats[~drop_mask] - logging.info(f"Subsetted constitutive exons from {category_counts['constituitive']} to {target_count}") - else: - logging.info("Subsetting disabled") - category_counts = df_rmats['category'].value_counts() - original_counts = {cat: count for cat, count in category_counts.items()} - - exon_categories = df_rmats.groupby('category').size() - - # Save categorized exons - output_file = os.path.join(output_dir, f"{FILEname}_VastDB_with_categories.tsv") - df_rmats[['chr', 'exonStart_0base', 'exonEnd', 'strand', 'category', 'EVENT', 'GENE', - 'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE']].to_csv( - output_file, sep='\t', index=False - ) - logging.info(f"Saved categorized exons to {output_file}") - - # Create BED files for 6 regions - logging.info("\n" + "="*60) - logging.info("CREATING BED FILES FOR SPLICE SITES") - logging.info("="*60) - - middle_3ss_bed = get_ss_bed(df_rmats, 'exonStart_0base', 'exonEnd') - middle_5ss_bed = get_ss_bed(df_rmats, 'exonEnd', 'exonStart_0base') - downstream_3ss_bed = get_ss_bed(df_rmats, 'downstreamES', 'downstreamEE') - upstream_5ss_bed = get_ss_bed(df_rmats, 'upstreamEE', 'upstreamES') - - if all_sites: - 
downstream_5ss_bed = get_ss_bed(df_rmats, 'downstreamEE', 'downstreamES') - upstream_3ss_bed = get_ss_bed(df_rmats, 'upstreamES', 'upstreamEE') - - # Calculate coverage - if xl_bed is not None: - logging.info("\n" + "="*60) - logging.info("CALCULATING COVERAGE") - logging.info("="*60) - - middle_3ss = get_coverage_plot(xl_bed, middle_3ss_bed, fai, window, exon_categories, 'middle_3ss', smoothing) - middle_5ss = get_coverage_plot(xl_bed, middle_5ss_bed, fai, window, exon_categories, 'middle_5ss', smoothing) - downstream_3ss = get_coverage_plot(xl_bed, downstream_3ss_bed, fai, window, exon_categories, 'downstream_3ss', smoothing) - upstream_5ss = get_coverage_plot(xl_bed, upstream_5ss_bed, fai, window, exon_categories, 'upstream_5ss', smoothing) - - linegraph_middle_3ss = middle_3ss[0] - linegraph_middle_5ss = middle_5ss[0] - linegraph_downstream_3ss = downstream_3ss[0] - linegraph_upstream_5ss = upstream_5ss[0] - - heatmap_middle_3ss = middle_3ss[1] - heatmap_middle_5ss = middle_5ss[1] - heatmap_downstream_3ss = downstream_3ss[1] - heatmap_upstream_5ss = upstream_5ss[1] - - if not all_sites: - plotting_df = pd.concat([linegraph_upstream_5ss, linegraph_middle_3ss, - linegraph_middle_5ss, linegraph_downstream_3ss]) - heat_df = pd.concat([heatmap_upstream_5ss, heatmap_middle_3ss, - heatmap_middle_5ss, heatmap_downstream_3ss]) - else: - downstream_5ss = get_coverage_plot(xl_bed, downstream_5ss_bed, fai, window, exon_categories, 'downstream_5ss', smoothing) - upstream_3ss = get_coverage_plot(xl_bed, upstream_3ss_bed, fai, window, exon_categories, 'upstream_3ss', smoothing) - - linegraph_downstream_5ss = downstream_5ss[0] - linegraph_upstream_3ss = upstream_3ss[0] - heatmap_downstream_5ss = downstream_5ss[1] - heatmap_upstream_3ss = upstream_3ss[1] - - plotting_df = pd.concat([linegraph_middle_3ss, linegraph_middle_5ss, - linegraph_downstream_3ss, linegraph_downstream_5ss, - linegraph_upstream_3ss, linegraph_upstream_5ss]) - heat_df = pd.concat([heatmap_middle_3ss, 
heatmap_middle_5ss, - heatmap_downstream_3ss, heatmap_downstream_5ss, - heatmap_upstream_3ss, heatmap_upstream_3ss]) - - # Save data - plotting_df.to_csv(f'{output_dir}/{FILEname}_RNAmap.tsv', sep="\t", index=False) - - # Plot RNA maps - logging.info("\n" + "="*60) - logging.info("PLOTTING RNA MAPS") - logging.info("="*60) - - sns.set(rc={'figure.figsize':(7, 5)}) - sns.set_style("whitegrid") - - if not all_sites: - col_order = ["upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss"] - titles = ["Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS"] - col_wrap = 4 - else: - col_order = ["upstream_3ss", "upstream_5ss", "middle_3ss", "middle_5ss", "downstream_3ss", "downstream_5ss"] - titles = ["Upstream 3'SS", "Upstream 5'SS", "Middle 3'SS", "Middle 5'SS", "Downstream 3'SS", "Downstream 5'SS"] - col_wrap = 6 - - g = sns.relplot( - data=plotting_df, - x='position', - y='-log10pvalue_smoothed', - hue='name', - col='label', - facet_kws={"sharex":False}, - kind='line', - col_wrap=col_wrap, - height=5, - aspect=4/5, - col_order=col_order - ) - - for ax, title in zip(g.axes.flat, titles): - ax.set_title(title) - ax.axhline(y=0, color='k', alpha=0.2, linewidth=0.5) - fig = plt.gcf() - marker_ax = add_enrichment_marker(fig, ax) - - g.set(xlabel='') - g.axes[0].set_ylabel('-log10(p value) enrichment / control') - - sns.move_legend( - g, "upper right", - bbox_to_anchor=(1, 2), - ncol=1, - title=None, - frameon=False - ) - leg = g._legend - set_legend_text(leg, exon_categories, original_counts) - - # Add exon-intron drawings - rect_fraction = 1 / ((window + 50) / 50) - - for i, ss_type in enumerate(col_order): - ax = g.axes[i] - is_middle = ss_type.startswith('middle_') - exon_color = "midnightblue" if is_middle else "slategrey" - - if ss_type.endswith('_3ss'): - # 3' splice site: exon on right - ax.set_xlim([0, window + 50]) - ticks = np.arange(0, window + 51, 50) - labels = ["" if t in (ticks[0], ticks[-1]) else str(int(t - window)) for t in ticks] - 
ax.set_xticks(ticks) - ax.set_xticklabels(labels) - - rect = matplotlib.patches.Rectangle( - xy=(1 - rect_fraction, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - - rect = matplotlib.patches.Rectangle( - xy=(0, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - else: - # 5' splice site: exon on left - ax.set_xlim([window - 50, window * 2]) - ticks = np.arange(window - 50, window * 2 + 1, 50) - labels = ["" if t in (ticks[0], ticks[-1]) else str(int(t - window)) for t in ticks] - ax.set_xticks(ticks) - ax.set_xticklabels(labels) - - rect = matplotlib.patches.Rectangle( - xy=(0, -0.2), width=rect_fraction, height=.1, - color=exon_color, alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - - rect = matplotlib.patches.Rectangle( - xy=(rect_fraction, -0.15), width=1 - rect_fraction, height=.001, - color="slategrey", alpha=1, - transform=ax.transAxes, clip_on=False) - ax.add_artist(rect) - - plt.subplots_adjust(wspace=0.05) - plt.savefig(f'{output_dir}/{FILEname}_RNAmap_-log10pvalue.pdf', - bbox_extra_artists=([leg, rect, marker_ax]), - bbox_inches='tight', - pad_inches=0.8) - - logging.info(f"Saved RNA map to {output_dir}/{FILEname}_RNAmap_-log10pvalue.pdf") - - pbt.helpers.cleanup() - - log_runtime(start_time, logger) - logging.info("="*60) - logging.info("SCRIPT COMPLETED SUCCESSFULLY") - logging.info("="*60) - - -# =============================================================================== -# COMMAND LINE INTERFACE -# =============================================================================== - -def main(): - parser = argparse.ArgumentParser( - description='Generate RNA maps from VastDB ID lists', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Example: - python rna_maps_vastdb_only.py \\ - --vastdb_enhanced enhanced_ids.txt \\ - 
--vastdb_silenced silenced_ids.txt \\ - --vastdb_control control_ids.txt \\ - --vastdb_constitutive constitutive_ids.txt \\ - --vastdb_annotation EVENT_INFO-hg38.tab \\ - -x CLIP.bed -f genome.fa -fi genome.fa.fai \\ - -o output -p PTBP1 - """ - ) - - # Required arguments - required = parser.add_argument_group('Required arguments') - required.add_argument('--vastdb_annotation', required=True, - help='VastDB EVENT_INFO file') - required.add_argument('-x', '--clip', required=True, - help='CLIP crosslinks BED file') - required.add_argument('-f', '--fasta', required=True, - help='Genome FASTA file') - required.add_argument('-fi', '--fasta_index', required=True, - help='Genome FASTA index (.fai)') - required.add_argument('-o', '--output_dir', required=True, - help='Output directory') - required.add_argument('-p', '--prefix', required=True, - help='Output file prefix') - - # VastDB ID lists - vastdb_group = parser.add_argument_group('VastDB ID lists (at least one required)') - vastdb_group.add_argument('--vastdb_enhanced', - help='Enhanced exon IDs (one per line)') - vastdb_group.add_argument('--vastdb_silenced', - help='Silenced exon IDs (one per line)') - vastdb_group.add_argument('--vastdb_control', - help='Control exon IDs (one per line)') - vastdb_group.add_argument('--vastdb_constitutive', - help='Constitutive exon IDs (one per line)') - - # Optional arguments - optional = parser.add_argument_group('Optional arguments') - optional.add_argument('-w', '--window', type=int, default=300, - help='Window around splice sites (default: 300)') - optional.add_argument('-s', '--smoothing', type=int, default=15, - help='Smoothing window (default: 15)') - optional.add_argument('-nc', '--no_constitutive', action='store_true', - help='Exclude constitutive exons') - optional.add_argument('-ns', '--no_subset', action='store_true', - help='Disable subsetting') - optional.add_argument('-ao', '--all_sites', action='store_true', - help='Include all 6 splice sites (default: 4 core 
sites)') - - args = parser.parse_args() - - # Validate at least one ID list provided - if not any([args.vastdb_enhanced, args.vastdb_silenced, - args.vastdb_control, args.vastdb_constitutive]): - parser.error("At least one VastDB ID list must be provided") - - # Create output directory - os.makedirs(args.output_dir, exist_ok=True) - - # Run RNA maps - run_rna_maps( - enhanced_file=args.vastdb_enhanced, - silenced_file=args.vastdb_silenced, - control_file=args.vastdb_control, - constitutive_file=args.vastdb_constitutive, - event_info_file=args.vastdb_annotation, - xl_bed=args.clip, - fai=args.fasta_index, - genome_fasta=args.fasta, - window=args.window, - smoothing=args.smoothing, - output_dir=args.output_dir, - no_constitutive=args.no_constitutive, - no_subset=args.no_subset, - all_sites=args.all_sites, - prefix=args.prefix - ) - - -if __name__ == '__main__': - main()