-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_ref_from_samplesheet.py
More file actions
executable file
Β·140 lines (109 loc) Β· 5.02 KB
/
process_ref_from_samplesheet.py
File metadata and controls
executable file
Β·140 lines (109 loc) Β· 5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
import subprocess
import os
import sys
import argparse
def get_column_name(df, possible_names, field_label):
"""
Helper function to find which column name from a list exists in the DataFrame.
"""
for name in possible_names:
if name in df.columns:
return name
raise ValueError(f"Could not find a column for '{field_label}'. Tried: {possible_names}")
def process_sample_sheet(file_path, base_output_dir):
print(f"π¬ Starting processing for sample sheet: {file_path}")
print(f"π¦ Using base output directory: {base_output_dir}")
try:
# 1. Load the TSV file
df = pd.read_csv(file_path, sep='\t')
# 2. Identify the correct column names dynamically
try:
# Other columns
col_ref = get_column_name(df, ['reference', 'TGen.location.reference'], "Reference Gepnome")
col_report = get_column_name(df, ['asm_report', 'assemblyreport'], "Assembly Report")
col_assembly = get_column_name(df, ['assembly'], "Assembly ID")
print(f" Detailed Column Mapping:")
print(f" - Reference column found: '{col_ref}'")
print(f" - Report column found: '{col_report}'")
print(f" - Assembly ID column found: '{col_assembly}'")
except ValueError as e:
print(f"β Error: {e}", file=sys.stderr)
return
# 3. Drop missing rows based on the columns we just found
required_cols = [col_ref, col_report, col_assembly]
df.dropna(subset=required_cols, inplace=True)
except FileNotFoundError:
print(f"Error: File not found at {file_path}", file=sys.stderr)
return
except pd.errors.ParserError:
print(f"Error: Could not parse TSV file at {file_path}. Check file format.", file=sys.stderr)
return
# ---
## 𧬠Identify Unique References
# Select only the columns needed
unique_refs_df = df[required_cols].drop_duplicates()
commands_to_run = []
for index, row in unique_refs_df.iterrows():
# Get the values using the dynamic column names
reference = row[col_ref]
assembly_report = row[col_report]
assembly_id = row[col_assembly]
# Define the final output directory path
output_dir = os.path.join(base_output_dir, assembly_id)
# Define the final expected reference file path
expected_ref_file = os.path.join(output_dir, f"{assembly_id}.ref.fa")
# --- NEW CHECK: Skip if the final reference file already exists ---
if os.path.isfile(expected_ref_file):
print(f"β© Skipping {assembly_id}: Expected reference file already exists at {expected_ref_file}")
continue
# -------------------------------------------------------------------
# Construct the command
command = [
'./call_prep.sh',
'-r', str(reference),
'-o', str(output_dir),
'-a', str(assembly_report),
'-n', str(assembly_id)
]
commands_to_run.append(command)
generate500kb_command = [
'./generate_qc_500kb_ref.sh',
'-r', str(reference),
'-o', str(output_dir),
'-i', str(assembly_id)
]
commands_to_run.append(generate500kb_command)
# ---
## π Execute Commands
if not commands_to_run:
print("\nNo unique references found that require processing.")
return
print(f"\nFound {len(commands_to_run)} unique reference(s) to process.")
for command in commands_to_run:
command_str = " ".join(command)
print(f"\nExecuting: {command_str}")
try:
result = subprocess.run(
command,
check=True,
text=True,
capture_output=True
)
print(f" β
SUCCESS: \n{result.stdout.strip()}")
if result.stderr:
print(f" (StdErr captured: {result.stderr.strip()})")
except subprocess.CalledProcessError as e:
print(f" β ERROR: Command failed with exit code {e.returncode}.", file=sys.stderr)
print(f" Stderr: {e.stderr.strip()}", file=sys.stderr)
except FileNotFoundError:
print(f" β ERROR: 'call_prep.sh' script not found.", file=sys.stderr)
except Exception as e:
print(f" β An unexpected error occurred: {e}", file=sys.stderr)
print("\nProcessing complete.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process a TSV sample sheet to run call_prep.sh.")
parser.add_argument('-s', '--sample-sheet', type=str, required=True, help='Path to input TSV.')
parser.add_argument('-d', '--outdir', type=str, default=True, help='Reference output directory.')
args = parser.parse_args()
process_sample_sheet(args.sample_sheet, args.outdir)