From 77769827c88ccf5fd354d12da59b3ffa7a63774f Mon Sep 17 00:00:00 2001 From: Alescs Date: Fri, 23 May 2025 16:06:02 +0200 Subject: [PATCH 1/3] fix: run.py formatting and shebang --- run.py | 235 +++++++++++++++++++++++++++------------------------------ 1 file changed, 112 insertions(+), 123 deletions(-) diff --git a/run.py b/run.py index 688d8cf..d45801e 100644 --- a/run.py +++ b/run.py @@ -1,187 +1,176 @@ +#!/usr/bin/env python3 + import os import argparse import sys import time import subprocess -from subprocess import Popen, PIPE def cmd_exists(cmd): return subprocess.call("type " + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 +def print_msg(msg): + ts = time.strftime("%c") + print(f"{ts}: {msg}") + +def print_err(msg): + ts = time.strftime("%c") + print(f"{ts}: {msg}", file=sys.stderr) + + def main(): - cwd=os.path.dirname(os.path.abspath(__file__)) + cwd = os.path.dirname(os.path.abspath(__file__)) + print("CWD:", cwd) parser = argparse.ArgumentParser(description="MetaCarvel: A scaffolding tool for metagenomic assemblies") - parser.add_argument("-a","--assembly",help="assembled contigs",required=True) - parser.add_argument("-m","--mapping", help="mapping of read to contigs in bam format",required=True) - parser.add_argument("-d","--dir",help="output directory for results",default='out',required=True) - parser.add_argument("-r",'--repeats',help="To turn repeat detection on",default="true") - parser.add_argument("-k","--keep", help="Set this to keep temporary files in output directory",default=False) - parser.add_argument("-l","--length",help="Minimum length of contigs to consider for scaffolding in base pairs (bp)",default=500) - parser.add_argument("-b","--bsize",help="Minimum mate pair support between contigs to consider for scaffolding",default=3) - parser.add_argument("-v",'--visualization',help="Generate a .db file for the MetagenomeScope visualization tool",default=False) + parser.add_argument("-a", "--assembly", help="assembled contigs", required=True) + parser.add_argument("-m", "--mapping", help="mapping of read to contigs in bam format", required=True) + parser.add_argument("-d", "--dir", help="output directory for results", default='out', required=True) + parser.add_argument("-r", '--repeats',help="To turn repeat detection on", default="true") + parser.add_argument("-k", "--keep", help="Set this to keep temporary files in output directory", default=False) + parser.add_argument("-l", "--length", help="Minimum length of contigs to consider for scaffolding in base pairs (bp)", default=500) + parser.add_argument("-b", "--bsize", help="Minimum mate pair support between contigs to consider for scaffolding", default=3) + parser.add_argument("-v", '--visualization', help="Generate a .db file for the MetagenomeScope visualization tool", default=False) args = parser.parse_args() try: - import networkx + import networkx except ImportError: - raise ImportError('Looks like you do not have networkx. Please rerun with networkx module installed.') - sys.exit(1) + print_err('Looks like you do not have networkx. Please rerun with networkx module installed.') + sys.exit(1) + version = networkx.__version__ - version_id = int(version.split('.')[1]) - first = int(version.split('.')[0]) - print('Networkx ' + version + ' found', file=sys.stderr) - # if first != 1: - # print(time.strftime("%c")+': Networkx should be 1.11 or earlier.. Terminating...\n', file=sys.stderr) - # sys.exit(1) + print_msg(f"Networkx {version} found") + if not cmd_exists('samtools'): - print(time.strftime("%c")+': Samtools does not exist in PATH. Terminating....\n', file=sys.stderr) - sys.exit(1) + print_err('Samtools does not exist in PATH. Terminating...') + sys.exit(1) if not cmd_exists('bamToBed'): - print(time.strftime("%c")+': Bedtools does not exist in PATH. Terminating....\n', file=sys.stderr) - sys.exit(1) + print_err('Bedtools does not exist in PATH. Terminating...') + sys.exit(1) if not os.path.exists(args.dir): os.makedirs(args.dir) - print(time.strftime("%c")+':Starting scaffolding..', file=sys.stderr) - - - if os.path.exists(args.dir+'/alignment.bed') == False: - print("converting bam file to bed file", file=sys.stderr) - #os.system('bamToBed -i ' + args.mapping + " > " + args.dir+'/alignment.bed') + + print_msg('Starting scaffolding...') + if not os.path.exists(args.dir + '/alignment.bed'): + print_msg("Converting bam file to bed file") try: - p = subprocess.check_output('bamToBed -i ' + args.mapping + " > " + args.dir+'/alignment.bed', shell=True) - print('finished conversion', file=sys.stderr) + subprocess.check_output('bamToBed -i ' + args.mapping + " > " + args.dir+'/alignment.bed', shell=True) + print_msg('Finished conversion') except subprocess.CalledProcessError as err: - os.system("rm " + args.dir+'/alignment.bed') - print(time.strftime("%c")+': Failed in coverting bam file to bed format, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + os.system(f"rm {args.dir}/alignment.bed") + print_err('Failed in coverting bam file to bed format, terminating scaffolding...' + str(err.output)) + sys.exit(1) try: - #os.system('samtools faidx '+args.assembly) - p = subprocess.check_output('samtools faidx '+args.assembly,shell=True) + subprocess.check_output(f"samtools faidx {args.assembly}", shell=True) except subprocess.CalledProcessError as err: - print(str(err.output), file=sys.stderr) - sys.exit() - os.system('cut -f 1,2 '+ args.assembly+'.fai > '+args.dir+'/contig_length') - - print(time.strftime("%c")+':Finished conversion', file=sys.stderr) + print_err(str(err.output)) + sys.exit(1) - final_assembly = args.assembly - final_mapping = args.mapping + os.system(f"cut -f 1,2 {args.assembly}.fai > {args.dir}/contig_length") + print_msg('Finished conversion') - print(time.strftime("%c") + ':Started generating links between contigs', file=sys.stderr) - if os.path.exists(args.dir+'/contig_links') == False: - #print './libcorrect -l' + args.lib + ' -a' + args.dir+'/alignment.bed -d ' +args.dir+'/contig_length -o '+ args.dir+'/contig_links' + print_msg('Started generating links between contigs') + if not os.path.exists(f"{args.dir}/contig_links"): try: - #os.system('./libcorrect -l ' + args.lib + ' -a ' + args.dir+'/alignment.bed -d ' +args.dir+'/contig_length -o '+ args.dir+'/contig_links -x '+args.dir+'/contig_coverage') - p = subprocess.check_output(cwd+'/libcorrect -a ' + args.dir+'/alignment.bed -d ' +args.dir+'/contig_length -o '+ args.dir+'/contig_links -x '+args.dir+'/contig_coverage -c '+str(args.length),shell=True) - print(time.strftime("%c") +':Finished generating links between contigs', file=sys.stderr) + subprocess.check_output(f"{cwd}/libcorrect -a {args.dir}/alignment.bed \ + -d {args.dir}/contig_length -o {args.dir}/contig_links \ + -x {args.dir}/contig_coverage -c {str(args.length)}", shell=True) + print_msg('Finished generating links between contigs') except subprocess.CalledProcessError as err: - os.system('rm '+args.dir+'/contig_links') - print(time.strftime("%c")+': Failed in generate links from bed file, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + os.system(f"rm {args.dir}/contig_links") + print_err('Failed in generate links from bed file, terminating scaffolding...' + str(err.output)) sys.exit(1) - print(time.strftime("%c")+':Started bulding of links between contigs', file=sys.stderr) - if os.path.exists(args.dir+'/bundled_links') == False: + print_msg('Started bulding links between contigs') + if not os.path.exists(f"{args.dir}/bundled_links"): try: - #os.system('./bundler -l '+ args.dir+'/contig_links -o ' + args.dir+'/bundled_links + -b '+args.dir+'/bundled_graph.gml') - p = subprocess.check_output(cwd+'/bundler -l '+ args.dir+'/contig_links -o ' + args.dir+'/bundled_links + -b '+args.dir+'/bundled_graph.gml -c '+str(args.bsize), shell=True) - print(time.strftime("%c")+':Finished bundling of links between contigs', file=sys.stderr) + subprocess.check_output(f"{cwd}/bundler -l {args.dir}/contig_links \ + -o {args.dir}/bundled_links + -b {args.dir}/bundled_graph.gml \ + -c {str(args.bsize)}", shell=True) + print_msg('Finished bundling of links between contigs') except subprocess.CalledProcessError as err: - os.system('rm '+args.dir+'/bundled_links') - os.system('rm '+args.dir+'/bundled_graph.gml') - print(time.strftime("%c")+': Failed to bundle links, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + os.system(f"rm {args.dir}/bundled_links") + os.system(f"rm {args.dir}/bundled_graph.gml") + print_err('Failed to bundle links, terminating scaffolding...' + str(err.output)) + sys.exit(1) if args.repeats == "true": - print(time.strftime("%c")+':Started finding and removing repeats', file=sys.stderr) - try: - p = subprocess.check_output(cwd+'/orientcontigs -l '+args.dir+'/bundled_links -c '+ args.dir+'/contig_length --bsize -o ' +args.dir+'/oriented.gml -p ' + args.dir+'/oriented_links -i '+args.dir+'/invalidated_counts',shell=True) + print_msg('Started finding and removing repeats') + try: + subprocess.check_output(f"{cwd}/orientcontigs -l {args.dir}/bundled_links \ + -c {args.dir}/contig_length --bsize -o {args.dir}/oriented.gml \ + -p {args.dir}/oriented_links -i {args.dir}/invalidated_counts", shell=True) except subprocess.CalledProcessError as err: - print(time.strftime("%c") + ': Failed to find repeats, terminating scaffolding...\n' + str(err.output), file=sys.stderr) + print_err('Failed to find repeats, terminating scaffolding...' + str(err.output)) try: - p = subprocess.check_output('python '+cwd+'/centrality.py -g '+args.dir+'/bundled_links -l ' + args.dir+ '/contig_length -o '+args.dir+'/high_centrality.txt' ,shell=True) + subprocess.check_output(f"python {cwd}/centrality.py -g {args.dir}/bundled_links \ + -l {args.dir}/contig_length -o {args.dir}/high_centrality.txt", shell=True) except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to find repeats, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + print_err('Failed to find repeats, terminating scaffolding...' + str(err.output)) + sys.exit(1) try: - p = subprocess.check_output('python '+cwd+'/repeat_filter.py '+args.dir+'/contig_coverage ' + args.dir+ '/bundled_links ' + args.dir+'//invalidated_counts ' + args.dir+'/high_centrality.txt ' + args.dir+ '/contig_length '+ args.dir+'/repeats > ' + args.dir+'//bundled_links_filtered',shell=True) + subprocess.check_output(f"python {cwd}/repeat_filter.py {args.dir}/contig_coverage \ + {args.dir}/bundled_links {args.dir}/invalidated_counts \ + {args.dir}/high_centrality.txt {args.dir}/contig_length {args.dir}/repeats \ + > {args.dir}/bundled_links_filtered", shell=True) except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to find repeats, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + print_err('Failed to find repeats, terminating scaffolding...' + str(err.output)) sys.exit(1) - print(time.strftime("%c")+':Finished repeat finding and removal', file=sys.stderr) + + print_msg('Finished repeat finding and removal') else: - os.system('mv '+args.dir+'/bundled_links ' + args.dir+'/bundled_links_filtered') - print(time.strftime("%c")+':Started orienting the contigs', file=sys.stderr) - # if os.path.exists(args.dir+'/oriented_links') == False: - #os.system('./orientcontigs -l '+args.dir+'/bundled_links_filtered -c '+ args.dir+'/contig_length --bsize -o ' +args.dir+'/oriented.gml -p ' + args.dir+'/oriented_links' ) + os.system(f"mv {args.dir}/bundled_links {args.dir}/bundled_links_filtered") + + print_msg('Started orienting the contigs') try: - p = subprocess.check_output(cwd+'/orientcontigs -l '+args.dir+'/bundled_links_filtered -c '+ args.dir+'/contig_length --bsize -o ' +args.dir+'/oriented.gml -p ' + args.dir+'/oriented_links -i '+args.dir+'/invalidated_counts',shell=True) - print(time.strftime("%c")+':Finished orienting the contigs', file=sys.stderr) + subprocess.check_output(f"{cwd}/orientcontigs -l {args.dir}/bundled_links_filtered -c {args.dir}/contig_length \ + --bsize -o {args.dir}/oriented.gml -p {args.dir}/oriented_links \ + -i {args.dir}/invalidated_counts", shell=True) + print_msg('Finished orienting the contigs') except subprocess.CalledProcessError: - print(time.strftime("%c")+': Failed to Orient contigs, terminating scaffolding....', file=sys.stderr) + print_err('Failed to orient the contigs, terminating scaffolding...') - print(time.strftime("%c")+':Started finding separation pairs', file=sys.stderr) - #if os.path.exists(args.dir+'/seppairs') == False: - #os.system('./spqr -l ' + args.dir+'/oriented_links -o ' + args.dir+'/seppairs') + print_msg('Started finding separation pairs') try: - p = subprocess.check_output(cwd+'/spqr -l ' + args.dir+'/oriented_links -o ' + args.dir+'/seppairs',shell=True) - print(time.strftime("%c")+':Finished finding spearation pairs', file=sys.stderr) + subprocess.check_output(f"{cwd}/spqr -l {args.dir}/oriented_links -o {args.dir}/seppairs", shell=True) + print_err('Finished finding spearation pairs') except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to decompose graph, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + print_err('Failed to decompose graph, terminating scaffolding...' + str(err.output)) sys.exit(1) - print(time.strftime("%c")+':Finding the layout of contigs', file=sys.stderr) - if os.path.exists(args.dir+'/scaffolds.fasta') == False: + print_msg('Finding the layout of contigs') + if not os.path.exists(f"{args.dir}/scaffolds.fasta"): try: - p = subprocess.check_output('python '+cwd+'/layout.py -a '+ args.assembly +' -b '+args.dir+'/bubbles.txt' +' -g ' + args.dir+'/oriented.gml -s '+args.dir+'/seppairs -o '+args.dir+'/scaffolds.fa -f '+args.dir+'/scaffolds.agp -e '+args.dir+'/scaffold_graph.gfa',shell=True) - print(time.strftime("%c")+':Final scaffolds written, Done!', file=sys.stderr) + subprocess.check_output(f"python {cwd}/layout.py -a {args.assembly} -b {args.dir}/bubbles.txt \ + -g {args.dir}/oriented.gml -s {args.dir}/seppairs -o {args.dir}/scaffolds.fa \ + -f {args.dir}/scaffolds.agp -e {args.dir}/scaffold_graph.gfa", shell=True) + print_msg('Final scaffolds written, Done!') except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to generate scaffold sequences, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + print_err('Failed to generate scaffold sequences, terminating scaffolding...' + str(err.output)) if args.visualization == "true": - #try: - graphpath = os.path.abspath(args.dir+'/oriented.gml') - bubblepath = os.path.abspath(args.dir+'/bubbles.txt') - # Output the MetagenomeScope .db file directly to args.dir. The only file - # created by collate.py here is the mgsc.db file. - os.system('python '+cwd+'/MetagenomeScope/graph_collator/collate.py -i ' - + graphpath + ' -w -ub ' + bubblepath + ' -ubl -d ' + args.dir - + ' -o mgsc') - #p = subprocess.check_output('python '+cwd+'/MetagenomeScope/graph_collator/collate.py -i ' + graphpath + ' -w -ub ' + bubblepath + ' -ubl -d ' + args.dir + ' -o mgsc') - #except subprocess.CalledProcessError as err: - #print >> sys.stderr, time.strftime("%c")+": Failed to run MetagenomeScope \n" + str(err.output) + graphpath = os.path.abspath(f"{args.dir}/oriented.gml") + bubblepath = os.path.abspath(f"{args.dir}/bubbles.txt") + # Output the MetagenomeScope .db file directly to args.dir. The only file + # created by collate.py here is the mgsc.db file. + os.system(f"python {cwd}/MetagenomeScope/graph_collator/collate.py -i {graphpath} -w -ub \ + {bubblepath} -ubl -d {args.dir} -o mgsc") if not args.keep == "true": - if os.path.exists(args.dir+'/contig_length'): - os.system("rm "+args.dir+'/contig_length') - if os.path.exists(args.dir+'/contig_links'): - os.system("rm "+args.dir+'/contig_links') - if os.path.exists(args.dir+'/contig_coverage'): - os.system("rm "+args.dir+'/contig_coverage') - if os.path.exists(args.dir+'/bundled_links'): - os.system("rm "+args.dir+'/bundled_links') - if os.path.exists(args.dir+'/bundled_links_filtered'): - os.system("rm "+args.dir+'/bundled_links_filtered') - if os.path.exists(args.dir+'/bundled_graph.gml'): - os.system("rm "+args.dir+'/bundled_graph.gml') - if os.path.exists(args.dir+'/invalidated_counts'): - os.system("rm "+args.dir+'/invalidated_counts') - if os.path.exists(args.dir+'/repeats'): - os.system("rm "+args.dir+'/repeats') - if os.path.exists(args.dir+'/oriented_links'): - os.system("rm "+args.dir+'/oriented_links') - if os.path.exists(args.dir+'/oriented.gml'): - os.system("rm "+args.dir+'/oriented.gml') - if os.path.exists(args.dir+'/seppairs'): - os.system("rm "+args.dir+'/seppairs') - if os.path.exists(args.dir+'/alignment.bed'): - os.system("rm "+args.dir+'/alignment.bed') + for fname in ['contig_length', 'contig_links', 'contig_coverage', 'bundled_links', + 'bundled_links_filtered', 'bundled_graph.gml', 'invalidated_counts', + 'repeats', 'oriented_links', 'oriented.gml', 'seppairs', 'alignment.bed']: + path = os.path.join(args.dir, fname) + if os.path.exists(path): + os.system(f"rm {path}") + if __name__ == '__main__': main() From cb377c5084bf53870a8632cb84504c3ee088bcb0 Mon Sep 17 00:00:00 2001 From: Alescs Date: Fri, 23 May 2025 16:08:48 +0200 Subject: [PATCH 2/3] fix: OGDF/makeMakefile.py syntax errors on python3 --- OGDF/makeMakefile.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/OGDF/makeMakefile.py b/OGDF/makeMakefile.py index 9c61506..114fed3 100755 --- a/OGDF/makeMakefile.py +++ b/OGDF/makeMakefile.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2.7 +#!/usr/bin/env python3 # Make Makefile # # October 2012 @@ -278,7 +278,7 @@ def Walk(curdir): for v in versions: # print target&depend: add full path spec, incl. version & ignore extra line path = v.call() + '/' +fullname[:-len(name)] - makefile.write(path + targetAndDepend[:-1] + '\n') + makefile.write(path + targetAndDepend[:-1].decode("utf-8") + '\n') # ensure folder makefile.write('\t$(MKDIR) ' + v.call() + '/' + fullname[:-len(name)-1] + '\n') @@ -469,5 +469,3 @@ def InstallHeaders(curdir, makefile, installPrefix): makefile.close() print('Makefile generated') - - From 564279347ea43e92c213e9fcf17a918ba09d2656 Mon Sep 17 00:00:00 2001 From: Alescs Date: Fri, 23 May 2025 16:12:15 +0200 Subject: [PATCH 3/3] feat: correct installation instructions in README.md --- README.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 94895bf..7eccb08 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,25 @@ MetaCarvel is an updated version of previous metagenome scaffolder Bambus 2. To You can install Networkx as described [here](https://pypi.org/project/networkx/). MetCarvel can work with the latest NetworkX version 2.5 -Briefly, you need to run following: -``` + +Briefly, you need to run the following: +```bash +# Install pip dependencies (ensure you also have samtools and bedtools installed) pip install numpy (tested with version 1.20) pip install networkx>=2.5 + +# Clone the repository +git clone https://github.com/marbl/MetaCarvel +cd MetaCarvel/ +git submodule init +git submodule update --recursive + +# Build OGDF +cd OGDF/ +python makeMakefile.py +make +cd .. +make ``` ## The detailed documentation and tutorial to install and run MetaCarvel can be found on [Wiki](https://github.com/marbl/MetaCarvel/wiki).