diff --git a/python_scripts/DIAMOND_subsystems_analysis_counter.py b/python_scripts/DIAMOND_subsystems_analysis_counter.py index b4460b2..5bd8506 100644 --- a/python_scripts/DIAMOND_subsystems_analysis_counter.py +++ b/python_scripts/DIAMOND_subsystems_analysis_counter.py @@ -30,8 +30,6 @@ # # -I infile specifies the infile (a DIAMOND results file # in m8 format) -# -D database specifies a reference database to search against -# for results # -O outfile specifies a name for the outfile (otherwise defaults # to $name_hierarchy.tsv) # -P partial partial outfile; a list of all reads with their @@ -40,7 +38,7 @@ ########################################################################## # imports -import operator, sys, time, gzip, re +import operator, sys, gzip, re # String searching function: def string_find(usage_term): @@ -48,9 +46,7 @@ def string_find(usage_term): this_elem = elem next_elem = sys.argv[(idx + 1) % len(sys.argv)] if elem == usage_term: - return next_elem - -t0 = time.time() + return next_elem # loading starting file if "-I" in sys.argv: @@ -66,13 +62,12 @@ def string_find(usage_term): read_id_db = {} line_counter = 0 +print ("\nNow reading through the SEED m8 results infile.") + # reading through the infile for line in infile: line_counter += 1 splitline = line.split("\t") - if line_counter % 1000000 == 0: - t99 = time.time() - print (str(line_counter)[:-6] + "M lines processed so far in " + str(t99-t0) + " seconds.") unique_seq_db[splitline[0]] = 1 @@ -85,28 +80,21 @@ def string_find(usage_term): hit_count_db[splitline[1]] = 1 continue -t1 = time.time() + # results reporting print ("\nAnalysis of " + infile_name + " complete.") print ("Number of total lines: " + str(line_counter)) print ("Number of unique sequences: " + str(len(unique_seq_db))) -print ("Time elapsed: " + str(t1-t0) + " seconds.") infile.close() -# time to search for these in the reference database -if "-D" in sys.argv: - db_name = string_find("-D") -else: - sys.exit( "No database file indicated; skipping database search step.") - # IO -try: - db = open (db_name, "r", encoding='utf-8', errors='ignore') -except TypeError: - # error catching for Python 2 - db = open (db_name, "r") +# try: +db = open (infile_name, "r", encoding='utf-8', errors='ignore') +# except TypeError: +# # error catching for Python 2 +# db = open (db_name, "r") if "-P" in sys.argv: partial_outfile_name = string_find("-P") @@ -114,43 +102,34 @@ def string_find(usage_term): print ("\nStarting database analysis now.") -t2 = time.time() - # building a dictionary of the reference database db_hier_dictionary = {} db_line_counter = 0 db_error_counter = 0 for line in db: - if line.startswith(">") == True: - db_line_counter += 1 - splitline = line.split("\t", 1) - - # ID, the hit returned in DIAMOND results - db_id = str(splitline[0])[1:] + line = '\t'.join(line.split("\t")[12:]) - # name and functional description - if "NO HIERARCHY" in splitline[1]: - db_hier = "NO HIERARCHY" - else: - hier_split = splitline[1].split("\t") - if hier_split[3].strip() != "": - db_hier = hier_split[0] + "\t" + hier_split[1] + "\t" + hier_split[2] + "\t" + hier_split[3] - else: - db_hier = hier_split[0] + "\t" + hier_split[1] + "\t\t" + hier_split[2] + "\t" + hier_split[3] + db_line_counter += 1 + splitline = line.split("\t", 1) - # add to dictionaries - db_hier_dictionary[db_id] = db_hier + # ID, the hit returned in DIAMOND results + db_id = str(splitline[0]) # [1:] - # line counter to show progress - if db_line_counter % 1000000 == 0: # each million - t95 = time.time() - print (str(db_line_counter) + " lines processed so far in " + str(t95-t2) + " seconds.") + # name and functional description + if "NO HIERARCHY" in splitline[1]: + db_hier = "NO HIERARCHY" + else: + hier_split = splitline[1].split("\t") + if hier_split[3].strip() != "": + db_hier = hier_split[0] + "\t" + hier_split[1] + "\t" + hier_split[2] + "\t" + hier_split[3] + else: + db_hier = hier_split[0] + "\t" + hier_split[1] + "\t\t" + hier_split[2] + "\t" + hier_split[3] -t3 = time.time() + # add to dictionaries + db_hier_dictionary[db_id] = db_hier print ("\nSuccess!") -print ("Time elapsed: " + str(t3-t2) + " seconds.") print ("Number of lines: " + str(db_line_counter)) print ("Number of errors: " + str(db_error_counter)) @@ -171,7 +150,6 @@ def string_find(usage_term): # dictionary output and summary print ("\nDictionary database assembled.") -print ("Time elapsed: " + str(t3-t2) + " seconds.") print ("Number of errors: " + str(db_error_counter)) print ("\nTop ten hierarchy matches:") diff --git a/python_scripts/standardized_DIAMOND_analysis_counter.py b/python_scripts/standardized_DIAMOND_analysis_counter.py index b2925b9..1a8fe9c 100755 --- a/python_scripts/standardized_DIAMOND_analysis_counter.py +++ b/python_scripts/standardized_DIAMOND_analysis_counter.py @@ -30,8 +30,6 @@ # # -I infile specifies the infile (a DIAMOND results file # in m8 format) -# -D database specifies a reference database to search against -# for results # -O organism returns organism results # -F function returns functional results # -SO specific org creates a separate outfile for results that hit @@ -48,9 +46,7 @@ def string_find(usage_term): this_elem = elem next_elem = sys.argv[(idx + 1) % len(sys.argv)] if elem == usage_term: - return next_elem - -t0 = time.time() + return next_elem # checking for an option (organism or function) to be specified if "-O" not in sys.argv: @@ -63,13 +59,6 @@ def string_find(usage_term): else: sys.exit ("WARNING: infile must be specified using '-I' flag.") -# checking to make sure database is specified -if "-D" in sys.argv: - db_name = string_find("-D") -else: - sys.exit( "No database file indicated; skipping database search step.") - - infile = open (infile_name, "r") # setting up databases @@ -83,9 +72,8 @@ def string_find(usage_term): for line in infile: line_counter += 1 splitline = line.split("\t") - if line_counter % 1000000 == 0: - t99 = time.time() - print (str(line_counter)[:-6] + "M lines processed so far in " + str(t99-t0) + " seconds.") + # if line_counter % 1000000 == 0: + # print (str(line_counter)[:-6] + "M lines processed so far in " + str(t99-t0) + " seconds.") unique_seq_db[splitline[0]] = 1 @@ -95,22 +83,18 @@ def string_find(usage_term): RefSeq_hit_count_db[splitline[1]] = 1 continue -t1 = time.time() print ("\nAnalysis of " + infile_name + " complete.") print ("Number of total lines: " + str(line_counter)) print ("Number of unique sequences: " + str(len(unique_seq_db))) -print ("Time elapsed: " + str(t1-t0) + " seconds.") infile.close() # time to search for these in the reference database -db = open (db_name, "r") +db = open (infile_name, "r") print ("\nStarting database analysis now.") -t2 = time.time() - # optional outfile of specific organism results # if "-SO" in sys.argv: # target_org = string_find("-SO") @@ -124,18 +108,29 @@ def string_find(usage_term): db_line_counter = 0 db_error_counter = 0 +# Reusing db code for parsing diamond file with additional salltitles column +db = open (infile_name, "r") + for line in db: - if line.startswith(">") == True: + if line.startswith(">") == False: + line = line.strip().split("\t")[-1] + multispecies = False + if "MULTISPECIES:" in line: + multispecies = True + line = line.replace("MULTISPECIES: ","").strip() + db_line_counter += 1 splitline = line.split("[",1) - # ID, the hit returned in DIAMOND results - db_id = str(splitline[0].split()[0])[1:] + db_id = str(splitline[0].split()[0]) # [1:] # Not needed anymore # name and functional description db_entry = line.split("[", 1) + # 1 ['WP_168619497.1 ATP-binding cassette domain-containing protein, partial ', 'Rhizobium ruizarguesonis]'] db_entry = db_entry[0].split(" ", 1) + # 2 ['WP_168619497.1', 'ATP-binding cassette domain-containing protein, partial '] db_entry = db_entry[1][:-1] + # 3 ATP-binding cassette domain-containing protein, partial # organism name if line.count("[") != 1: @@ -164,8 +159,11 @@ def string_find(usage_term): db_org = str(db_org[0]) + " " + str(db_org[1]) except IndexError: db_org = line.strip().split("[", 1) - db_org = db_org[1][:-1] - db_error_counter += 1 + # Adding sp. to group with other unknown species of the same genus + db_org = db_org[1][:-1] + " sp." + # Multispecies not counted in the error counter + if not multispecies: + db_error_counter += 1 db_org = re.sub('[^a-zA-Z0-9-_*. ]', '', db_org) @@ -177,16 +175,10 @@ def string_find(usage_term): if "-SO" in sys.argv: if target_org in db_org: db_SO_dictionary[db_id] = db_entry - - # line counter to show progress - if db_line_counter % 1000000 == 0: # each million - t95 = time.time() - print (str(db_line_counter)[:-6] + "M lines processed so far in " + str(t95-t2) + " seconds.") - -t3 = time.time() + else: + print("ELSE SELSLELELSL") print ("\nSuccess!") -print ("Time elapsed: " + str(t3-t2) + " seconds.") print ("Number of lines: " + str(db_line_counter)) print ("Number of errors: " + str(db_error_counter)) @@ -221,7 +213,6 @@ def string_find(usage_term): # dictionary output and summary print ("\nDictionary database assembled.") -print ("Time elapsed: " + str(t3-t2) + " seconds.") print ("Number of errors: " + str(db_error_counter)) if "-O" in sys.argv: