transcript · bartns · Jun 16, 2021 · Jun 16, 2021
diff --git a/python_scripts/DIAMOND_subsystems_analysis_counter.py b/python_scripts/DIAMOND_subsystems_analysis_counter.py
@@ -30,8 +30,6 @@
 #
 # -I		infile			specifies the infile (a DIAMOND results file
 #								in m8 format)
-# -D		database		specifies a reference database to search against
-#								for results
 # -O		outfile			specifies a name for the outfile (otherwise defaults
 #								to $name_hierarchy.tsv)
 # -P 		partial			partial outfile; a list of all reads with their
@@ -40,17 +38,15 @@
 ##########################################################################
 
 # imports
-import operator, sys, time, gzip, re
+import operator, sys, gzip, re
 
 # String searching function:
 def string_find(usage_term):
 	for idx, elem in enumerate(sys.argv):
 		this_elem = elem
 		next_elem = sys.argv[(idx + 1) % len(sys.argv)]
 		if elem == usage_term:
-			 return next_elem
-
-t0 = time.time()
+			return next_elem
 
 # loading starting file
 if "-I" in sys.argv:
@@ -66,13 +62,12 @@ def string_find(usage_term):
 read_id_db = {}
 line_counter = 0
 
+print ("\nNow reading through the SEED m8 results infile.")
+
 # reading through the infile
 for line in infile:
 	line_counter += 1
 	splitline = line.split("\t")
-	if line_counter % 1000000 == 0:
-		t99 = time.time()
-		print (str(line_counter)[:-6] + "M lines processed so far in " + str(t99-t0) + " seconds.")
 
 	unique_seq_db[splitline[0]] = 1
 
@@ -85,72 +80,56 @@ def string_find(usage_term):
 		hit_count_db[splitline[1]] = 1
 		continue
 
-t1 = time.time()
+
 
 # results reporting
 print ("\nAnalysis of " + infile_name + " complete.")
 print ("Number of total lines: " + str(line_counter))
 print ("Number of unique sequences: " + str(len(unique_seq_db)))
-print ("Time elapsed: " + str(t1-t0) + " seconds.")
 
 infile.close()
 
-# time to search for these in the reference database
-if "-D" in sys.argv:
-	db_name = string_find("-D")
-else:
-	sys.exit( "No database file indicated; skipping database search step.")
-
 # IO
-try:
-	db = open (db_name, "r", encoding='utf-8', errors='ignore')
-except TypeError:
-	# error catching for Python 2
-	db = open (db_name, "r")
+# try:
+db = open (infile_name, "r", encoding='utf-8', errors='ignore')
+# except TypeError:
+# 	# error catching for Python 2
+# 	db = open (db_name, "r")
 
 if "-P" in sys.argv:
 	partial_outfile_name = string_find("-P")
 	partial_outfile = open(partial_outfile_name, "w")
 
 print ("\nStarting database analysis now.")
 
-t2 = time.time()
-
 # building a dictionary of the reference database
 db_hier_dictionary = {}
 db_line_counter = 0
 db_error_counter = 0
 
 for line in db:
-	if line.startswith(">") == True:
-		db_line_counter += 1
-		splitline = line.split("\t", 1)
-
-		# ID, the hit returned in DIAMOND results
-		db_id = str(splitline[0])[1:]
+	line = '\t'.join(line.split("\t")[12:])
 
-		# name and functional description
-		if "NO HIERARCHY" in splitline[1]:
-			db_hier = "NO HIERARCHY"
-		else:
-			hier_split = splitline[1].split("\t")
-			if hier_split[3].strip() != "":
-				db_hier = hier_split[0] + "\t" + hier_split[1] + "\t" + hier_split[2] + "\t" + hier_split[3]
-			else:
-				db_hier = hier_split[0] + "\t" + hier_split[1] + "\t\t" + hier_split[2] + "\t" + hier_split[3]
+	db_line_counter += 1
+	splitline = line.split("\t", 1)
 
-		# add to dictionaries
-		db_hier_dictionary[db_id] = db_hier
+	# ID, the hit returned in DIAMOND results
+	db_id = str(splitline[0]) # [1:]
 
-		# line counter to show progress
-		if db_line_counter % 1000000 == 0:							# each million
-			t95 = time.time()
-			print (str(db_line_counter) + " lines processed so far in " + str(t95-t2) + " seconds.")
+	# name and functional description
+	if "NO HIERARCHY" in splitline[1]:
+		db_hier = "NO HIERARCHY"
+	else:
+		hier_split = splitline[1].split("\t")
+		if hier_split[3].strip() != "":
+			db_hier = hier_split[0] + "\t" + hier_split[1] + "\t" + hier_split[2] + "\t" + hier_split[3]
+		else:
+			db_hier = hier_split[0] + "\t" + hier_split[1] + "\t\t" + hier_split[2] + "\t" + hier_split[3]
 
-t3 = time.time()
+	# add to dictionaries
+	db_hier_dictionary[db_id] = db_hier
 
 print ("\nSuccess!")
-print ("Time elapsed: " + str(t3-t2) + " seconds.")
 print ("Number of lines: " + str(db_line_counter))
 print ("Number of errors: " + str(db_error_counter))
 
@@ -171,7 +150,6 @@ def string_find(usage_term):
 
 # dictionary output and summary
 print ("\nDictionary database assembled.")
-print ("Time elapsed: " + str(t3-t2) + " seconds.")
 print ("Number of errors: " + str(db_error_counter))
 
 print ("\nTop ten hierarchy matches:")

diff --git a/python_scripts/standardized_DIAMOND_analysis_counter.py b/python_scripts/standardized_DIAMOND_analysis_counter.py
@@ -30,8 +30,6 @@
 #
 # -I		infile			specifies the infile (a DIAMOND results file
 #								in m8 format)
-# -D		database		specifies a reference database to search against
-#								for results
 # -O		organism		returns organism results
 # -F		function		returns functional results
 # -SO		specific org	creates a separate outfile for results that hit
@@ -48,9 +46,7 @@ def string_find(usage_term):
 		this_elem = elem
 		next_elem = sys.argv[(idx + 1) % len(sys.argv)]
 		if elem == usage_term:
-			 return next_elem
-
-t0 = time.time()
+			return next_elem
 
 # checking for an option (organism or function) to be specified
 if "-O" not in sys.argv:
@@ -63,13 +59,6 @@ def string_find(usage_term):
 else:
 	sys.exit ("WARNING: infile must be specified using '-I' flag.")
 
-# checking to make sure database is specified
-if "-D" in sys.argv:
-	db_name = string_find("-D")
-else:
-	sys.exit( "No database file indicated; skipping database search step.")
-
-
 infile = open (infile_name, "r")
 
 # setting up databases
@@ -83,9 +72,8 @@ def string_find(usage_term):
 for line in infile:
 	line_counter += 1
 	splitline = line.split("\t")
-	if line_counter % 1000000 == 0:
-		t99 = time.time()
-		print (str(line_counter)[:-6] + "M lines processed so far in " + str(t99-t0) + " seconds.")
+	# if line_counter % 1000000 == 0:
+	# 	print (str(line_counter)[:-6] + "M lines processed so far in " + str(t99-t0) + " seconds.")
 
 	unique_seq_db[splitline[0]] = 1
 
@@ -95,22 +83,18 @@ def string_find(usage_term):
 		RefSeq_hit_count_db[splitline[1]] = 1
 		continue
 
-t1 = time.time()
 
 print ("\nAnalysis of " + infile_name + " complete.")
 print ("Number of total lines: " + str(line_counter))
 print ("Number of unique sequences: " + str(len(unique_seq_db)))
-print ("Time elapsed: " + str(t1-t0) + " seconds.")
 
 infile.close()
 
 # time to search for these in the reference database
-db = open (db_name, "r")
+db = open (infile_name, "r")
 
 print ("\nStarting database analysis now.")
 
-t2 = time.time()
-
 # optional outfile of specific organism results
 # if "-SO" in sys.argv:
 # 	target_org = string_find("-SO")
@@ -124,18 +108,29 @@ def string_find(usage_term):
 db_line_counter = 0
 db_error_counter = 0
 
+# Reusing db code for parsing diamond file with additional salltitles column
+db = open (infile_name, "r")
+
 for line in db:
-	if line.startswith(">") == True:
+	if line.startswith(">") == False:
+		line = line.strip().split("\t")[-1]
+		multispecies = False
+		if "MULTISPECIES:" in line:
+			multispecies = True
+			line = line.replace("MULTISPECIES: ","").strip()
+
 		db_line_counter += 1
 		splitline = line.split("[",1)
-
 		# ID, the hit returned in DIAMOND results
-		db_id = str(splitline[0].split()[0])[1:]
+		db_id = str(splitline[0].split()[0]) # [1:] # Not needed anymore
 
 		# name and functional description
 		db_entry = line.split("[", 1)
+		# 1 ['WP_168619497.1 ATP-binding cassette domain-containing protein, partial ', 'Rhizobium ruizarguesonis]']
 		db_entry = db_entry[0].split(" ", 1)
+		# 2 ['WP_168619497.1', 'ATP-binding cassette domain-containing protein, partial ']
 		db_entry = db_entry[1][:-1]
+		# 3 ATP-binding cassette domain-containing protein, partial
 
 		# organism name
 		if line.count("[") != 1:
@@ -164,8 +159,11 @@ def string_find(usage_term):
 				db_org = str(db_org[0]) + " " + str(db_org[1])
 			except IndexError:
 				db_org = line.strip().split("[", 1)
-				db_org = db_org[1][:-1]
-				db_error_counter += 1
+				# Adding sp. to group with other unknown species of the same genus
+				db_org = db_org[1][:-1] + " sp."
+				# Multispecies not counted in the error counter
+				if not multispecies:
+					db_error_counter += 1
 
 		db_org = re.sub('[^a-zA-Z0-9-_*. ]', '', db_org)
 
@@ -177,16 +175,10 @@ def string_find(usage_term):
 		if "-SO" in sys.argv:
 			if target_org in db_org:
 				db_SO_dictionary[db_id] = db_entry
-
-		# line counter to show progress
-		if db_line_counter % 1000000 == 0:					# each million
-			t95 = time.time()
-			print (str(db_line_counter)[:-6] + "M lines processed so far in " + str(t95-t2) + " seconds.")
-
-t3 = time.time()
+	else:
+		print("ELSE SELSLELELSL")
 
 print ("\nSuccess!")
-print ("Time elapsed: " + str(t3-t2) + " seconds.")
 print ("Number of lines: " + str(db_line_counter))
 print ("Number of errors: " + str(db_error_counter))
 
@@ -221,7 +213,6 @@ def string_find(usage_term):
 
 # dictionary output and summary
 print ("\nDictionary database assembled.")
-print ("Time elapsed: " + str(t3-t2) + " seconds.")
 print ("Number of errors: " + str(db_error_counter))
 
 if "-O" in sys.argv: