-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_test_sequence.py
More file actions
executable file
·84 lines (57 loc) · 2.57 KB
/
check_test_sequence.py
File metadata and controls
executable file
·84 lines (57 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
'''
Created on Mar 9, 2016
@author: Igor
The script will check that the fasta sequence is apropiate for downstream analysis
Check that the sequence only contains ATGC nucleotides
Return Uper case sequences
Usage:
python check_test_sequence fastaIN.fasta fastaOUT.fasta fastaREJECTED.fasta"
'''
import sys
from Bio import SeqIO
## Function check that the sequence only contains ATGC nucleotides
dna = set("ATGC")
def validate(seq, alphabet=dna):
"Checks that a sequence only contains values from an alphabet"
leftover = set(seq.upper()) - alphabet
return not leftover
# using it with other alphabets
#prot = set('ACDEFGHIKLMNPQRSTVWY')
#print validate("mglsdgewql", alphabet=prot)
## Function split sequences in valid and rejected. Also return sequence in CAPITAL
def check_fasta(fin_fname, fout_fname, fout_rejected):
fin = open(fin_fname, "rt")
fout = open(fout_fname, "w")
frejected = open(fout_rejected, "w")
for seq_record in SeqIO.parse(fin, "fasta"):
header = seq_record.id
seq = seq_record.seq.upper()
#print(seq_record.id)
#print(repr(seq_record.seq.upper()))
#print seq_record.seq.upper()
sequence = seq_record.seq.upper()
#print(len(seq_record))
if validate(sequence):
#print sequence
# Write new fasta + the header
fout.write(">"+str(header) + '\n')
fout.write(str(seq) + '\n')
else:
## Reject sequences print o screen and save to rejected.fata file
print "NNNNNNNNNNNNNNN " + sequence
frejected.write(">"+str(header) + '\n')
frejected.write(str(seq) + '\n')
# Argumet passed to this script when called
if sys.argv.__len__() ==4:
fin_fname = sys.argv[1]
fout_fname = sys.argv[2]
fout_rejected = sys.argv[3]
check_fasta(fin_fname, fout_fname, fout_rejected)
else:
#print str(sys.argv.__len__())
print "error:\t4 \n Usage: \t python check_test_sequence fastaIN fastaOUT fastaREJECTED"
'''
fin_fname = '/media/igor/DATA/UCL/Evolution_Alus/LiftOver_bedPositions/All_Aluexons_3SS_20_3_corrected/test/All_Aluexons_3SS_20_3_corrected_hg38_to_rheMac3.fasta'
fout_fname = '/media/igor/DATA/UCL/Evolution_Alus/LiftOver_bedPositions/All_Aluexons_3SS_20_3_corrected/test/All_Aluexons_3SS_20_3_corrected_hg38_to_rheMac3_TEST_OUT_DELETE.fasta'
fout_rejected = '/media/igor/DATA/UCL/Evolution_Alus/LiftOver_bedPositions/All_Aluexons_3SS_20_3_corrected/test/All_Aluexons_3SS_20_3_corrected_hg38_to_rheMac3_TEST_OUT_REJECTED_DELETE.fasta'
'''