-
Notifications
You must be signed in to change notification settings - Fork 0
Trimming and Post Processing Small RNA
chapmana edited this page Oct 11, 2012
·
1 revision
Sequenced 50 nt single-end reads were trimmed of their adapters/primers and trimmed for quality. Reads shorter than 16 nt were discarded. Collapsed fasta files were created from the resulting fastq file. Sample entries:
>H1_GACGAC:4978:27 AAACGAGAACTTTGAA
>H1_GACGAC:4979:17 AAACGAGAACTTTGAAG

This says that sequence number 4978 in the SHRH1 sample has sequence AAACGAGAACTTTGAA and is represented by 27 reads. The individual sample files were assembled into a single fasta file using this script:
#! /usr/bin/python
import sys, re, os, subprocess, collections
#####################################################################
## concatenate fasta files from different small RNA sample runs
#####################################################################
## Usage: python3 concatFasta.py in.fa out.fa
##   in.fa : collapsed fasta (">name" header / sequence on alternating lines)
##   out.fa: fasta where identical sequences are merged; every header for a
##           given sequence is concatenated onto one line (each keeps its '>')
## The script needs two arguments (input and output), so require argv length 3
## (the original tested "< 2", which crashed on a missing second argument, and
## printed a usage line copied from an unrelated script).
if len(sys.argv) < 3:
    print("Sample Call:")
    print("python3 concatFasta.py in.fa out.fa")
    sys.exit()
fin = open(sys.argv[1], 'r')
fout = open(sys.argv[2], 'w')
## sequence -> concatenation of every header line seen for that sequence
seqDict = collections.defaultdict(str)
while True:
    readName = fin.readline()
    if not readName:
        break  ## EOF
    seq = fin.readline()
    readName = readName.rstrip()
    seq = seq.rstrip()
    ## headers already begin with '>', so plain concatenation keeps the
    ## individual names separated
    seqDict[seq] = seqDict[seq] + readName
for seq in seqDict:
    fout.write(seqDict[seq] + "\n")
    fout.write(seq + "\n")
fin.close()
fout.close()

Sample merged entry:

>Lx1_CTAGCT:4995:20>Lx2_CTATAC:26217:131>Lx3_CTCAGA:7223:23>H1_GACGAC:4978:27>H2_TAATCG:4781:16 AAACGAGAACTTTGAA

where sample BNLx1 had 20 reads, BNLx2 had 131 reads, etc. The fasta file was converted to a fastq file with good but not perfect score sequences, via something like this:
awk '{getline seq; print "@"$0; print seq; print "+"; for (i=1; i<=length(seq); i++) printf "D"; printf "\n" }' All_Samples.concat.fa > All_Samples.concat.fq

nohup bowtie2 --phred33 -p 20 -k 1000 --end-to-end --very-sensitive -x /data/helicos/reference/rn5/noRandom/rn5_No_Random -U All_Samples.concat.fq -S All_Samples.concat.sam
#### very sensitive means: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
## see bowtie2 manual for details

Example aligned SAM record:

>Lx2_CTATAC:958106:2>H2_TAATCG:235177:1 272 chr10 43533060 16 4M1I41M * 0 0 AAGCCCTACTTCCCTGACCGGGAAACGAACCCGGGCCGCGGCGGTG * AS:i:-23 XS:i:-20 XN:i:0 XM:i:3 XO:i:1 XG:i:1 NM:i:4 MD:Z:3T0G18T21 YT:Z:UU

awk 'NR>23{print $3"\t"$4-1"\t"$4+length($10)}' All_Samples.concat.sam

#! /usr/bin/python
import sys, re, os, subprocess, collections
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import *
################################
################################
## parse sam file output for small rna data
## sam file has had header removed and does not include non-aligned reads:
## awk 'NR>23{if ($2!=4) print }' All_Samples.concat.sam > All_Aligned.concat.sam
##################################################################################################
# use this as a sequence to map an encoded operation to the appropriate
# character
DECODE = 'MIDNSHP'
# this dictionary maps operations to their integer encodings
_ENCODE = dict((c,i) for (i,c) in enumerate(DECODE))
###*******************************************************************************
def parse(cigar):
"""
Parse CIGAR string and return a list of pysam-encoded tuples.
MIDNSHP => 0123456
>>> parse("3S17M8D4M9I3H")
[(4, 3), (0, 17), (2, 8), (0, 4), (1, 9), (5, 3)]
"""
result = []
n = ''
for c in cigar:
if c.isdigit():
n += c
elif c in _ENCODE:
if n == '':
raise ValueError("end of CIGAR string reached, but an operator was expected")
result.append( (_ENCODE[c], int(n)) )
n = ''
return result
###*******************************************************************************
def findLength(seegar):
    """
    Return the number of reference bases spanned by a parsed CIGAR list.

    seegar -- list of (operation, length) tuples as produced by parse().
    Only M (op 0) and D (op 2) are counted as consuming reference here.
    NOTE(review): per the SAM spec, N/=/X also consume reference; they should
    not occur in bowtie2 --end-to-end output, but confirm before reusing this
    with other aligners.
    """
    ## idiomatic 'or' on booleans (original used bitwise '|') and a single
    ## sum() instead of a manual accumulator
    return sum(int(item[1]) for item in seegar if item[0] == 0 or item[0] == 2)
###*******************************************************************************
## Command-line handling: requires an input SAM file and an output BED file,
## so argv must have length 3 (the original tested "< 2", which crashed on a
## missing second argument, and printed the "Sample Call:" prefix twice).
if len(sys.argv) < 3:
    print("Sample Call:")
    print("python3 samParser.py in.sam out.bed")
    sys.exit()
fin = open(sys.argv[1], 'r')
fout = open(sys.argv[2], 'w')
## sequence -> nested dict holding "counts" and "data"; the str default is
## only a placeholder (entries are replaced with defaultdicts in pass 1 below)
readDict = collections.defaultdict(str)
## -----------------------------------------------------------------------------
## Pass 1: register every read sequence (reverse-complemented back to read
## orientation when aligned on the minus strand) so that readDict[seq] is a
## nested defaultdict(str).  Without this pass the outer defaultdict(str)
## would return '' and the "counts"/"data" item lookups below would fail.
## -----------------------------------------------------------------------------
while True:
    read = fin.readline()
    if not read:
        break ## EOF
    read = read.rstrip()
    readDat = read.split("\t")
    ## SAM column 10 (index 9) is the aligned read sequence
    seq =readDat[9]
    ## FLAG bit 0x10 -> reverse-strand alignment; SAM stores the reverse
    ## complement, so flip back to the original read orientation
    if (int(readDat[1]) & 16) == 16:
        seq=str(Seq(seq).reverse_complement())
    ## NOTE(review): two distinct queries whose sequences are reverse
    ## complements of each other normalize to the same key here, in which
    ## case the later one's counts overwrite the earlier's — confirm this
    ## cannot occur in the collapsed input
    readDict[seq]=collections.defaultdict(str)
## -----------------------------------------------------------------------------
## Pass 2: re-read the file and accumulate, per sequence, the per-sample read
## counts and every alignment location/score/CIGAR.
## -----------------------------------------------------------------------------
fin.seek(0)
while True:
    read = fin.readline()
    if not read:
        break ## EOF
    read = read.rstrip()
    readDat = read.split("\t")
    seq =readDat[9]
    if (int(readDat[1]) & 16) == 16:
        seq=str(Seq(seq).reverse_complement())
        strand = "-"
    else:
        strand = "+"
    ## per-sample counts; reset on every alignment line of this sequence,
    ## which is harmless because the collapsed query name (and therefore the
    ## counts) is identical for every alignment of the same query
    readDict[seq]["counts"] = {"H1_GACGAC":0, "H2_TAATCG":0, "H3_TACAGC":0,"Lx1_CTAGCT":0, "Lx2_CTATAC":0, "Lx3_CTCAGA":0, "Total":0}
    ## query name is a concatenation of ">sample:id:count" entries
    names = readDat[0].split(">")[1:]
    for i in range(len(names)):
        nameDat = names[i].split(":")
        ## NOTE(review): the per-sample count is stored as a string while
        ## "Total" accumulates ints; both are str()-ed on output, so it works
        readDict[seq]["counts"][nameDat[0]] = nameDat[2]
        readDict[seq]["counts"]["Total"] += int(nameDat[2])
    start = int(readDat[3])-1   ## SAM POS is 1-based, BED start is 0-based
    chrom = readDat[2]
    cigar = readDat[5]
    seegar = parse(cigar)       ## pysam-style (op, length) tuples
    length = findLength(seegar) ## reference bases spanned (M and D ops)
    end = start+length
    ## collect optional TAG:TYPE:VALUE fields; only "AS" (alignment score) is
    ## used below.  The query-name column also splits into >2 pieces, but its
    ## first piece is never looked up, so the stray entry is harmless.
    datDict = collections.defaultdict(str)
    for item in readDat:
        itemList=item.split(":")
        if len(itemList)>2:
            datDict[itemList[0]]=itemList[2]
    alignmentScore = datDict["AS"]
    location = ':'.join([chrom, str(start), str(end)])
    ## append this alignment as "chrom:start:end|strand|AS|cigar"; entries are
    ## ';'-joined, so the accumulated string always begins with an empty field
    oldData = readDict[seq]["data"]
    addData = '|'.join([location, strand, alignmentScore, cigar])
    readDict[seq]["data"] = ';'.join([oldData,addData])
## -----------------------------------------------------------------------------
## Output: for each sequence keep only the alignment(s) with the best (maximum)
## alignment score and write one BED-like line per kept alignment.
## -----------------------------------------------------------------------------
for seq in readDict:
    ## ':'-joined per-sample counts in sorted key order; begins with ':'
    counts = ""
    for k in sorted(readDict[seq]['counts'].keys()):
        counts = ':'.join([counts,str(readDict[seq]['counts'][k])])
    locations = readDict[seq]['data'].split(";")[1:] ## locations[0] is empty
    alignmentScores = []
    for loc in locations:
        locDat = loc.split("|")
        alignmentScores.append(int(locDat[2]))   ## field 2 is the AS score
    bestScore = max(alignmentScores)
    ## number of equally-best alignments (multi-mapping indicator)
    nBestScores = sum([x==bestScore for x in alignmentScores])
    for loc in locations:
        locDat = loc.split("|")
        if int(locDat[2]) == bestScore:
            locationDat = locDat[0].split(":")
            ## columns: chrom, start, end, counts, score, strand, seq, nBest;
            ## the trailing "" element leaves a tab before the newline
            fout.writelines('\t'.join([locationDat[0], locationDat[1], locationDat[2], counts, locDat[2], locDat[1], seq, str(nBestScores), "\n"]))
fin.close()
fout.close()

Example output line:

chr5 44353615 44353632 :0:0:0:0:1:0:1 0 - TCAGTTCAATGGTTGGC 1000