-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProject Graphs
More file actions
77 lines (57 loc) · 2.63 KB
/
Project Graphs
File metadata and controls
77 lines (57 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#table test
#average read quality scores should be >7 (Phred 33)
#can graph quality scores in the same way
library(ShortRead)
library(ggplot2)
library(dplyr)
source('https://raw.githubusercontent.com/developing-bioinformatics/eDNA_BLAST/master/R/core.R')
download_sra(sample = 'Swamp_S1_', dir.out = 'data')
#read lengths
files = list.files('data', pattern='.fastq', full.names=TRUE)
print(files)
get_readlengths <- function(filename){
dna = readFastq(filename)
reads = sread(dna)
readlengths = reads@ranges@width
ret = cbind(rep(filename, length(readlengths)), readlengths)
ret=as.data.frame(ret)
colnames(ret) = c('file', 'readlength')
ret$readlength = as.numeric(ret$readlength)
return(ret)
}
get_readlengths(files)
results = lapply(files, get_readlengths)
restable = bind_rows(results)
ggplot(restable) + geom_histogram(aes(x=readlength)) + facet_grid(~file)
ggplot(restable) + geom_density(aes(x=readlength, color=file))
#quality scores
files = list.files('data', pattern='.fastq', full.names=TRUE)
print(files)
get_qualityscores <- function(filename){
dna = readFastq(filename)
qscores = quality(dna)
numqscores = as(qscores, "matrix") # converts to numeric scores automatically
avgscores = apply(numqscores, 1, mean, na.rm=TRUE)
avgscores = as.data.frame(avgscores)
ret = cbind(rep(filename, nrow(avgscores)), avgscores)
ret=as.data.frame(ret)
colnames(ret) = c('file', 'qualityscore')
#ret$avgscores = as.numeric(ret$avgscores)
return(ret)
}
get_qualityscores(files)
results = lapply(files, get_qualityscores)
restable = bind_rows(results)
ggplot(restable) + geom_histogram(aes(x=qualityscore)) + facet_grid(~file)
ggplot(restable) + geom_density(aes(x=qualityscore, color=file))
#can predict that second from left (high qulity and longer) that sample gives us the most results
#and most confidence in our results (reads should be good matches)
#filtering for read length would potentially change results a lot
#plotting out the read lengths can predict which of these samples accurately gives us results
#follow up: BLAST all of these and figure out which is true - do we get more IDs with longer reads or not?
#what happens if we impose a filter?
#in results: might identify some of the files we looked at that have a problem vs the ones that look like they're ok
#can I BLAST ITS2 and one other and see if we get more results? that would mean we know to look for graphs like
#ITS2 ahead of time for more results
#with out the ggplot facet function makes just one histogram. the facet tells ggplot to create one subplot for every element in that facet (We are faceting on the file variable)
#basically does a groupby function