-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata-loader.R
More file actions
127 lines (102 loc) · 3.62 KB
/
data-loader.R
File metadata and controls
127 lines (102 loc) · 3.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# load packages
# Every line of the requirements file is treated as a package name and
# attached; start-up banners are silenced.
pkgs <- readr::read_lines('fcs-requirements.txt')
for (pkg in pkgs) {
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}
# define the python interpreter to use
# (reticulate is told to use the interpreter at this fixed path)
reticulate::use_python('/usr/local/Caskroom/miniconda/base/envs/r-reticulate-env/bin/python')
# helper
# Parse sample annotations out of FCS file names.
#
# Arguments:
#   f  character vector of FCS file names (the script passes one base name
#      at a time).
#
# Returns:
#   A tibble with one row per element of `f` and the columns tissue_type,
#   patient_number, breast, tumor_region, plate, plate_location,
#   gadolinium_status and fcs_file (the unchanged input name).
#
# Errors:
#   Stops (via stopifnot) when any component is missing from a file name or
#   has an unexpected length.
metaFromFileName <- function(f) {
  # First match of `pattern` in each element of `f`; NA where there is none.
  # regmatches() on its own drops non-matching elements, and stopifnot()
  # accepts the resulting zero-length comparison, so a missing component
  # used to slip through validation unnoticed.
  first_match <- function(pattern) {
    hit <- regexpr(pattern, f)
    out <- rep(NA_character_, length(f))
    out[which(hit != -1)] <- regmatches(f, hit)
    out
  }
  # First capture group of `pattern` for each element of `f`; NA where the
  # pattern does not match at all (sub() would return the input unchanged).
  capture_group <- function(pattern) {
    out <- sub(pattern, '\\1', f)
    out[!grepl(pattern, f)] <- NA_character_
    out
  }

  # patterns
  regex_tissue_type <- '^[HNT]?'
  regex_patient_number <- 'BB[0-9]{3}'
  regex_breast <- '.*BB[0-9]{3}([LR]?).*'
  regex_tumor_region <- '.*BB[0-9]{3}[LR]?([ab])?.*'
  regex_plate <- 'Plate[1-2]?'
  regex_plate.location <- '[12]-[A-Z][0-9]{1,2}'
  # the dot before "fcs" is escaped so only a literal ".fcs" suffix matches
  regex_gadolinium <- '.*_GD(neg|pos)\\.fcs$'

  # extract
  tissue_type <- first_match(regex_tissue_type)
  patient_number <- first_match(regex_patient_number)
  breast <- capture_group(regex_breast)
  tumor_region <- capture_group(regex_tumor_region)
  plate <- first_match(regex_plate)
  plate.location <- first_match(regex_plate.location)
  gadolinium <- capture_group(regex_gadolinium)

  # verify extraction: every component must be present and of the expected
  # width (breast and tumor_region are optional, hence 0 or 1 characters)
  stopifnot(
    !is.na(tissue_type), nchar(tissue_type) == 1,
    !is.na(patient_number), nchar(patient_number) == 5,
    !is.na(breast), nchar(breast) <= 1,
    !is.na(tumor_region), nchar(tumor_region) <= 1,
    !is.na(plate), nchar(plate) == 6,
    !is.na(plate.location),
    nchar(plate.location) >= 4 & nchar(plate.location) <= 5,
    !is.na(gadolinium), nchar(gadolinium) == 3
  )

  tibble(
    tissue_type = tissue_type,
    patient_number = patient_number,
    breast = breast,
    tumor_region = tumor_region,
    plate = plate,
    plate_location = plate.location,
    gadolinium_status = gadolinium,
    fcs_file = f
  )
}
# files
# FCS files are read from <root>/<level>.
root <- '/Users/art/Library/CloudStorage/Box-Box/STAR_protocol/Data/'
level <- 'Cells_CisPtneg_GDneg'
level.dir <- file.path(root, level)
files <- dir(level.dir)
# keep only names ending in a literal ".fcs"; the dot is escaped because an
# unescaped "." matches any character
files.fcs <- files[grepl('\\.fcs$', files)]

# mappings
# Each mapping is a named list: names are the original labels, values the
# replacement labels, both taken from the corresponding CSV columns.
rename.sample <- readr::read_csv(file.path(level.dir, 'Sample_rename.csv'))
map.sample <- setNames(
  as.list(rename.sample$sample_rename),
  rename.sample$sample
)
rename.channel <- readr::read_csv(file.path(level.dir, 'Channel_rename.csv'))
map.channels <- setNames(
  as.list(rename.channel$channel_rename),
  rename.channel$channel
)

# meta data
file.meta <- '/Users/art/Library/CloudStorage/Box-Box/STAR_protocol/scQUEST_Patient_Metadata.csv'
meta <- read_csv(file.meta)
# drop_na() removes every row that has a missing value in any column.
# NOTE(review): patients removed here cannot be matched to cells later on;
# confirm that dropping them is intended.
meta <- meta %>%
  rename(patient_number = `Patient ID`) %>%
  drop_na()
# load FCS file
# Read every FCS file and build:
#   X          expression matrix, rows of all files stacked in file order
#   OBS        per-row annotation parsed from the file name
#   VAR        parameter table (identical across files, enforced below)
#   fcs.header named list with one keyword/value table per file
# Per-file pieces are collected in preallocated lists and bound once after
# the loop; growing X and OBS with rbind() on every iteration re-copies all
# rows read so far.
if (length(files.fcs) == 0) {
  stop('no .fcs files found in ', paste(root, level, sep='/'))
}
n.files <- length(files.fcs)
x.list <- vector('list', n.files)
obs.list <- vector('list', n.files)
VAR <- NULL
fcs.header <- list()
for (f.count in seq_len(n.files)) {
  f <- files.fcs[[f.count]]
  cat(f.count, '/', n.files, ' reading in file ', f, '\n')
  file.name <- paste(root, level, f, sep='/')

  header <- read.FCSheader(file.name)
  header.tbl <- tibble(keyword = names(header[[1]]), value=header[[1]])

  # NOTE(review): read.FCS is called with its default arguments; confirm the
  # default transformation/truncation settings are what this data needs.
  fcs <- read.FCS(file.name)

  # one copy of the file-level annotation per row of the file
  obs <- metaFromFileName(f)[rep(1, nrow(fcs)), ]
  stopifnot(!any(is.na(obs$tissue_type)))

  var <- parameters(fcs)@data %>% select(-range, -minRange, -maxRange)
  x <- exprs(fcs)

  # all files must share the same parameter table
  # NOTE(review): `==` yields NA where either side is NA, which makes
  # stopifnot() fail; confirm the compared columns never contain NA.
  if (is.null(VAR)) {
    VAR <- var
  } else {
    stopifnot(all(VAR == var))
  }

  stopifnot(!f %in% names(fcs.header))
  fcs.header[[f]] <- header.tbl

  x.list[[f.count]] <- x
  obs.list[[f.count]] <- obs
}
X <- do.call(rbind, x.list)
OBS <- do.call(rbind, obs.list)
# tidy data
# Rename the parameter column to `channel` and replace `desc` with the label
# from the channel mapping.
VAR <- VAR %>% rename(channel = name)
# A channel without an entry in the mapping would otherwise surface as an
# obscure size error inside mutate() (or be silently recycled when exactly
# one value is left), so report it explicitly.
unmapped <- setdiff(as.character(VAR$channel), names(map.channels))
if (length(unmapped) > 0) {
  stop(
    'no entry in the channel mapping for: ',
    paste(unmapped, collapse = ', '),
    call. = FALSE
  )
}
VAR <- VAR %>% mutate(desc = unlist(map.channels[channel]))

# add meta data
# Rows of `meta` are looked up by patient_number; match() returns NA for
# patients that are not in the table, so name them instead of joining
# silently.
indices <- match(OBS$patient_number, meta$patient_number)
missing.patients <- unique(OBS$patient_number[is.na(indices)])
if (length(missing.patients) > 0) {
  warning(
    'no patient metadata row for patient_number: ',
    paste(missing.patients, collapse = ', '),
    call. = FALSE
  )
}
META <- meta[indices,]
OBS <- cbind(OBS, META %>% select(-patient_number))

# create AnnData object
# The per-file FCS header tables are stored under uns$fcs_header.
ad <- AnnData(
  X = X,
  var = VAR,
  obs = OBS,
  uns = list(
    fcs_header = fcs.header
  )
)
#saveRDS(ad, paste(root, level, 'ad.rds', sep='/'))
anndata::write_h5ad(ad, paste(root, level, 'ad.h5ad', sep='/'))