Research_project/final_script.py at main · mathilde733/Research_project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp

# ----------------------------------------------------------------------------------------
# Convert the cleaned expression matrices of the HET, KO and WT samples into
# loom files for downstream pySCENIC analysis.
#
# All three samples go through exactly the same steps, so the conversion lives
# in a single function that is applied to each sample in turn.

# Sample labels; each one expands to 'expr_mat_<label>_clean.tsv' (input) and
# 'expr_mat_<label>_clean.loom' (output). Samples are processed in this order.
SAMPLE_LABELS = ("HET", "KO", "WT")


def _cell_level_attrs(cell_ids, genes_by_cells):
    """Build the loom column (cell-level) attributes.

    Args:
        cell_ids: Cell identifiers, one per column of ``genes_by_cells``.
        genes_by_cells: Expression matrix with one row per gene and one
            column per cell.

    Returns:
        dict with three entries, each holding one value per cell:
        ``"CellID"`` (the identifiers), ``"nGene"`` (number of genes with an
        expression value > 0) and ``"nUMI"`` (sum of the expression values).
    """
    return {
        # Unique cell identifiers
        "CellID": np.array(cell_ids),

        # Number of detected genes per cell (genes with non-zero expression)
        "nGene": np.array(np.sum(genes_by_cells > 0, axis=0)).flatten(),

        # Sum of expression values per cell.
        # NOTE(review): this is a UMI total only if the "clean" matrix still
        # holds raw counts -- confirm against the upstream cleaning step.
        "nUMI": np.array(np.sum(genes_by_cells, axis=0)).flatten(),
    }


def expr_tsv_to_loom(f_exprMat, f_loom_path_unfilt):
    """Read a tab-separated expression matrix and write it out as a loom file.

    The observations of the loaded AnnData object are used as cells
    (``CellID``) and its variables as genes (``Gene``).

    NOTE(review): this means the rows of the TSV are treated as cells and the
    columns as genes. An earlier comment in this script described the TSV as
    genes x cells; if that is the real layout, the AnnData object has to be
    transposed after loading -- confirm against the input files.

    Args:
        f_exprMat: Path to the tab-separated expression matrix. The first
            column is read as row names.
        f_loom_path_unfilt: Path of the loom file to create.
    """
    adata = sc.read_text(f_exprMat, delimiter='\t', first_column_names=True)

    # The loom file is written genes x cells, i.e. the transpose of the
    # AnnData matrix. Take the transpose once and reuse it for both the
    # per-cell summaries and the matrix that is written.
    genes_by_cells = adata.X.transpose()

    # Gene-level metadata (loom row attributes)
    row_attrs = {
        "Gene": np.array(adata.var.index),
    }

    # Cell-level metadata (loom column attributes)
    col_attrs = _cell_level_attrs(adata.obs.index, genes_by_cells)

    # Create the loom file required for downstream pySCENIC analysis
    lp.create(f_loom_path_unfilt, genes_by_cells, row_attrs, col_attrs)


for sample_label in SAMPLE_LABELS:
    expr_tsv_to_loom(
        'expr_mat_{}_clean.tsv'.format(sample_label),
        'expr_mat_{}_clean.loom'.format(sample_label),
    )