CA_Classification/CA_Class.py at main · MisterMasa/CA_Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Please cite the following article if you use this code:
# Gheibzadeh MS, Manyumwa CV, Tastan Bishop Ö, Shahbani Zahiri H, Parkkila S, Zolfaghari Emameh R.
# Genome Study of α-, β-, and γ-Carbonic Anhydrases from the Thermophilic Microbiome of Marine Hydrothermal Vent Ecosystems.
# Biology (Basel). 2023 May 25;12(6):770. doi: 10.3390/biology12060770.


import re, json
from collections import Counter
from dataclasses import dataclass, field, asdict
from typing import List, Tuple, Optional

# -----------------------------
# Pattern definitions
# -----------------------------
AMINO = r"[ACDEFGHIKLMNPQRSTVWY]"
GAP_MIN, GAP_MAX = 0, 300

ALPHA_PATTERN = re.compile(rf"(H{AMINO}H{AMINO}{{16}}H)")
BETA_PATTERN  = re.compile(rf"(C{AMINO}D{AMINO}R){AMINO}{{{GAP_MIN},{GAP_MAX}}}(H{AMINO}{{2}}C)")
GAMMA_PATTERN = re.compile(rf"(N{AMINO}Q{AMINO}{{5}}H){AMINO}{{{GAP_MIN},{GAP_MAX}}}(H{AMINO}{{4}}H)")

# -----------------------------
# Data structures
# -----------------------------
@dataclass
class MotifMatch:
    family: str
    span: Tuple[int,int]
    segments: List[str]
    gap_len: Optional[int] = None

@dataclass
class ClassificationResult:
    id: str
    sequence: str
    family: str
    matches: List[MotifMatch] = field(default_factory=list)

# -----------------------------
# Motif search functions
# -----------------------------
def find_alpha(seq: str) -> List[MotifMatch]:
    return [MotifMatch("Alpha", m.span(1), [m.group(1)]) for m in ALPHA_PATTERN.finditer(seq)]

def find_beta(seq: str) -> List[MotifMatch]:
    matches=[]
    for m in BETA_PATTERN.finditer(seq):
        gap_len = m.span(2)[0]-m.span(1)[1]
        matches.append(MotifMatch("Beta",(m.span(1)[0],m.span(2)[1]),[m.group(1),m.group(2)],gap_len))
    return matches

def find_gamma(seq: str) -> List[MotifMatch]:
    matches=[]
    for m in GAMMA_PATTERN.finditer(seq):
        gap_len = m.span(2)[0]-m.span(1)[1]
        matches.append(MotifMatch("Gamma",(m.span(1)[0],m.span(2)[1]),[m.group(1),m.group(2)],gap_len))
    return matches

def classify_sequence(seq_id:str, seq:str)->ClassificationResult:
    clean = re.sub(r"[^A-Z]","",seq.upper())
    alpha,beta,gamma = find_alpha(clean),find_beta(clean),find_gamma(clean)
    families=[]
    if alpha: families.append("Alpha")
    if beta:  families.append("Beta")
    if gamma: families.append("Gamma")
    family="None" if not families else families[0] if len(families)==1 else "Ambiguous"
    return ClassificationResult(seq_id,clean,family,alpha+beta+gamma)

# -----------------------------
# Reading FASTA
# -----------------------------
def read_fasta_file(filepath):
    records,seq_id,buf=[],None,[]
    with open(filepath) as f:
        for line in f:
            line=line.strip()
            if not line: continue
            if line.startswith(">"):
                if seq_id: records.append((seq_id,"".join(buf)))
                seq_id=line[1:].strip(); buf=[]
            else: buf.append(line)
    if seq_id: records.append((seq_id,"".join(buf)))
    return records

# -----------------------------
# Writing FASTA outputs
# -----------------------------
def write_fasta(filename, records, line_length=70):
    with open(filename, "w") as f:
        for sid, seq in records:
            f.write(f">{sid}\n")
            # Split sequence into fixed-length lines
            for i in range(0, len(seq), line_length):
                f.write(seq[i:i+line_length] + "\n")

# -----------------------------
# Main execution
# -----------------------------
def main():
    input_path="CA.fasta"   # Input file
    records=read_fasta_file(input_path)
    results=[classify_sequence(sid,seq) for sid,seq in records]

    # Classification for FASTA files
    alpha=[(r.id,r.sequence) for r in results if r.family=="Alpha"]
    beta=[(r.id,r.sequence) for r in results if r.family=="Beta"]
    gamma=[(r.id,r.sequence) for r in results if r.family=="Gamma"]
    none=[(r.id,r.sequence) for r in results if r.family=="None"]

    write_fasta("alpha.txt",alpha)
    write_fasta("beta.txt",beta)
    write_fasta("gamma.txt",gamma)
    write_fasta("none.txt",none)

    # Overall statistics
    counts=Counter([r.family for r in results])
    total=len(results)
    print("Statistics:")
    for fam,count in counts.items():
        print(f"{fam}: {count} ({count/total:.1%})")

    # Save statistics to text file
    with open("statistics.txt", "w") as f:
        f.write("Statistics:\n")
        for fam, count in counts.items():
            f.write(f"{fam}: {count} ({count/total:.1%})\n")

    # Save detailed results to JSON
    with open("detailed_results.json","w") as f:
        json.dump([asdict(r) for r in results], f, ensure_ascii=False, indent=2)

if __name__=="__main__":
    main()