forked from HUBioDataLab/ProtMamba
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_dataset.py
More file actions
25 lines (19 loc) · 795 Bytes
/
create_dataset.py
File metadata and controls
25 lines (19 loc) · 795 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import csv
# Define the input and output file paths
input_file = "/media/ubuntu/8TB/mennan/data/uniref50.fasta"
output_file = 'protein_sequences.csv'


def write_lines_as_csv(input_path: str, output_path: str) -> int:
    """Copy every line of a text file into a one-column CSV file.

    The output starts with a ``sequence`` header row, followed by one row
    per input line with leading/trailing whitespace stripped.

    The input is streamed line by line instead of being loaded with
    ``readlines()``, so memory use stays constant regardless of file size.

    NOTE(review): every input line becomes a row. If the input is a regular
    FASTA file, header lines (starting with ``>``) and each wrapped chunk of
    a multi-line sequence are written as separate rows -- confirm that this
    is what the downstream consumer expects.

    Args:
        input_path: Path of the text file to read.
        output_path: Path of the CSV file to create (overwritten if present).

    Returns:
        The number of data rows written, not counting the header row.

    Raises:
        OSError: If either file cannot be opened.
    """
    row_count = 0
    # The input is opened first, so a missing input file never truncates an
    # existing output file.
    with open(input_path, 'r', encoding='utf-8') as txt_file, \
            open(output_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        # Write the header
        writer.writerow(['sequence'])
        # Iterating the file object yields one line at a time.
        for line in txt_file:
            writer.writerow([line.strip()])
            row_count += 1
    return row_count


if __name__ == "__main__":
    write_lines_as_csv(input_file, output_file)
    print(f"CSV file '{output_file}' created successfully.")