-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdataset.py
More file actions
33 lines (29 loc) · 1.26 KB
/
dataset.py
File metadata and controls
33 lines (29 loc) · 1.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from Bio import SeqIO
from datasets import Dataset
import torch
class UniRefDataset(Dataset):
    """Stream protein sequences from a FASTA file as tokenized LM inputs.

    Records are read lazily with ``Bio.SeqIO``, tokenized to a fixed length,
    and returned as dicts of ``input_ids`` / ``attention_mask`` / ``labels``
    (labels are a clone of input_ids; any shifting/masking is presumably done
    by the model — verify against the training loop).

    NOTE(review): this subclasses HuggingFace ``datasets.Dataset`` but
    implements the map-style ``torch.utils.data.Dataset`` protocol
    (``__len__``/``__getitem__``) and never calls ``super().__init__`` —
    ``torch.utils.data.Dataset`` was likely intended; confirm with callers
    before changing the base class.
    """

    def __init__(self, dataset_path: str, tokenizer, device: torch.device, max_len: int, num_data: int = int(1e5)) -> None:
        """Configure the dataset; no file I/O happens here.

        Args:
            dataset_path: Path to a FASTA file parseable by ``SeqIO``.
            tokenizer: HF-style callable tokenizer supporting
                ``padding='max_length'`` and ``return_tensors='pt'``.
            device: Device the encoded tensors are moved to.
            max_len: Fixed token length for padding/truncation.
            num_data: Nominal length reported by ``__len__``; the FASTA file
                is never counted up front, so this is a cap, not a fact.
        """
        self.device = device
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_data = num_data
        self.dataset_path = dataset_path
        # Created lazily so the file is only opened on first access
        # (e.g. inside a DataLoader worker process).
        self.seq_gen = None

    def __len__(self) -> int:
        # Reported length is the configured cap, not the true record count.
        return int(self.num_data)

    def __getitem__(self, index):
        """Return the next tokenized record; ``index`` is ignored (sequential stream).

        Bug fix: the original let ``StopIteration`` escape once the FASTA file
        was exhausted, crashing any consumer that trusts ``len(self)``. If the
        file runs out before ``num_data`` items have been served, the stream
        now wraps around to the beginning of the file.
        """
        if self.seq_gen is None:
            self.seq_gen = self.sequence_generator()
        try:
            return next(self.seq_gen)
        except StopIteration:
            # File exhausted early: restart and cycle. An empty FASTA file
            # would still raise StopIteration here, which is the right signal.
            self.seq_gen = self.sequence_generator()
            return next(self.seq_gen)

    def sequence_generator(self):
        """Yield one tokenized dict per FASTA record, in file order."""
        with open(self.dataset_path) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seq = str(record.seq)
                # return_tensors='pt' yields shape (1, max_len); downstream
                # apparently expects that leading batch dim — confirm before
                # adding a squeeze(0).
                encoded = self.tokenizer(seq, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
                input_ids = encoded['input_ids'].to(self.device)
                attention_mask = encoded['attention_mask'].to(self.device)
                yield {
                    'input_ids': input_ids,
                    'attention_mask': attention_mask,
                    'labels': input_ids.clone()
                }