From 1e4f99b063c145e2ff151c6b33af1bdeea5ed851 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Anh Tran Date: Mon, 27 Jun 2022 12:24:59 -0700 Subject: [PATCH 1/6] Add roberta implementation for MLM using smile strings. --- applications/nlp/Roberta_atom/config.json | 22 ++ applications/nlp/Roberta_atom/dataset.py | 130 ++++++++ .../nlp/Roberta_atom/get_model_config.py | 114 +++++++ applications/nlp/Roberta_atom/main.py | 311 ++++++++++++++++++ python/lbann/models/__init__.py | 2 +- python/lbann/models/roberta.py | 90 ++++- python/lbann/modules/base.py | 21 +- 7 files changed, 678 insertions(+), 12 deletions(-) create mode 100644 applications/nlp/Roberta_atom/config.json create mode 100644 applications/nlp/Roberta_atom/dataset.py create mode 100644 applications/nlp/Roberta_atom/get_model_config.py create mode 100644 applications/nlp/Roberta_atom/main.py diff --git a/applications/nlp/Roberta_atom/config.json b/applications/nlp/Roberta_atom/config.json new file mode 100644 index 00000000000..7d37f79227b --- /dev/null +++ b/applications/nlp/Roberta_atom/config.json @@ -0,0 +1,22 @@ +{ + "architectures": [ + "RobertaForMaskedLM" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "roberta", + "num_attention_heads": 12, + "num_hidden_layers": 6, + "output_past": true, + "pad_token_id": 1, + "type_vocab_size": 1, + "vocab_size": 767 +} diff --git a/applications/nlp/Roberta_atom/dataset.py b/applications/nlp/Roberta_atom/dataset.py new file mode 100644 index 00000000000..09dca567863 --- /dev/null +++ b/applications/nlp/Roberta_atom/dataset.py @@ -0,0 +1,130 @@ +import numpy as np + +bos_index = 0 +eos_index = 2 +pad_index = 1 +ignore_index = -100 +mask_index = 4 +mask_percent = 0.15 + +sequence_length = 48 +vocab_length = 767 +samples = 
np.load("/g/g92/tran71/tran71/lbann_new/applications/nlp/Roberta_zinc_base/zinc250k.npy", allow_pickle=True) + +train_samples = samples[:int(samples.size*0.8)] +val_samples = samples[int(samples.size*0.8):int(samples.size*0.9)] +test_samples = samples[int(samples.size*0.9):] + + +# Masking samples +def masking(sample): + sample_masked = sample.copy() + rand = np.random.uniform(size=(1,sequence_length)) + replace = (rand < mask_percent) * (sample != bos_index) * (sample != eos_index) * (sample != pad_index) + mask_idx = np.nonzero(replace)[1] + for idx in mask_idx: + chance = np.random.uniform() + if(chance < 0.1): #replace with random character excluding special characters + sample_masked[idx] = np.random.randint(5,vocab_length) + elif (0.1 < chance < 0.9): #replace with mask character + sample_masked[idx] = mask_index + return sample_masked,mask_idx + +# Train sample access functions +def get_train_sample(index): + sample = train_samples[index] + if len(sample) < sequence_length: + sample = np.concatenate((sample, np.full(sequence_length-len(sample), pad_index))) + else: + sample = np.resize(sample, sequence_length) + + sample_mask, mask_idx = masking(sample) + + idx = [i for i in range(0,sequence_length)] + non_mask_idx = [i for i in idx if (i not in mask_idx)] + + label = sample.copy() + + label[non_mask_idx] = ignore_index + + sample_all = np.full(3*sequence_length, pad_index, dtype=int) + sample_all[0:len(sample)] = sample + sample_all[sequence_length:2*sequence_length] = sample_mask + sample_all[2*sequence_length:3*sequence_length] = label + + return sample_all + + +# Validation sample access functions +def get_val_sample(index): + sample = val_samples[index] + if len(sample) < sequence_length: + sample = np.concatenate((sample, np.full(sequence_length-len(sample), pad_index))) + else: + sample = np.resize(sample, sequence_length) + + mask_idx = np.random.randint(0,sequence_length) + #print(mask_idx) + + sample_mask = sample.copy() + sample_mask[mask_idx] = 
14 + + idx = [i for i in range(0,sequence_length)] + non_mask_idx = [i for i in idx if (i != mask_idx)] + #print(non_mask_idx) + + label = sample.copy() + + label[non_mask_idx] = ignore_index + + sample_all = np.full(3*sequence_length, pad_index, dtype=int) + sample_all[0:len(sample)] = sample + sample_all[sequence_length:2*sequence_length] = sample_mask + sample_all[2*sequence_length:3*sequence_length] = label + + return sample_all + + +# Test sample access functions +def get_test_sample(index): + sample = test_samples[index] + if len(sample) < sequence_length: + sample = np.concatenate((sample, np.full(sequence_length-len(sample), pad_index))) + else: + sample = np.resize(sample, sequence_length) + + mask_idx = np.random.randint(0,sequence_length) + + sample_mask = sample.copy() + sample_mask[mask_idx] = 14 + + idx = [i for i in range(0,sequence_length)] + non_mask_idx = [i for i in idx if (i != mask_idx)] + + label = sample.copy() + + label[non_mask_idx] = ignore_index + + sample_all = np.full(3*sequence_length, pad_index, dtype=int) + sample_all[0:len(sample)] = sample + sample_all[sequence_length:2*sequence_length] = sample_mask + sample_all[2*sequence_length:3*sequence_length] = label + + return sample_all + +def num_train_samples(): + return train_samples.shape[0] + +def num_val_samples(): + return val_samples.shape[0] + +def num_test_samples(): + return val_samples.shape[0] + +def sample_dims(): + return (3*sequence_length+1,) + +def vocab_size(): + return 767 + + diff --git a/applications/nlp/Roberta_atom/get_model_config.py b/applications/nlp/Roberta_atom/get_model_config.py new file mode 100644 index 00000000000..fca0bd5d50b --- /dev/null +++ b/applications/nlp/Roberta_atom/get_model_config.py @@ -0,0 +1,114 @@ +import sys +import os +import warnings +import itertools +import time +import glob +import urllib.request +import argparse + +import numpy as np +import torch + +files = { + "config.json": 
"https://huggingface.co/seyonec/ChemBERTa-zinc-base-v1/resolve/main/config.json", + "pytorch_model.bin": "https://huggingface.co/seyonec/ChemBERTa-zinc-base-v1/resolve/main/pytorch_model.bin", +} +weights_dir = "pretrained_weights" + + +def download_file(url, fn): + def report_hook(count, block_size, total_size): + duration = int(time.time() - start_time) + progress_size = int(count * block_size / (1024 ** 2)) + percent = min(int(count * block_size * 100 / total_size), 100) + prog_bar = "|" + "#" * int(percent / 2) + "-" * (50 - int(percent / 2)) + "|" + sys.stdout.write( + f"\r{prog_bar} {percent}%, {progress_size} MB, {duration}s elapsed" + ) + sys.stdout.flush() + + if os.path.exists(fn): + warnings.warn(f"File '{fn}' already exists, skipping download") + else: + print(f"\n\nDownloading {fn} from {url}\n") + start_time = time.time() + urllib.request.urlretrieve(url, fn, report_hook) + + +def extract_weights(model, weights_dir): + for name, weights in model.items(): + weights = np.array(weights).astype(np.float32) + np.save(f"./{weights_dir}/{name}.npy", weights) + + +def process_weights(weights_dir): + # Combine layernorm weights and bias to single file + layernorm_files = glob.glob(f"./{weights_dir}/*LayerNorm*.npy") + layernorm_groups = {} + for fn in layernorm_files: + base_fn = fn.split(".LayerNorm")[0] + if base_fn in layernorm_groups: + layernorm_groups[base_fn].append(fn) + else: + layernorm_groups[base_fn] = [fn] + + for base_fn, fns in layernorm_groups.items(): + weight_fn = [fn for fn in fns if "weight.npy" in fn][0] + bias_fn = [fn for fn in fns if "bias.npy" in fn][0] + + weight_bias_vals = np.stack([np.load(weight_fn), np.load(bias_fn)]).T.copy() + np.save(f"{base_fn}.layernorm.weightbias.npy", weight_bias_vals) + + # Combine layer_norm weights and bias to single file + layer_norm_files = glob.glob(f"./{weights_dir}/*layer_norm*.npy") + layer_norm_groups = {} + for fn in layer_norm_files: + base_fn = fn.split(".layer_norm")[0] + if base_fn in 
layer_norm_groups: + layer_norm_groups[base_fn].append(fn) + else: + layer_norm_groups[base_fn] = [fn] + + for base_fn, fns in layer_norm_groups.items(): + weight_fn = [fn for fn in fns if "weight.npy" in fn][0] + bias_fn = [fn for fn in fns if "bias.npy" in fn][0] + + weight_bias_vals = np.stack([np.load(weight_fn), np.load(bias_fn)]).T.copy() + np.save(f"{base_fn}.layer_norm.weightbias.npy", weight_bias_vals) + + # Transpose embedding layer weights + embed_files = [ + glob.glob(f"{weights_dir}/{e}.npy") + for e in ( + "*position_embeddings*", + "*token_type_embeddings*", + "*word_embeddings*", + ) + ] + embed_files = itertools.chain(*embed_files) + for fn in embed_files: + np.save(fn, np.load(fn).T.copy()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--no-weights', action='store_true', help='avoids downloading model weights') + args = parser.parse_args() + + if args.no_weights: + del files['pytorch_model.bin'] + + """Download model from huggingface""" + for fn, url in files.items(): + download_file(url, fn) + + if not args.no_weights: + """ Extract weights """ + if not os.path.exists(weights_dir): + os.makedirs(weights_dir) + model = torch.load("pytorch_model.bin", map_location="cpu") + extract_weights(model, weights_dir) + + """ Process weights for loading into LBANN """ + process_weights(weights_dir) diff --git a/applications/nlp/Roberta_atom/main.py b/applications/nlp/Roberta_atom/main.py new file mode 100644 index 00000000000..595afe688d7 --- /dev/null +++ b/applications/nlp/Roberta_atom/main.py @@ -0,0 +1,311 @@ +from types import SimpleNamespace +import argparse +import datetime +import os +import sys +import json +import numpy as np + +import lbann +import lbann.contrib.args +import lbann.contrib.launcher + +from lbann.models import RoBERTaMLM + + +# Local imports +current_dir = os.path.dirname(os.path.realpath(__file__)) +root_dir = os.path.dirname(current_dir) +sys.path.append(root_dir) +import utils.paths + 
+ + +import dataset +# Dataset properties +vocab_size = dataset.vocab_size() +sequence_length = dataset.sequence_length +pad_index = dataset.pad_index +ignore_index = dataset.ignore_index + +# ---------------------------------------------- +# Options +# ---------------------------------------------- +parser = argparse.ArgumentParser() +parser.add_argument( + "--epochs", + default=51, + type=int, + help="number of epochs to train", +) +parser.add_argument( + "--mini-batch-size", + default=256, + type=int, + help="size of minibatches for training", +) +parser.add_argument( + "--job-name", + action="store", + default="RoBERTa_MLM", + type=str, + help="scheduler job name", + metavar="NAME", +) +parser.add_argument( + "--work-dir", + action="store", + default=None, + type=str, + help="working directory", + metavar="DIR", +) +parser.add_argument("--batch-job", action="store_true", help="submit as batch job") +parser.add_argument( + "--checkpoint", action="store_true", help="checkpoint trainer after every epoch" +) +lbann.contrib.args.add_scheduler_arguments(parser) +lbann_params = parser.parse_args() + + + +# ---------------------------------------------- +# Work directory +# ---------------------------------------------- + +timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') +work_dir = os.path.join( + utils.paths.root_dir(), + 'Roberta_atom/exps', + f'{timestamp}_{lbann_params.job_name}', +) +os.makedirs(work_dir, exist_ok=True) + + + +# ---------------------------------------------- +# Data Reader +# ---------------------------------------------- +def make_data_reader(): + reader = lbann.reader_pb2.DataReader() + + # Train data reader + _reader = reader.reader.add() + _reader.name = "python" + _reader.role = "train" + _reader.shuffle = True + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = "dataset" + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = "get_train_sample" + 
_reader.python.num_samples_function = "num_train_samples" + _reader.python.sample_dims_function = "sample_dims" + + # Validation data reader + _reader = reader.reader.add() + _reader.name = "python" + _reader.role = "validate" + _reader.shuffle = False + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = "dataset" + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = "get_val_sample" + _reader.python.num_samples_function = "num_val_samples" + _reader.python.sample_dims_function = "sample_dims" + + # Test data reader + _reader = reader.reader.add() + _reader.name = "python" + _reader.role = "test" + _reader.shuffle = False + _reader.percent_of_data_to_use = 1.0 + _reader.python.module = "dataset" + _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) + _reader.python.sample_function = "get_test_sample" + _reader.python.num_samples_function = "num_test_samples" + _reader.python.sample_dims_function = "sample_dims" + + return reader + + + +# ---------------------------------------------- +# Build and Run Model +# ---------------------------------------------- +with open("./config.json") as f: + config = json.load(f, object_hook=lambda d: SimpleNamespace(**d)) +config.input_shape = (1,sequence_length) + +config.load_weights = os.path.abspath('./pretrained_weights') + + +# Construct the model + +# Input is 3 sequences of smile string: original string, masked string, label string - every token is -100 (ignore) except the masked token. 
+input_ = lbann.Input(data_field='samples') + + +input_strings = lbann.Identity(lbann.Slice( + input_, + axis=0, + slice_points=[0,sequence_length], + name='input_strings' +)) + +input_masked = lbann.Identity(lbann.Slice( + input_, + axis=0, + slice_points=[sequence_length,2*sequence_length], + name='input_masked' +)) + +input_label = lbann.Identity(lbann.Slice( + input_, + axis=0, + slice_points=[2*sequence_length,3*sequence_length], + name='input_label' +)) + + +robertamlm = RoBERTaMLM(config,load_weights=config.load_weights) +output = robertamlm(input_masked) + +preds_output = lbann.Identity(output,name='pred') + +preds = lbann.ChannelwiseSoftmax(output, name='pred_sm') +preds = lbann.Slice(preds, axis=1, slice_points=range(sequence_length+1),name='slice_pred') +preds = [lbann.Identity(preds) for _ in range(sequence_length)] + + +######## +# Loss +######## + +# Count number of non-pad tokens +label_tokens = lbann.Identity(input_label) +pads = lbann.Constant(value=ignore_index, num_neurons=sequence_length,name='pads') + +is_not_pad = lbann.NotEqual(label_tokens, pads,name='is_not_pad') +num_not_pad = lbann.Reduction(is_not_pad, mode='sum',name='num_not_pad') + +# Cross entropy loss +label_tokens = lbann.Slice( + label_tokens, + slice_points=range(sequence_length+1), + name='label_tokens', + ) + +label_tokens = [lbann.Identity(label_tokens) for _ in range(sequence_length)] + +loss = [] + + +for i in range(sequence_length): + label = lbann.OneHot(label_tokens[i], size=config.vocab_size) + label = lbann.Reshape(label, dims=[config.vocab_size]) + pred = lbann.Reshape(preds[i], dims=[config.vocab_size]) + loss.append(lbann.CrossEntropy(pred, label)) + + +loss = lbann.Concatenation(loss) + + +# Average cross entropy over non-pad tokens +loss_scales = lbann.SafeDivide( + is_not_pad, + lbann.Tessellate(num_not_pad, hint_layer=is_not_pad), + name = 'loss_scale', + ) +loss = lbann.Multiply(loss, loss_scales) + + +obj = lbann.Reduction(loss, mode='sum',name='loss_red') + 
+metrics = [lbann.Metric(obj, name="loss")] + + +########### +# Callbacks +########### + +callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(),] + + +callbacks.append( + lbann.CallbackDumpOutputs( + batch_interval=782, + execution_modes='train', + directory=os.path.join(work_dir, 'train_input'), + layers='input_strings') + ) + + +callbacks.append( + lbann.CallbackDumpOutputs( + batch_interval=782, + execution_modes='train', + directory=os.path.join(work_dir, 'train_output'), + layers='pred ') + ) + +callbacks.append( + lbann.CallbackDumpOutputs( + batch_interval=50, + execution_modes='test', + directory=os.path.join(work_dir, 'test_input'), + layers='input_strings') + ) + +callbacks.append( + lbann.CallbackDumpOutputs( + batch_interval=50, + execution_modes='test', + directory=os.path.join(work_dir, 'test_output'), + layers='pred') + ) + + +callbacks.append( + lbann.CallbackDumpWeights( + directory=os.path.join(work_dir, 'weights'), + epoch_interval=1, + ) + ) + + +model = lbann.Model( + lbann_params.epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=obj, + metrics=metrics, + callbacks=callbacks, +) + +# Setup trainer, optimizer, data_reader +trainer = lbann.Trainer( + mini_batch_size=lbann_params.mini_batch_size, + num_parallel_readers=1, +) +optimizer = lbann.Adam( + learn_rate=0.0001, + beta1=0.9, + beta2=0.98, + eps=1e-8, +) +data_reader = make_data_reader() + +# Launch LBANN +kwargs = lbann.contrib.args.get_scheduler_kwargs(lbann_params) +kwargs["environment"] = {} +lbann.contrib.launcher.run( + trainer, + model, + data_reader, + optimizer, + work_dir=work_dir, + job_name=lbann_params.job_name, + lbann_args=["--num_io_threads=1"], + batch_job=lbann_params.batch_job, + **kwargs, +) diff --git a/python/lbann/models/__init__.py b/python/lbann/models/__init__.py index 80d84d536c0..7281cb85453 100644 --- a/python/lbann/models/__init__.py +++ b/python/lbann/models/__init__.py @@ -1,5 +1,5 @@ from lbann.models.alexnet import AlexNet from 
lbann.models.lenet import LeNet from lbann.models.resnet import ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152 -from lbann.models.roberta import RoBERTa +from lbann.models.roberta import RoBERTa, RoBERTaMLM from lbann.models.transformer import Transformer, TransformerEncoderLayer, TransformerDecoderLayer diff --git a/python/lbann/models/roberta.py b/python/lbann/models/roberta.py index b00d933b4cd..72cfd0b4eb5 100644 --- a/python/lbann/models/roberta.py +++ b/python/lbann/models/roberta.py @@ -16,6 +16,7 @@ } + def create_position_ids_from_input_ids( input_ids, input_shape, padding_idx, past_key_values_length=0 ): @@ -58,7 +59,6 @@ def _load_pretrained_weights( weights = weights[0] return weights - class RobertaEmbeddings(lbann.modules.Module): def __init__(self, config, name, load_weights=True): super().__init__() @@ -627,3 +627,91 @@ def forward( return pooled_output else: return encoder_output + +class RobertaLMHead(lbann.modules.Module): + """Roberta Head for masked language modeling.""" + + def __init__(self, config, name,load_weights=True): + self.config = config + + # A custom directory can be passed instead of True/False + if isinstance(load_weights, str): + if not os.path.isdir(load_weights): + raise ValueError(f"Path to pretrained weights does not exist: {load_weights}") + + self.input_shape = config.input_shape + (config.hidden_size,) + self.hidden_size = config.hidden_size + self.vocab_size = config.vocab_size + self.hidden_dropout_prob = config.hidden_dropout_prob + self.layer_norm_eps = config.layer_norm_eps + self.name = name + self.load_weights = load_weights + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, input_tensor): + + #x = self.dense(features) + hidden_states, hidden_shape = lbann.modules.PytorchLinear( + input_tensor, + self.input_shape, + self.hidden_size, + weights=_load_pretrained_weights( + 
".".join((self.name, "dense.weight")), + ".".join((self.name, "dense.bias")), + load_weights=self.load_weights, + ), + name=".".join((self.name, "dense")), + return_dims=True, + ) + + #x = gelu(x) + hidden_states = self.intermediate_act_fn(hidden_states) + + #x = self.layer_norm(x) + hidden_states = lbann.modules.PytorchLayerNorm( + lbann.Add(hidden_states, input_tensor), + self.layer_norm_eps, + hidden_shape, + weights=_load_pretrained_weights( + ".".join((self.name, "layer_norm.weightbias")), + load_weights=self.load_weights, + ), + name=".".join((self.name, "layer_norm")), + ) + + #x = self.decoder(x) + hidden_states, hidden_shape = lbann.modules.PytorchLinear( + input_tensor, + hidden_shape, + self.vocab_size, + weights=_load_pretrained_weights( + ".".join((self.name, "decoder.weight")), + ".".join((self.name, "decoder.bias")), + load_weights=self.load_weights, + ), + name=".".join((self.name, "decoder")), + return_dims=True, + ) + + return hidden_states + +class RoBERTaMLM(lbann.modules.Module): + def __init__(self, config, load_weights=True): + + # A custom directory can be passed instead of True/False + if isinstance(load_weights, str): + if not os.path.isdir(load_weights): + raise ValueError(f"Path to pretrained weights does not exist: {load_weights}") + + self.roberta = RoBERTa(config, add_pooling_layer=False, load_weights=load_weights) + self.lm_head = RobertaLMHead(config, "lm_head",load_weights=load_weights) + + def forward(self,input_ids): + + output = self.roberta(input_ids) + output = self.lm_head(output) + + return output diff --git a/python/lbann/modules/base.py b/python/lbann/modules/base.py index 92bae3a28e9..4338c9524bb 100644 --- a/python/lbann/modules/base.py +++ b/python/lbann/modules/base.py @@ -64,8 +64,8 @@ def __init__(self, weights (`Weights` or iterator of `Weights`): Weights in fully-connected layer. There are at most two: the matrix and the bias. 
If weights are not provided, the - matrix will be initialized with He normal - initialization and the bias with zeros. + matrix will be initialized with Lecun uniform + initialization and the bias with Lecun uniform. name (str): Default name is in the form 'fcmodule'. data_layout (str): Data layout. parallel_strategy (dict): Data partitioning scheme. @@ -86,8 +86,8 @@ def __init__(self, # Initialize weights # Note: If weights are not provided, matrix weights are - # initialized with He normal scheme and bias weights are - # initialized with zeros. + # initialized with Lecun uniform scheme and bias weights are + # initialized with Lecun uniform scheme. self.weights = list(make_iterable(weights)) if len(self.weights) > 2: raise ValueError('`FullyConnectedModule` has ' @@ -95,11 +95,11 @@ def __init__(self, 'but got {0}'.format(len(self.weights))) if len(self.weights) == 0: self.weights.append( - lbann.Weights(initializer=lbann.HeNormalInitializer(), + lbann.Weights(initializer=lbann.LeCunUniformInitializer(), name=self.name+'_matrix')) if self.bias and len(self.weights) == 1: self.weights.append( - lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), + lbann.Weights(initializer=lbann.LeCunUniformInitializer(), name=self.name+'_bias')) # Initialize activation layer @@ -158,8 +158,9 @@ def __init__(self, weights (`Weights` or iterator of `Weights`): Weights in fully-connected layer. There are at most two: the matrix and the bias. If weights are not provided, the - matrix will be initialized with He normal - initialization and the bias with zeros. + matrix will be initialized with Lecun uniform + initialization and the bias Lecun uniform + initialization. activation (type): Layer class for activation function. name (str): Default name is in the form 'channelwisefc'. parallel_strategy (dict): Data partitioning scheme. 
@@ -183,11 +184,11 @@ def __init__(self, 'but got {0}'.format(len(self.weights))) if len(self.weights) == 0: self.weights.append( - lbann.Weights(initializer=lbann.HeNormalInitializer(), + lbann.Weights(initializer=lbann.LeCunUniformInitializer(), name=self.name+'_matrix')) if self.bias and len(self.weights) == 1: self.weights.append( - lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), + lbann.Weights(initializer=lbann.LeCunUniformInitializer(), name=self.name+'_bias')) self.activation = None if activation: From 0811233c174ab8a6429660b18b33def4d56ad7e1 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Anh Tran Date: Mon, 27 Jun 2022 12:41:02 -0700 Subject: [PATCH 2/6] Add re-tokenizing script. --- applications/nlp/Roberta_atom/re_tokenize.py | 53 ++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 applications/nlp/Roberta_atom/re_tokenize.py diff --git a/applications/nlp/Roberta_atom/re_tokenize.py b/applications/nlp/Roberta_atom/re_tokenize.py new file mode 100644 index 00000000000..3207e98d030 --- /dev/null +++ b/applications/nlp/Roberta_atom/re_tokenize.py @@ -0,0 +1,53 @@ +import numpy as np +import pandas as pd + +from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline + +''' +Re-tokenize SMILE string using Huggingface tokenizers. 
+ +''' + +model = AutoModelForMaskedLM.from_pretrained("seyonec/ChemBERTa-zinc-base-v1") +tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1") + + +def detokenize(inp,vocab): + ''' + Convert the tokenized zinc to SMILE strings + ''' + output = "" + for i in inp: + token = list(vocab.keys())[list(vocab.values()).index(int(i))] + if(token ==''): + break + if(token[0]!='<'): + output = output+token + + return output + + +samples = np.load("moses_zinc_train250K.npy", allow_pickle=True) + +vocab_file = "vocab_train.txt" + +vocab = pd.read_csv(vocab_file, delimiter=" ", header=None, quoting=3).to_dict()[0] +vocab = dict([(v,k) for k,v in vocab.items()]) + + +samples = [detokenize(i_x,vocab) for i_x in samples] + +print(len(samples)) + +smiles_tokenized=[] + +for s in samples: + tokenize = tokenizer.encode(s) + del tokenize[-2] # remove extra character before + smiles_tokenized.append(tokenize) + +print(len(smiles_tokenized)) + +outfile = "zinc250k.npy" + +np.save(outfile, smiles_tokenized) From f149e8226698460ad4deecaf9733922ea98618f1 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Anh Tran Date: Wed, 29 Jun 2022 13:11:48 -0700 Subject: [PATCH 3/6] Updating masking function based on HuggingFace implementation. 
--- applications/nlp/Roberta_atom/dataset.py | 83 ++++++++++-------------- 1 file changed, 36 insertions(+), 47 deletions(-) diff --git a/applications/nlp/Roberta_atom/dataset.py b/applications/nlp/Roberta_atom/dataset.py index 09dca567863..483ff6bc3bf 100644 --- a/applications/nlp/Roberta_atom/dataset.py +++ b/applications/nlp/Roberta_atom/dataset.py @@ -16,19 +16,37 @@ test_samples = samples[int(samples.size*0.9):] + # Masking samples -def masking(sample): - sample_masked = sample.copy() - rand = np.random.uniform(size=(1,sequence_length)) - replace = (rand < mask_percent) * (sample != bos_index) * (sample != eos_index) * (sample != pad_index) - mask_idx = np.nonzero(replace)[1] - for idx in mask_idx: - chance = np.random.uniform() - if(chance < 0.1): #replace with random character excluding special characters - sample_masked[idx] = np.random.randint(5,vocab_length) - elif (0.1 < chance < 0.9): #replace with mask character - sample_masked[idx] = mask_index - return sample_masked,mask_idx +''' +https://github.com/huggingface/transformers/blob/v4.20.1/src/transformers/data/data_collator.py#L805 +''' +def masking(sample, mlm_probability = 0.15): + + masked = np.copy(sample) + label = np.copy(sample) + + special_tokens_mask = (sample == bos_index) + (sample == eos_index) + (sample == pad_index) + + probability_matrix = np.full(sample.shape, mlm_probability) + + probability_matrix[special_tokens_mask] = 0 + + masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool) + + label[~masked_indices] = ignore_index + + indices_replaced = np.random.binomial(1, 0.8, size=sample.shape).astype(bool) & masked_indices + + masked[indices_replaced] = mask_index + + indices_random = (np.random.binomial(1, 0.5, size=sample.shape).astype(bool) & masked_indices & ~indices_replaced) + + random_words = np.random.randint(low=5, high=vocab_length, size=np.count_nonzero(indices_random), dtype=np.int64) + + masked[indices_random] = random_words + + 
return sample,masked,label # Train sample access functions def get_train_sample(index): @@ -38,18 +56,11 @@ def get_train_sample(index): else: sample = np.resize(sample, sequence_length) - sample_mask, mask_idx = masking(sample) - - idx = [i for i in range(0,sequence_length)] - non_mask_idx = [i for i in idx if (i not in mask_idx)] - - label = sample.copy() - - label[non_mask_idx] = ignore_index + sample,masked,label = masking(sample) sample_all = np.full(3*sequence_length, pad_index, dtype=int) sample_all[0:len(sample)] = sample - sample_all[sequence_length:2*sequence_length] = sample_mask + sample_all[sequence_length:2*sequence_length] = masked sample_all[2*sequence_length:3*sequence_length] = label return sample_all @@ -63,23 +74,11 @@ def get_val_sample(index): else: sample = np.resize(sample, sequence_length) - mask_idx = np.random.randint(0,sequence_length) - #print(mask_idx) - - sample_mask = sample.copy() - sample_mask[mask_idx] = 14 - - idx = [i for i in range(0,sequence_length)] - non_mask_idx = [i for i in idx if (i != mask_idx)] - #print(non_mask_idx) - - label = sample.copy() - - label[non_mask_idx] = ignore_index + sample,masked,label = masking(sample) sample_all = np.full(3*sequence_length, pad_index, dtype=int) sample_all[0:len(sample)] = sample - sample_all[sequence_length:2*sequence_length] = sample_mask + sample_all[sequence_length:2*sequence_length] = masked sample_all[2*sequence_length:3*sequence_length] = label return sample_all @@ -93,21 +92,11 @@ def get_test_sample(index): else: sample = np.resize(sample, sequence_length) - mask_idx = np.random.randint(0,sequence_length) - - sample_mask = sample.copy() - sample_mask[mask_idx] = 14 - - idx = [i for i in range(0,sequence_length)] - non_mask_idx = [i for i in idx if (i != mask_idx)] - - label = sample.copy() - - label[non_mask_idx] = ignore_index + sample,masked,label = masking(sample) sample_all = np.full(3*sequence_length, pad_index, dtype=int) sample_all[0:len(sample)] = sample - 
sample_all[sequence_length:2*sequence_length] = sample_mask + sample_all[sequence_length:2*sequence_length] = masked sample_all[2*sequence_length:3*sequence_length] = label return sample_all From 4fd136f98d85bd773e2dd2df0ac3028cabbc55cd Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Anh Tran Date: Tue, 26 Jul 2022 16:35:12 -0700 Subject: [PATCH 4/6] Fix weights initialization. Update datareader. --- applications/nlp/Roberta_atom/dataset.py | 64 ++++++++++++++++++------ python/lbann/modules/base.py | 21 ++++---- 2 files changed, 58 insertions(+), 27 deletions(-) diff --git a/applications/nlp/Roberta_atom/dataset.py b/applications/nlp/Roberta_atom/dataset.py index 483ff6bc3bf..71b27997196 100644 --- a/applications/nlp/Roberta_atom/dataset.py +++ b/applications/nlp/Roberta_atom/dataset.py @@ -1,22 +1,22 @@ import numpy as np -bos_index = 0 -eos_index = 2 -pad_index = 1 +bos_index = 12 +eos_index = 13 +pad_index = 0 ignore_index = -100 -mask_index = 4 +mask_index = 14 mask_percent = 0.15 -sequence_length = 48 -vocab_length = 767 -samples = np.load("/g/g92/tran71/tran71/lbann_new/applications/nlp/Roberta_zinc_base/zinc250k.npy", allow_pickle=True) +sequence_length = 57 +vocab_length = 600 +samples = np.load("/g/g92/tran71/ChemBERTa-77M-MLM_smile2M.npy", allow_pickle=True) + train_samples = samples[:int(samples.size*0.8)] val_samples = samples[int(samples.size*0.8):int(samples.size*0.9)] test_samples = samples[int(samples.size*0.9):] - # Masking samples ''' https://github.com/huggingface/transformers/blob/v4.20.1/src/transformers/data/data_collator.py#L805 @@ -83,23 +83,56 @@ def get_val_sample(index): return sample_all - -# Test sample access functions +# Testing sample access functions def get_test_sample(index): sample = test_samples[index] + rand_size = np.random.randint(1, 4) + + if len(sample) < sequence_length: + mask_idx = sorted(np.random.randint(1,len(sample)-1,size=rand_size)) + else: + mask_idx = sorted(np.random.randint(1,sequence_length-1,size=rand_size)) + + + 
sample_mask = sample.copy() + + op = np.random.randint(0, 3) + + if(op == 0): # Delete augmentation + for idx in mask_idx: + if(idx < len(sample_mask)-2): # Don't mask the last character for deletion. + sample_mask[idx] = mask_index + + del_count = 0 + for idx in mask_idx: + if(idx < len(sample_mask)-1): # Don't delete EoS. + sample_mask = np.delete(sample_mask, idx+1+del_count) + del_count -= 1 + elif(op == 1): # Insert augmentation + for idx in mask_idx: + sample_mask = np.insert(sample_mask, idx, mask_index) + else: # Replace augmentation + for idx in mask_idx: + sample_mask[idx] = mask_index + if len(sample) < sequence_length: sample = np.concatenate((sample, np.full(sequence_length-len(sample), pad_index))) + else: sample = np.resize(sample, sequence_length) - sample,masked,label = masking(sample) + if len(sample_mask) < sequence_length: + sample_mask = np.concatenate((sample_mask, np.full(sequence_length-len(sample_mask), pad_index))) + + else: + sample_mask = np.resize(sample_mask, sequence_length) sample_all = np.full(3*sequence_length, pad_index, dtype=int) sample_all[0:len(sample)] = sample - sample_all[sequence_length:2*sequence_length] = masked - sample_all[2*sequence_length:3*sequence_length] = label + sample_all[sequence_length:2*sequence_length] = sample_mask + sample_all[2*sequence_length:3*sequence_length] = sample - return sample_all + return sample_all def num_train_samples(): return train_samples.shape[0] @@ -114,6 +147,5 @@ def sample_dims(): return (3*sequence_length+1,) def vocab_size(): - return 767 - + return vocab_length diff --git a/python/lbann/modules/base.py b/python/lbann/modules/base.py index 4338c9524bb..92bae3a28e9 100644 --- a/python/lbann/modules/base.py +++ b/python/lbann/modules/base.py @@ -64,8 +64,8 @@ def __init__(self, weights (`Weights` or iterator of `Weights`): Weights in fully-connected layer. There are at most two: the matrix and the bias. 
If weights are not provided, the - matrix will be initialized with Lecun uniform - initialization and the bias with Lecun uniform. + matrix will be initialized with He normal + initialization and the bias with zeros. name (str): Default name is in the form 'fcmodule'. data_layout (str): Data layout. parallel_strategy (dict): Data partitioning scheme. @@ -86,8 +86,8 @@ def __init__(self, # Initialize weights # Note: If weights are not provided, matrix weights are - # initialized with Lecun uniform scheme and bias weights are - # initialized with Lecun uniform scheme. + # initialized with He normal scheme and bias weights are + # initialized with zeros. self.weights = list(make_iterable(weights)) if len(self.weights) > 2: raise ValueError('`FullyConnectedModule` has ' @@ -95,11 +95,11 @@ def __init__(self, 'but got {0}'.format(len(self.weights))) if len(self.weights) == 0: self.weights.append( - lbann.Weights(initializer=lbann.LeCunUniformInitializer(), + lbann.Weights(initializer=lbann.HeNormalInitializer(), name=self.name+'_matrix')) if self.bias and len(self.weights) == 1: self.weights.append( - lbann.Weights(initializer=lbann.LeCunUniformInitializer(), + lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), name=self.name+'_bias')) # Initialize activation layer @@ -158,9 +158,8 @@ def __init__(self, weights (`Weights` or iterator of `Weights`): Weights in fully-connected layer. There are at most two: the matrix and the bias. If weights are not provided, the - matrix will be initialized with Lecun uniform - initialization and the bias Lecun uniform - initialization. + matrix will be initialized with He normal + initialization and the bias with zeros. activation (type): Layer class for activation function. name (str): Default name is in the form 'channelwisefc'. parallel_strategy (dict): Data partitioning scheme. 
# ---------------------------------------------------------------------------
# Patch-series residue that shared this physical line with the code below.
# It is not part of evaluation.py; it is kept verbatim (as comments) so that
# no content of the patch series is lost.
#
# @@ -184,11 +183,11 @@ def __init__(self,
#                               'but got {0}'.format(len(self.weights)))
#          if len(self.weights) == 0:
#              self.weights.append(
# -                lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
# +                lbann.Weights(initializer=lbann.HeNormalInitializer(),
#                                name=self.name+'_matrix'))
#          if self.bias and len(self.weights) == 1:
#              self.weights.append(
# -                lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
# +                lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
#                                name=self.name+'_bias'))
#          self.activation = None
#          if activation:
#
# From e7a922c0f1658db2ef2743aa6253479cd59b3fcf Mon Sep 17 00:00:00 2001
# From: Tuan Nguyen Anh Tran
# Date: Wed, 27 Jul 2022 11:34:21 -0700
# Subject: [PATCH 5/6] Add evaluation script.
# ---
#  applications/nlp/Roberta_atom/evaluation.py | 290 ++++++++++++++++++++
#  1 file changed, 290 insertions(+)
#  create mode 100644 applications/nlp/Roberta_atom/evaluation.py
#
# diff --git a/applications/nlp/Roberta_atom/evaluation.py b/applications/nlp/Roberta_atom/evaluation.py
# new file mode 100644
# index 00000000000..9c827cd650b
# --- /dev/null
# +++ b/applications/nlp/Roberta_atom/evaluation.py
# @@ -0,0 +1,290 @@
# ---------------------------------------------------------------------------

import numpy as np
import pandas as pd
import sys
import glob
import torch
import itertools

# RDKit is only used by compare_decoded_to_original_smiles; importing it
# defensively keeps the tensor/CSV helpers in this file usable on machines
# where RDKit is not installed.
try:
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem import AllChem
    from rdkit import RDLogger
except ImportError:
    Chem = DataStructs = AllChem = RDLogger = None


seq_length = 57
vocab_length = 600


def detokenize(inp, vocab):
    """Turn a sequence of token ids back into a string.

    Args:
        inp: Iterable of token ids. Every element is coerced with ``int()``,
            so float ids (e.g. rows produced by ``np.loadtxt``) are accepted.
        vocab: Mapping from token string to token id.

    Returns:
        The concatenation of the decoded tokens up to, but excluding, the
        first ``'[SEP]'``. ``'[CLS]'`` and ``'[PAD]'`` tokens are skipped.

    Raises:
        ValueError: If an id that occurs before the first ``'[SEP]'`` has no
            entry in ``vocab``.
    """
    # Invert the vocabulary once instead of rebuilding key/value lists for
    # every token. ``setdefault`` keeps the first token for a duplicated id,
    # which is what a linear ``list.index`` search would return.
    id_to_token = {}
    for token, token_id in vocab.items():
        id_to_token.setdefault(token_id, token)

    pieces = []
    for raw_id in inp:
        token_id = int(raw_id)
        if token_id not in id_to_token:
            raise ValueError("{0} is not in the vocabulary".format(token_id))
        token = id_to_token[token_id]
        if token == '[SEP]':
            break
        if token != '[CLS]' and token != '[PAD]':
            pieces.append(token)

    return "".join(pieces)


def softmax(x):
    """Compute softmax values for each set of scores in x (along axis 0).

    The maximum along axis 0 is subtracted before exponentiating. This does
    not change the result mathematically but prevents ``exp`` from
    overflowing to ``inf`` (and the quotient from becoming ``nan``) when the
    scores are large.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction.
        return np.exp(x)
    exps = np.exp(x - np.max(x, axis=0))
    return exps / np.sum(exps, axis=0)


def read_csv(fdir):
    """Load every ``*_output0.csv`` file found under the prefix ``fdir``.

    Args:
        fdir: Path prefix that is concatenated directly with the pattern
            ``"*_output0.csv"`` (so a directory needs its trailing
            separator).

    Returns:
        A single matching file is returned exactly as ``np.loadtxt`` parses
        it. Several matching files are each loaded as a 2-D array and
        stacked row-wise, in sorted file-name order.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    pattern = fdir + "*_output0.csv"
    # glob returns files in arbitrary order; sort so that the row order is
    # reproducible from run to run.
    input_files = sorted(glob.glob(pattern))
    if not input_files:
        raise FileNotFoundError("no files matching {0!r}".format(pattern))

    if len(input_files) > 1:
        # np.concatenate needs a real sequence (a generator is rejected);
        # ndmin=2 keeps one-row files stackable with multi-row ones.
        return np.concatenate(
            [np.loadtxt(f, delimiter=",", ndmin=2) for f in input_files])

    return np.loadtxt(input_files[0], delimiter=",")

def get_masked_index(array, mask_value):
    """Find the positions equal to ``mask_value`` in every row of ``array``.

    Args:
        array: Array whose first axis indexes samples.
        mask_value: Value to look for.

    Returns:
        A list with one index array per sample (empty when the sample does
        not contain ``mask_value``).
    """
    return [np.where(row == mask_value)[0] for row in array]


def process_output_topk(input_mask, output, mask_index, k=5, vocab_size=None):
    """Expand each masked sample into every top-k fill-in combination.

    Args:
        input_mask: Array of masked samples, one per row.
        output: Array of scores, one row per sample. The scores for position
            ``i`` are read from columns ``vocab_size*i`` up to
            ``vocab_size*(i+1)``.
        mask_index: Per-sample arrays of masked positions, as returned by
            ``get_masked_index``.
        k: Number of best-scoring token ids kept per masked position.
        vocab_size: Number of score columns per position. Defaults to the
            module-level ``vocab_length``.

    Returns:
        A flat list of copies of the masked samples in which the masked
        positions are replaced by every combination of the per-position
        top-k ids: ``k ** len(mask_index[j])`` rows for sample ``j``.
    """
    if vocab_size is None:
        vocab_size = vocab_length

    candidates = []
    for j in range(input_mask.shape[0]):
        positions = mask_index[j]
        top_ids = []
        for pos in positions:
            scores = torch.from_numpy(
                output[j, vocab_size * pos:vocab_size * (pos + 1)])
            _, best = torch.topk(scores, k)
            top_ids.append(best.numpy())

        # With no masked position, product() yields a single empty choice,
        # so the sample is emitted once, matching process_input_topk.
        for choice in itertools.product(*top_ids):
            filled = input_mask[j].copy()
            for pos, token_id in zip(positions, choice):
                filled[pos] = token_id
            candidates.append(filled)

    return candidates


def process_input_topk(input, mask_index, k=5):
    """Repeat each input row so it lines up with ``process_output_topk``.

    Sample ``i`` is repeated ``k ** len(mask_index[i])`` times.

    Returns:
        A flat list of rows of ``input``.
    """
    repeated = []
    for i in range(input.shape[0]):
        repeated.extend(input[i] for _ in range(k ** len(mask_index[i])))
    return repeated


def _load_token_rows(pattern):
    """Load every CSV matching *pattern* and stack the rows into a 2-D array."""
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError("no files matching {0!r}".format(pattern))
    # ndmin=2 keeps a file holding a single sample two-dimensional.
    return np.concatenate(
        [np.loadtxt(p, delimiter=",", ndmin=2) for p in paths])


def get_smiles_from_lbann_tensors(fdir, vocab_path):
    """Decode the saved token tensors to SMILES text files.

    Reads ``fdir + "inps.csv"`` and ``fdir + "preds.csv"``, detokenizes every
    row with the vocabulary stored at ``vocab_path`` and writes the results,
    under a ``SMILES`` column, to ``gt_smiles.txt`` and ``pred_smiles.txt``
    in the current working directory.

    Args:
        fdir: Path prefix of the ``inps.csv`` / ``preds.csv`` files.
        vocab_path: Space-delimited, header-less vocabulary file; the token
            in the first column of row ``n`` is given id ``n``.

    Raises:
        FileNotFoundError: If one of the two CSV files cannot be found.
    """
    # The vocabulary is read from the ``vocab_path`` argument (the function
    # used to ignore it in favour of a module-level global).
    # quoting=3 is csv.QUOTE_NONE: quote characters are ordinary tokens.
    id_to_token = pd.read_csv(vocab_path, delimiter=" ", header=None,
                              quoting=3).to_dict()[0]
    vocab = {token: token_id for token_id, token in id_to_token.items()}

    for csv_name, out_name in (("inps.csv", "gt_" + "smiles.txt"),
                               ("preds.csv", "pred_" + "smiles.txt")):
        rows = _load_token_rows(fdir + csv_name)
        smiles = [detokenize(row, vocab) for row in rows]
        pd.DataFrame(smiles, columns=['SMILES']).to_csv(out_name, index=False)


def compare_decoded_to_original_smiles(orig_smiles, decoded_smiles, output_file=None):
    """
    Compare decoded to original SMILES strings and output a table of Tanimoto distances, along with
    binary flags for whether the strings are the same and whether the decoded string is valid SMILES.
    orig_smiles and decoded_smiles are lists or arrays of strings.
    If an output file name is provided, the table will be written to it as a CSV file.
    Returns the table as a DataFrame.

    Columns added to the table:
        is_valid: 'x' when the original string cannot be parsed by RDKit,
            otherwise 1/0 for whether the decoded string can be parsed.
        is_same: 1 when decoded and original strings are equal, else 0.
        smile_accuracy: percentage of positions of the original string whose
            character matches the decoded string (0.0 for an empty original).
        tanimoto_distance: -1 when the original is unparsable, 1 when the
            decoded string is unparsable, otherwise 1 - Tanimoto similarity
            of the two Morgan fingerprints (radius 2, 1024 bits).
        total_avg_accuracy: mean of smile_accuracy, repeated on every row.
    """
    # Imported here so the RDKit-free helpers of this file stay importable
    # without RDKit; a missing install surfaces as a clear ImportError.
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem import AllChem
    from rdkit import RDLogger

    # Silence RDKit parse errors once, up front, rather than from inside the
    # loop after the first molecule has already been parsed.
    RDLogger.DisableLog('rdApp.*')

    res_df = pd.DataFrame(dict(original=orig_smiles, decoded=decoded_smiles))
    is_valid = []
    is_same = []
    tani_dist = []
    accuracy = []
    for row in res_df.itertuples():
        # Character-by-character accuracy over the overlapping prefix.
        hits = sum(1 for x, y in zip(row.original, row.decoded) if x == y)
        if row.original:
            accuracy.append((hits / len(row.original)) * 100)
        else:
            # An empty original has nothing to match; avoid dividing by zero.
            accuracy.append(0.0)

        is_same.append(int(row.decoded == row.original))
        orig_mol = Chem.MolFromSmiles(row.original)
        if orig_mol is None:
            # Note, input may be invalid, if original SMILE string is truncated
            is_valid.append('x')
            tani_dist.append(-1)
            continue
        dec_mol = Chem.MolFromSmiles(row.decoded)
        if dec_mol is None:
            is_valid.append(0)
            tani_dist.append(1)
        else:
            is_valid.append(1)
            orig_fp = AllChem.GetMorganFingerprintAsBitVect(orig_mol, 2, 1024)
            dec_fp = AllChem.GetMorganFingerprintAsBitVect(dec_mol, 2, 1024)
            tani_sim = DataStructs.FingerprintSimilarity(
                orig_fp, dec_fp, metric=DataStructs.TanimotoSimilarity)
            tani_dist.append(1.0 - tani_sim)

    res_df['is_valid'] = is_valid
    res_df['is_same'] = is_same
    res_df['smile_accuracy'] = accuracy
    res_df['tanimoto_distance'] = tani_dist
    global_acc = np.mean(np.array(accuracy))
    res_df['total_avg_accuracy'] = [global_acc] * len(accuracy)

    if output_file is not None:
        output_columns = ['original', 'decoded', 'is_valid', 'is_same',
                          'smile_accuracy', 'tanimoto_distance',
                          'total_avg_accuracy']
        res_df.to_csv(output_file, index=False, columns=output_columns)
    return res_df


def read_smiles_csv(path):
    """Return the ``SMILES`` column of the CSV file at ``path`` as a list of str."""
    # Select the column explicitly: the ``squeeze`` keyword of read_csv is
    # not available in pandas 2.x.
    return pd.read_csv(path, usecols=['SMILES'])['SMILES'].astype(str).tolist()


def _flag(value):
    """Normalise a table cell to text so that 1, 1.0 and '1' all give '1'."""
    if isinstance(value, (bool, np.bool_)):
        return '1' if value else '0'
    text = str(value).strip()
    try:
        number = float(text)
    except ValueError:
        return text
    return str(int(number)) if number.is_integer() else text


def get_valid_single_string(ins):
    """Pick the first row whose ``is_valid`` cell (column 2) is 'x' or 1.

    The cell is normalised first, so the choice does not depend on whether
    the column was parsed as integers or as strings.

    Returns:
        The first matching row, or ``ins[0]`` when no row matches.
    """
    for row in ins:
        if _flag(row[2]) in ('x', '1'):
            return row

    return ins[0]


def get_valid(ins, mask_index, k=5):
    """Reduce the expanded candidate table to one row per original sample.

    ``ins`` is consumed in consecutive groups of ``k ** len(mask_index[i])``
    rows (the layout produced by the ``*_topk`` helpers), and
    ``get_valid_single_string`` selects one row from each group.

    Returns:
        A list with one selected row per entry of ``mask_index``.
    """
    chosen = []
    start = 0
    for positions in mask_index:
        stop = start + k ** len(positions)
        chosen.append(get_valid_single_string(ins[start:stop]))
        start = stop

    return chosen


def get_metric(ins):
    """Compute the validity and exact-match ratios of a result table.

    Column 2 of ``ins`` is the ``is_valid`` flag ('x', 0 or 1) and column 3
    the ``is_same`` flag. Cells are normalised, so integer and string
    encodings are counted alike. Rows flagged 'x' are excluded from the
    denominator of both ratios.

    Returns:
        Tuple ``(valid_ratio, same_ratio)``; ``(0.0, 0.0)`` when no row is
        left to score.
    """
    valid_flags = [_flag(v) for v in ins[:, 2].flatten()]
    same_flags = [_flag(v) for v in ins[:, 3].flatten()]

    total_valid = valid_flags.count('1')
    skipped = valid_flags.count('x')
    total_same = same_flags.count('1')

    scored = len(valid_flags) - skipped
    if scored == 0:
        return 0.0, 0.0
    return total_valid / scored, total_same / scored


# Files directory
fdir_ouput = 'output/'
fdir_input = 'input/'
fdir_mask = 'mask/'
fdir = ''
vocab_file = 'vocab_600.txt'

# Token id searched for in the masked inputs.
mask = 14


def main():
    """Run the evaluation pipeline on the directories configured above."""
    # Get masked smile
    input_masked_smile = read_csv(fdir_mask)
    mask_index = get_masked_index(input_masked_smile, mask)

    # Save input
    input_smile = read_csv(fdir_input)
    processed_input = process_input_topk(input_smile, mask_index)
    np.savetxt('inps.csv', processed_input, delimiter=',', fmt='% s')

    # Save output
    output = read_csv(fdir_ouput)
    processed_output = process_output_topk(input_masked_smile, output, mask_index)
    np.savetxt('preds.csv', processed_output, delimiter=',', fmt='% s')

    # Calculate validity
    get_smiles_from_lbann_tensors(fdir, vocab_file)

    orig_file = read_smiles_csv("gt_smiles.txt")
    pred_file = read_smiles_csv("pred_smiles.txt")
    diff_file = "sd" + "_smiles_metrics.csv"
    final_file = "smile.csv"

    print("Input/pred SMILES file sizes ", len(orig_file), " ", len(pred_file))

    compare_decoded_to_original_smiles(orig_file, pred_file, diff_file)

    ins = pd.read_csv(diff_file, delimiter=",")
    ins = ins.to_numpy()

    process_ins = get_valid(ins, mask_index)
    print("Input/pred SMILES diff file saved to", diff_file)

    np.savetxt(final_file, process_ins, delimiter=',', fmt='% s')

    # np.savetxt writes no header row, so read the file back without header
    # inference; otherwise the first sample is consumed as column names.
    ins = pd.read_csv(final_file, delimiter=",", header=None)
    ins = ins.to_numpy()

    valid, same = get_metric(ins)

    print("Validity % ", valid * 100)
    print("Same % ", same * 100)


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Patch-series residue that shared this physical line with the code above
# (header of the next patch; kept verbatim as comments):
#
# From 6fc19016a7e36dc5659e447919c68d57af030c62 Mon Sep 17 00:00:00 2001
# From: Tuan Nguyen Anh Tran
# Date: Wed, 27 Jul 2022 11:36:29 -0700
# Subject: [PATCH 6/6] Update model config.
--- applications/nlp/Roberta_atom/config.json | 26 +++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/applications/nlp/Roberta_atom/config.json b/applications/nlp/Roberta_atom/config.json index 7d37f79227b..43d59cbaa38 100644 --- a/applications/nlp/Roberta_atom/config.json +++ b/applications/nlp/Roberta_atom/config.json @@ -2,21 +2,25 @@ "architectures": [ "RobertaForMaskedLM" ], - "attention_probs_dropout_prob": 0.1, - "bos_token_id": 0, - "eos_token_id": 2, + "attention_probs_dropout_prob": 0.109, + "bos_token_id": 12, + "eos_token_id": 13, + "gradient_checkpointing": false, "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, + "hidden_dropout_prob": 0.144, + "hidden_size": 384, "initializer_range": 0.02, - "intermediate_size": 3072, - "layer_norm_eps": 1e-05, - "max_position_embeddings": 514, + "intermediate_size": 464, + "is_gpu": true, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 515, "model_type": "roberta", "num_attention_heads": 12, - "num_hidden_layers": 6, - "output_past": true, + "num_hidden_layers": 3, "pad_token_id": 1, + "position_embedding_type": "absolute", + "transformers_version": "4.6.1", "type_vocab_size": 1, - "vocab_size": 767 + "use_cache": true, + "vocab_size": 600 }