From 1e4f99b063c145e2ff151c6b33af1bdeea5ed851 Mon Sep 17 00:00:00 2001
From: Tuan Nguyen Anh Tran <tran71@lassen708.coral.llnl.gov>
Date: Mon, 27 Jun 2022 12:24:59 -0700
Subject: [PATCH 1/6] Add roberta implementation for MLM using SMILES strings.

---
 applications/nlp/Roberta_atom/config.json     |  22 ++
 applications/nlp/Roberta_atom/dataset.py      | 130 ++++++++
 .../nlp/Roberta_atom/get_model_config.py      | 114 +++++++
 applications/nlp/Roberta_atom/main.py         | 311 ++++++++++++++++++
 python/lbann/models/__init__.py               |   2 +-
 python/lbann/models/roberta.py                |  90 ++++-
 python/lbann/modules/base.py                  |  21 +-
 7 files changed, 678 insertions(+), 12 deletions(-)
 create mode 100644 applications/nlp/Roberta_atom/config.json
 create mode 100644 applications/nlp/Roberta_atom/dataset.py
 create mode 100644 applications/nlp/Roberta_atom/get_model_config.py
 create mode 100644 applications/nlp/Roberta_atom/main.py

diff --git a/applications/nlp/Roberta_atom/config.json b/applications/nlp/Roberta_atom/config.json
new file mode 100644
index 00000000000..7d37f79227b
--- /dev/null
+++ b/applications/nlp/Roberta_atom/config.json
@@ -0,0 +1,22 @@
+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "output_past": true,
+  "pad_token_id": 1,
+  "type_vocab_size": 1,
+  "vocab_size": 767
+}
diff --git a/applications/nlp/Roberta_atom/dataset.py b/applications/nlp/Roberta_atom/dataset.py
new file mode 100644
index 00000000000..09dca567863
--- /dev/null
+++ b/applications/nlp/Roberta_atom/dataset.py
@@ -0,0 +1,130 @@
+import numpy as np
+
+bos_index = 0
+eos_index = 2
+pad_index = 1
+ignore_index = -100
+mask_index = 4
+mask_percent = 0.15
+
+sequence_length = 48
+vocab_length = 767
+samples = np.load("/g/g92/tran71/tran71/lbann_new/applications/nlp/Roberta_zinc_base/zinc250k.npy", allow_pickle=True) 
+
+train_samples = samples[:int(samples.size*0.8)]
+val_samples = samples[int(samples.size*0.8):int(samples.size*0.9)]
+test_samples = samples[int(samples.size*0.9):]
+
+
+# Masking samples
+def masking(sample):
+    sample_masked = sample.copy()
+    rand = np.random.uniform(size=(1,sequence_length))
+    replace = (rand < mask_percent) * (sample != bos_index) * (sample != eos_index) * (sample != pad_index) 
+    mask_idx = np.nonzero(replace)[1]
+    for idx in mask_idx:
+        chance = np.random.uniform()
+        if(chance < 0.1): #replace with random character excluding special characters
+            sample_masked[idx] = np.random.randint(5,vocab_length) 
+        elif (0.1 < chance < 0.9): #replace with mask character
+            sample_masked[idx] = mask_index 
+    return sample_masked,mask_idx
+
+# Train sample access functions
+def get_train_sample(index):
+    sample = train_samples[index]
+    if len(sample) < sequence_length:
+        sample = np.concatenate((sample, np.full(sequence_length-len(sample), pad_index)))
+    else:
+        sample = np.resize(sample, sequence_length)
+
+    sample_mask, mask_idx = masking(sample)
+
+    idx = [i for i in range(0,sequence_length)]
+    non_mask_idx = [i for i in idx if (i not in mask_idx)]
+
+    label  = sample.copy()
+
+    label[non_mask_idx] = ignore_index
+
+    sample_all = np.full(3*sequence_length, pad_index, dtype=int)
+    sample_all[0:len(sample)] = sample
+    sample_all[sequence_length:2*sequence_length] = sample_mask
+    sample_all[2*sequence_length:3*sequence_length] = label
+
+    return sample_all
+
+
+# Validation sample access functions
+def get_val_sample(index):
+    sample = val_samples[index]
+    if len(sample) < sequence_length:
+        sample = np.concatenate((sample, np.full(sequence_length-len(sample), pad_index)))
+    else:
+        sample = np.resize(sample, sequence_length)
+
+    mask_idx = np.random.randint(0,sequence_length)
+    #print(mask_idx)
+
+    sample_mask = sample.copy()
+    sample_mask[mask_idx] = 14
+
+    idx = [i for i in range(0,sequence_length)]
+    non_mask_idx = [i for i in idx if (i != mask_idx)]
+    #print(non_mask_idx)
+
+    label  = sample.copy()
+
+    label[non_mask_idx] = ignore_index
+
+    sample_all = np.full(3*sequence_length, pad_index, dtype=int)
+    sample_all[0:len(sample)] = sample
+    sample_all[sequence_length:2*sequence_length] = sample_mask
+    sample_all[2*sequence_length:3*sequence_length] = label
+
+    return sample_all
+
+
+# Test sample access functions
+def get_test_sample(index):
+    sample = test_samples[index]
+    if len(sample) < sequence_length:
+        sample = np.concatenate((sample, np.full(sequence_length-len(sample), pad_index)))
+    else:
+        sample = np.resize(sample, sequence_length)
+
+    mask_idx = np.random.randint(0,sequence_length)
+
+    sample_mask = sample.copy()
+    sample_mask[mask_idx] = 14
+
+    idx = [i for i in range(0,sequence_length)]
+    non_mask_idx = [i for i in idx if (i != mask_idx)]
+
+    label  = sample.copy()
+
+    label[non_mask_idx] = ignore_index
+
+    sample_all = np.full(3*sequence_length, pad_index, dtype=int)
+    sample_all[0:len(sample)] = sample
+    sample_all[sequence_length:2*sequence_length] = sample_mask
+    sample_all[2*sequence_length:3*sequence_length] = label
+
+    return sample_all
+
+def num_train_samples():
+    return train_samples.shape[0]
+
+def num_val_samples():
+    return val_samples.shape[0]
+
+def num_test_samples():
+    return test_samples.shape[0]  # size of the test split, not the validation split
+
+def sample_dims():
+    return (3*sequence_length+1,)
+
+def vocab_size():
+    return 767
+
+
diff --git a/applications/nlp/Roberta_atom/get_model_config.py b/applications/nlp/Roberta_atom/get_model_config.py
new file mode 100644
index 00000000000..fca0bd5d50b
--- /dev/null
+++ b/applications/nlp/Roberta_atom/get_model_config.py
@@ -0,0 +1,114 @@
+import sys
+import os
+import warnings
+import itertools
+import time
+import glob
+import urllib.request
+import argparse
+
+import numpy as np
+import torch
+
+files = {
+    "config.json": "https://huggingface.co/seyonec/ChemBERTa-zinc-base-v1/resolve/main/config.json",
+    "pytorch_model.bin": "https://huggingface.co/seyonec/ChemBERTa-zinc-base-v1/resolve/main/pytorch_model.bin",
+}
+weights_dir = "pretrained_weights"
+
+
+def download_file(url, fn):
+    """Download `url` to the local path `fn`, printing a progress bar; skip if `fn` exists."""
+    def report_hook(count, block_size, total_size):
+        duration = int(time.time() - start_time)
+        progress_size = int(count * block_size / (1024 ** 2))
+        # total_size is <= 0 when the server reports no length; never divide by it.
+        percent = min(int(count * block_size * 100 / total_size), 100) if total_size > 0 else 0
+        prog_bar = "|" + "#" * int(percent / 2) + "-" * (50 - int(percent / 2)) + "|"
+        sys.stdout.write(f"\r{prog_bar} {percent}%, {progress_size} MB, {duration}s elapsed")
+        sys.stdout.flush()
+
+    if os.path.exists(fn):
+        warnings.warn(f"File '{fn}' already exists, skipping download")
+    else:
+        print(f"\n\nDownloading {fn} from {url}\n")
+        start_time = time.time()
+        urllib.request.urlretrieve(url, fn, report_hook)
+
+
+def extract_weights(model, weights_dir):
+    for name, weights in model.items():
+        weights = np.array(weights).astype(np.float32)
+        np.save(f"./{weights_dir}/{name}.npy", weights)
+
+
+def process_weights(weights_dir):
+    # Combine layernorm weights and bias to single file
+    layernorm_files = glob.glob(f"./{weights_dir}/*LayerNorm*.npy")
+    layernorm_groups = {}
+    for fn in layernorm_files:
+        base_fn = fn.split(".LayerNorm")[0]
+        if base_fn in layernorm_groups:
+            layernorm_groups[base_fn].append(fn)
+        else:
+            layernorm_groups[base_fn] = [fn]
+
+    for base_fn, fns in layernorm_groups.items():
+        weight_fn = [fn for fn in fns if "weight.npy" in fn][0]
+        bias_fn = [fn for fn in fns if "bias.npy" in fn][0]
+
+        weight_bias_vals = np.stack([np.load(weight_fn), np.load(bias_fn)]).T.copy()
+        np.save(f"{base_fn}.layernorm.weightbias.npy", weight_bias_vals)
+
+    # Combine layer_norm weights and bias to single file
+    layer_norm_files = glob.glob(f"./{weights_dir}/*layer_norm*.npy")
+    layer_norm_groups = {}
+    for fn in layer_norm_files:
+        base_fn = fn.split(".layer_norm")[0]
+        if base_fn in layer_norm_groups:
+            layer_norm_groups[base_fn].append(fn)
+        else:
+            layer_norm_groups[base_fn] = [fn]
+
+    for base_fn, fns in layer_norm_groups.items():
+        weight_fn = [fn for fn in fns if "weight.npy" in fn][0]
+        bias_fn = [fn for fn in fns if "bias.npy" in fn][0]
+
+        weight_bias_vals = np.stack([np.load(weight_fn), np.load(bias_fn)]).T.copy()
+        np.save(f"{base_fn}.layer_norm.weightbias.npy", weight_bias_vals)
+
+    # Transpose embedding layer weights
+    embed_files = [
+        glob.glob(f"{weights_dir}/{e}.npy")
+        for e in (
+            "*position_embeddings*",
+            "*token_type_embeddings*",
+            "*word_embeddings*",
+        )
+    ]
+    embed_files = itertools.chain(*embed_files)
+    for fn in embed_files:
+        np.save(fn, np.load(fn).T.copy())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--no-weights', action='store_true', help='avoids downloading model weights')
+    args = parser.parse_args()
+
+    if args.no_weights:
+        del files['pytorch_model.bin']
+
+    """Download model from huggingface"""
+    for fn, url in files.items():
+        download_file(url, fn)
+
+    if not args.no_weights:
+        """ Extract weights """
+        if not os.path.exists(weights_dir):
+            os.makedirs(weights_dir)
+        model = torch.load("pytorch_model.bin", map_location="cpu")
+        extract_weights(model, weights_dir)
+
+        """ Process weights for loading into LBANN """
+        process_weights(weights_dir)
diff --git a/applications/nlp/Roberta_atom/main.py b/applications/nlp/Roberta_atom/main.py
new file mode 100644
index 00000000000..595afe688d7
--- /dev/null
+++ b/applications/nlp/Roberta_atom/main.py
@@ -0,0 +1,311 @@
+from types import SimpleNamespace
+import argparse
+import datetime
+import os
+import sys
+import json
+import numpy as np
+
+import lbann
+import lbann.contrib.args
+import lbann.contrib.launcher
+
+from lbann.models import RoBERTaMLM
+
+
+# Local imports
+current_dir = os.path.dirname(os.path.realpath(__file__))
+root_dir = os.path.dirname(current_dir)
+sys.path.append(root_dir)
+import utils.paths
+
+
+
+import dataset
+# Dataset properties
+vocab_size = dataset.vocab_size()
+sequence_length = dataset.sequence_length
+pad_index = dataset.pad_index
+ignore_index = dataset.ignore_index
+
+# ----------------------------------------------
+# Options
+# ----------------------------------------------
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--epochs",
+    default=51,
+    type=int,
+    help="number of epochs to train",
+)
+parser.add_argument(
+    "--mini-batch-size",
+    default=256,
+    type=int,
+    help="size of minibatches for training",
+)
+parser.add_argument(
+    "--job-name",
+    action="store",
+    default="RoBERTa_MLM",
+    type=str,
+    help="scheduler job name",
+    metavar="NAME",
+)
+parser.add_argument(
+    "--work-dir",
+    action="store",
+    default=None,
+    type=str,
+    help="working directory",
+    metavar="DIR",
+)
+parser.add_argument("--batch-job", action="store_true", help="submit as batch job")
+parser.add_argument(
+    "--checkpoint", action="store_true", help="checkpoint trainer after every epoch"
+)
+lbann.contrib.args.add_scheduler_arguments(parser)
+lbann_params = parser.parse_args()
+
+
+
+# ----------------------------------------------
+# Work directory
+# ----------------------------------------------
+
+timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+work_dir = lbann_params.work_dir or os.path.join(  # honor --work-dir; else timestamped default
+    utils.paths.root_dir(),
+    'Roberta_atom/exps',
+    f'{timestamp}_{lbann_params.job_name}',
+)
+os.makedirs(work_dir, exist_ok=True)
+
+
+
+# ----------------------------------------------
+# Data Reader
+# ----------------------------------------------
+def make_data_reader():
+    reader = lbann.reader_pb2.DataReader()
+
+    # Train data reader
+    _reader = reader.reader.add()
+    _reader.name = "python"
+    _reader.role = "train"
+    _reader.shuffle = True
+    _reader.percent_of_data_to_use = 1.0
+    _reader.python.module = "dataset"
+    _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__))
+    _reader.python.sample_function = "get_train_sample"
+    _reader.python.num_samples_function = "num_train_samples"
+    _reader.python.sample_dims_function = "sample_dims"
+
+    # Validation data reader
+    _reader = reader.reader.add()
+    _reader.name = "python"
+    _reader.role = "validate"
+    _reader.shuffle = False
+    _reader.percent_of_data_to_use = 1.0
+    _reader.python.module = "dataset"
+    _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__))
+    _reader.python.sample_function = "get_val_sample"
+    _reader.python.num_samples_function = "num_val_samples"
+    _reader.python.sample_dims_function = "sample_dims"
+
+    # Test data reader
+    _reader = reader.reader.add()
+    _reader.name = "python"
+    _reader.role = "test"
+    _reader.shuffle = False
+    _reader.percent_of_data_to_use = 1.0
+    _reader.python.module = "dataset"
+    _reader.python.module_dir = os.path.dirname(os.path.realpath(__file__))
+    _reader.python.sample_function = "get_test_sample"
+    _reader.python.num_samples_function = "num_test_samples"
+    _reader.python.sample_dims_function = "sample_dims"
+
+    return reader
+
+
+
+# ----------------------------------------------
+# Build and Run Model
+# ----------------------------------------------
+with open("./config.json") as f:
+    config = json.load(f, object_hook=lambda d: SimpleNamespace(**d))
+config.input_shape = (1,sequence_length)
+
+config.load_weights = os.path.abspath('./pretrained_weights')
+
+
+# Construct the model
+
+# Input is 3 sequences of smile string: original string, masked string, label string - every token is -100 (ignore) except the masked token. 
+input_ = lbann.Input(data_field='samples')
+
+
+input_strings = lbann.Identity(lbann.Slice(
+	input_,
+	axis=0,
+	slice_points=[0,sequence_length],
+	name='input_strings'
+))
+
+input_masked = lbann.Identity(lbann.Slice(
+	input_,
+	axis=0,
+	slice_points=[sequence_length,2*sequence_length],
+	name='input_masked'
+))
+
+input_label = lbann.Identity(lbann.Slice(
+	input_,
+	axis=0,
+	slice_points=[2*sequence_length,3*sequence_length],
+	name='input_label'
+))
+
+
+robertamlm = RoBERTaMLM(config,load_weights=config.load_weights)
+output = robertamlm(input_masked)
+
+preds_output = lbann.Identity(output,name='pred')
+
+preds = lbann.ChannelwiseSoftmax(output, name='pred_sm')
+preds = lbann.Slice(preds, axis=1, slice_points=range(sequence_length+1),name='slice_pred')
+preds = [lbann.Identity(preds) for _ in range(sequence_length)]
+
+
+########
+# Loss
+########
+
+# Count number of non-pad tokens
+label_tokens = lbann.Identity(input_label)
+pads = lbann.Constant(value=ignore_index, num_neurons=sequence_length,name='pads')
+
+is_not_pad = lbann.NotEqual(label_tokens, pads,name='is_not_pad')
+num_not_pad = lbann.Reduction(is_not_pad, mode='sum',name='num_not_pad')
+
+# Cross entropy loss 
+label_tokens = lbann.Slice(
+	label_tokens,
+        slice_points=range(sequence_length+1),
+	name='label_tokens',
+    )
+
+label_tokens = [lbann.Identity(label_tokens) for _ in range(sequence_length)]
+
+loss = []
+
+
+for i in range(sequence_length):
+	label = lbann.OneHot(label_tokens[i], size=config.vocab_size)
+	label = lbann.Reshape(label, dims=[config.vocab_size])
+	pred = lbann.Reshape(preds[i], dims=[config.vocab_size]) 
+	loss.append(lbann.CrossEntropy(pred, label))
+
+
+loss = lbann.Concatenation(loss)
+
+
+# Average cross entropy over non-pad tokens
+loss_scales = lbann.SafeDivide(
+	is_not_pad,
+        lbann.Tessellate(num_not_pad, hint_layer=is_not_pad),
+        name = 'loss_scale',
+    )
+loss = lbann.Multiply(loss, loss_scales)
+
+
+obj = lbann.Reduction(loss, mode='sum',name='loss_red')
+
+metrics = [lbann.Metric(obj, name="loss")]
+
+
+###########
+# Callbacks
+###########
+
+callbacks = [lbann.CallbackPrint(),
+             lbann.CallbackTimer(),]
+
+
+callbacks.append(
+	lbann.CallbackDumpOutputs(
+		batch_interval=782,
+		execution_modes='train', 
+		directory=os.path.join(work_dir, 'train_input'),
+		layers='input_strings')
+    )
+
+
+callbacks.append(
+	lbann.CallbackDumpOutputs(
+		batch_interval=782,
+		execution_modes='train', 
+		directory=os.path.join(work_dir, 'train_output'),
+		layers='pred ')
+    )
+
+callbacks.append(
+	lbann.CallbackDumpOutputs(
+		batch_interval=50,
+		execution_modes='test', 
+		directory=os.path.join(work_dir, 'test_input'),
+		layers='input_strings')
+    )
+
+callbacks.append(
+	lbann.CallbackDumpOutputs(
+		batch_interval=50,
+		execution_modes='test', 
+		directory=os.path.join(work_dir, 'test_output'),
+		layers='pred')
+    )
+
+
+callbacks.append(
+	lbann.CallbackDumpWeights(
+		directory=os.path.join(work_dir, 'weights'),
+		epoch_interval=1,
+	)
+    )
+
+
+model = lbann.Model(
+    lbann_params.epochs,
+    layers=lbann.traverse_layer_graph(input_),
+    objective_function=obj,
+    metrics=metrics,
+    callbacks=callbacks,
+)
+
+# Setup trainer, optimizer, data_reader
+trainer = lbann.Trainer(
+    mini_batch_size=lbann_params.mini_batch_size,
+    num_parallel_readers=1,
+)
+optimizer = lbann.Adam(
+    learn_rate=0.0001,
+    beta1=0.9,
+    beta2=0.98,
+    eps=1e-8,
+)
+data_reader = make_data_reader()
+
+# Launch LBANN
+kwargs = lbann.contrib.args.get_scheduler_kwargs(lbann_params)
+kwargs["environment"] = {}
+lbann.contrib.launcher.run(
+    trainer,
+    model,
+    data_reader,
+    optimizer,
+    work_dir=work_dir,
+    job_name=lbann_params.job_name,
+    lbann_args=["--num_io_threads=1"],
+    batch_job=lbann_params.batch_job,
+    **kwargs,
+)
diff --git a/python/lbann/models/__init__.py b/python/lbann/models/__init__.py
index 80d84d536c0..7281cb85453 100644
--- a/python/lbann/models/__init__.py
+++ b/python/lbann/models/__init__.py
@@ -1,5 +1,5 @@
 from lbann.models.alexnet import AlexNet
 from lbann.models.lenet import LeNet
 from lbann.models.resnet import ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152
-from lbann.models.roberta import RoBERTa
+from lbann.models.roberta import RoBERTa, RoBERTaMLM
 from lbann.models.transformer import Transformer, TransformerEncoderLayer, TransformerDecoderLayer
diff --git a/python/lbann/models/roberta.py b/python/lbann/models/roberta.py
index b00d933b4cd..72cfd0b4eb5 100644
--- a/python/lbann/models/roberta.py
+++ b/python/lbann/models/roberta.py
@@ -16,6 +16,7 @@
 }
 
 
+
 def create_position_ids_from_input_ids(
     input_ids, input_shape, padding_idx, past_key_values_length=0
 ):
@@ -58,7 +59,6 @@ def _load_pretrained_weights(
         weights = weights[0]
     return weights
 
-
 class RobertaEmbeddings(lbann.modules.Module):
     def __init__(self, config, name, load_weights=True):
         super().__init__()
@@ -627,3 +627,91 @@ def forward(
             return pooled_output
         else:
             return encoder_output
+
+class RobertaLMHead(lbann.modules.Module):
+    """Roberta Head for masked language modeling."""
+
+    def __init__(self, config, name,load_weights=True):
+        super().__init__()
+        self.config = config
+        # A custom directory can be passed instead of True/False
+        if isinstance(load_weights, str):
+            if not os.path.isdir(load_weights):
+                raise ValueError(f"Path to pretrained weights does not exist: {load_weights}")
+
+        self.input_shape = config.input_shape + (config.hidden_size,)
+        self.hidden_size = config.hidden_size
+        self.vocab_size = config.vocab_size
+        self.hidden_dropout_prob = config.hidden_dropout_prob
+        self.layer_norm_eps = config.layer_norm_eps
+        self.name = name
+        self.load_weights = load_weights
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, input_tensor):
+        """Chain dense -> activation -> layer norm -> decoder; the output has vocab_size features."""
+        # x = self.dense(features)
+        hidden_states, hidden_shape = lbann.modules.PytorchLinear(
+            input_tensor,
+            self.input_shape,
+            self.hidden_size,
+            weights=_load_pretrained_weights(
+                ".".join((self.name, "dense.weight")),
+                ".".join((self.name, "dense.bias")),
+                load_weights=self.load_weights,
+            ),
+            name=".".join((self.name, "dense")),
+            return_dims=True,
+        )
+
+        # x = act(x), the activation selected by config.hidden_act
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        # x = self.layer_norm(x): normalizes the activation alone, no residual add
+        hidden_states = lbann.modules.PytorchLayerNorm(
+            hidden_states,
+            self.layer_norm_eps,
+            hidden_shape,
+            weights=_load_pretrained_weights(
+                ".".join((self.name, "layer_norm.weightbias")),
+                load_weights=self.load_weights,
+            ),
+            name=".".join((self.name, "layer_norm")),
+        )
+
+        # x = self.decoder(x): project the normalized states (not the raw input) to vocab_size
+        hidden_states, hidden_shape = lbann.modules.PytorchLinear(
+            hidden_states,
+            hidden_shape,
+            self.vocab_size,
+            weights=_load_pretrained_weights(
+                ".".join((self.name, "decoder.weight")),
+                ".".join((self.name, "decoder.bias")),
+                load_weights=self.load_weights,
+            ),
+            name=".".join((self.name, "decoder")),
+            return_dims=True,
+        )
+
+        return hidden_states
+
+class RoBERTaMLM(lbann.modules.Module):
+    def __init__(self, config, load_weights=True):
+
+        # A custom directory can be passed instead of True/False
+        if isinstance(load_weights, str):
+            if not os.path.isdir(load_weights):
+                raise ValueError(f"Path to pretrained weights does not exist: {load_weights}")        
+                
+        self.roberta = RoBERTa(config, add_pooling_layer=False, load_weights=load_weights)
+        self.lm_head = RobertaLMHead(config, "lm_head",load_weights=load_weights)
+        
+    def forward(self,input_ids):
+            
+        output = self.roberta(input_ids)
+        output = self.lm_head(output)
+    
+        return output
diff --git a/python/lbann/modules/base.py b/python/lbann/modules/base.py
index 92bae3a28e9..4338c9524bb 100644
--- a/python/lbann/modules/base.py
+++ b/python/lbann/modules/base.py
@@ -64,8 +64,8 @@ def __init__(self,
             weights (`Weights` or iterator of `Weights`): Weights in
                 fully-connected layer. There are at most two: the
                 matrix and the bias. If weights are not provided, the
-                matrix will be initialized with He normal
-                initialization and the bias with zeros.
+                matrix will be initialized with Lecun uniform
+                initialization and the bias with Lecun uniform.
             name (str): Default name is in the form 'fcmodule<index>'.
             data_layout (str): Data layout.
             parallel_strategy (dict): Data partitioning scheme.
@@ -86,8 +86,8 @@ def __init__(self,
 
         # Initialize weights
         # Note: If weights are not provided, matrix weights are
-        # initialized with He normal scheme and bias weights are
-        # initialized with zeros.
+        # initialized with Lecun uniform scheme and bias weights are
+        # initialized with Lecun uniform scheme.
         self.weights = list(make_iterable(weights))
         if len(self.weights) > 2:
             raise ValueError('`FullyConnectedModule` has '
@@ -95,11 +95,11 @@ def __init__(self,
                              'but got {0}'.format(len(self.weights)))
         if len(self.weights) == 0:
             self.weights.append(
-                lbann.Weights(initializer=lbann.HeNormalInitializer(),
+                 lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
                               name=self.name+'_matrix'))
         if self.bias and len(self.weights) == 1:
             self.weights.append(
-                lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
+                lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
                               name=self.name+'_bias'))
 
         # Initialize activation layer
@@ -158,8 +158,9 @@ def __init__(self,
         weights (`Weights` or iterator of `Weights`): Weights in
                 fully-connected layer. There are at most two: the
                 matrix and the bias. If weights are not provided, the
-                matrix will be initialized with He normal
-                initialization and the bias with zeros.
+                matrix will be initialized with Lecun uniform 
+                initialization and the bias Lecun uniform 
+                initialization.
         activation (type): Layer class for activation function.
         name (str): Default name is in the form 'channelwisefc<index>'.
         parallel_strategy (dict): Data partitioning scheme.
@@ -183,11 +184,11 @@ def __init__(self,
                          'but got {0}'.format(len(self.weights)))
     if len(self.weights) == 0:
         self.weights.append(
-            lbann.Weights(initializer=lbann.HeNormalInitializer(),
+             lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
                           name=self.name+'_matrix'))
     if self.bias and len(self.weights) == 1:
         self.weights.append(
-            lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
+            lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
                           name=self.name+'_bias'))
     self.activation = None
     if activation:

From 0811233c174ab8a6429660b18b33def4d56ad7e1 Mon Sep 17 00:00:00 2001
From: Tuan Nguyen Anh Tran <tran71@lassen709.coral.llnl.gov>
Date: Mon, 27 Jun 2022 12:41:02 -0700
Subject: [PATCH 2/6] Add re-tokenizing script.

---
 applications/nlp/Roberta_atom/re_tokenize.py | 53 ++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 applications/nlp/Roberta_atom/re_tokenize.py

diff --git a/applications/nlp/Roberta_atom/re_tokenize.py b/applications/nlp/Roberta_atom/re_tokenize.py
new file mode 100644
index 00000000000..3207e98d030
--- /dev/null
+++ b/applications/nlp/Roberta_atom/re_tokenize.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pandas as pd
+
+from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline
+
+'''
+Re-tokenize SMILE string using Huggingface tokenizers.
+
+'''
+
+model = AutoModelForMaskedLM.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
+tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
+
+  
+def detokenize(inp,vocab):
+  '''
+  Convert token ids `inp` to a SMILES string via `vocab` (token -> id); stop at <eos>, drop '<...>' tokens.
+  '''
+  id_to_token = {v: k for k, v in reversed(list(vocab.items()))}  # built once; first key wins per id
+  pieces = []
+  for i in inp:
+    token = id_to_token[int(i)]
+    if token == '<eos>':
+      break
+    if token[0] != '<':
+      pieces.append(token)
+  return "".join(pieces)
+
+
+samples = np.load("moses_zinc_train250K.npy", allow_pickle=True) 
+
+vocab_file = "vocab_train.txt"
+
+vocab = pd.read_csv(vocab_file, delimiter=" ", header=None, quoting=3).to_dict()[0]
+vocab = dict([(v,k) for k,v in vocab.items()])
+
+
+samples = [detokenize(i_x,vocab) for i_x in samples] 
+
+print(len(samples))
+
+smiles_tokenized=[]
+
+for s in samples:
+    tokenize = tokenizer.encode(s)
+    del tokenize[-2] # remove extra character before <eos>
+    smiles_tokenized.append(tokenize)
+
+print(len(smiles_tokenized))
+    
+outfile = "zinc250k.npy"
+
+np.save(outfile, smiles_tokenized)

From f149e8226698460ad4deecaf9733922ea98618f1 Mon Sep 17 00:00:00 2001
From: Tuan Nguyen Anh Tran <tran71@lassen709.coral.llnl.gov>
Date: Wed, 29 Jun 2022 13:11:48 -0700
Subject: [PATCH 3/6] Updating masking function based on HuggingFace
 implementation.

---
 applications/nlp/Roberta_atom/dataset.py | 83 ++++++++++--------------
 1 file changed, 36 insertions(+), 47 deletions(-)

diff --git a/applications/nlp/Roberta_atom/dataset.py b/applications/nlp/Roberta_atom/dataset.py
index 09dca567863..483ff6bc3bf 100644
--- a/applications/nlp/Roberta_atom/dataset.py
+++ b/applications/nlp/Roberta_atom/dataset.py
@@ -16,19 +16,37 @@
 test_samples = samples[int(samples.size*0.9):]
 
 
+
 # Masking samples
-def masking(sample):
-    sample_masked = sample.copy()
-    rand = np.random.uniform(size=(1,sequence_length))
-    replace = (rand < mask_percent) * (sample != bos_index) * (sample != eos_index) * (sample != pad_index) 
-    mask_idx = np.nonzero(replace)[1]
-    for idx in mask_idx:
-        chance = np.random.uniform()
-        if(chance < 0.1): #replace with random character excluding special characters
-            sample_masked[idx] = np.random.randint(5,vocab_length) 
-        elif (0.1 < chance < 0.9): #replace with mask character
-            sample_masked[idx] = mask_index 
-    return sample_masked,mask_idx
+'''
+https://github.com/huggingface/transformers/blob/v4.20.1/src/transformers/data/data_collator.py#L805
+'''
+def masking(sample, mlm_probability = 0.15):
+
+  masked = np.copy(sample)
+  label = np.copy(sample) 
+
+  special_tokens_mask = (sample == bos_index) + (sample == eos_index) + (sample == pad_index) 
+  
+  probability_matrix = np.full(sample.shape, mlm_probability)
+
+  probability_matrix[special_tokens_mask] = 0
+
+  masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool)
+
+  label[~masked_indices] = ignore_index
+
+  indices_replaced = np.random.binomial(1, 0.8, size=sample.shape).astype(bool) & masked_indices
+
+  masked[indices_replaced] = mask_index
+
+  indices_random = (np.random.binomial(1, 0.5, size=sample.shape).astype(bool) & masked_indices & ~indices_replaced)
+
+  random_words = np.random.randint(low=5, high=vocab_length, size=np.count_nonzero(indices_random), dtype=np.int64)
+
+  masked[indices_random] = random_words
+
+  return sample,masked,label
 
 # Train sample access functions
 def get_train_sample(index):
@@ -38,18 +56,11 @@ def get_train_sample(index):
     else:
         sample = np.resize(sample, sequence_length)
 
-    sample_mask, mask_idx = masking(sample)
-
-    idx = [i for i in range(0,sequence_length)]
-    non_mask_idx = [i for i in idx if (i not in mask_idx)]
-
-    label  = sample.copy()
-
-    label[non_mask_idx] = ignore_index
+    sample,masked,label = masking(sample)
 
     sample_all = np.full(3*sequence_length, pad_index, dtype=int)
     sample_all[0:len(sample)] = sample
-    sample_all[sequence_length:2*sequence_length] = sample_mask
+    sample_all[sequence_length:2*sequence_length] = masked
     sample_all[2*sequence_length:3*sequence_length] = label
 
     return sample_all
@@ -63,23 +74,11 @@ def get_val_sample(index):
     else:
         sample = np.resize(sample, sequence_length)
 
-    mask_idx = np.random.randint(0,sequence_length)
-    #print(mask_idx)
-
-    sample_mask = sample.copy()
-    sample_mask[mask_idx] = 14
-
-    idx = [i for i in range(0,sequence_length)]
-    non_mask_idx = [i for i in idx if (i != mask_idx)]
-    #print(non_mask_idx)
-
-    label  = sample.copy()
-
-    label[non_mask_idx] = ignore_index
+    sample,masked,label = masking(sample)
 
     sample_all = np.full(3*sequence_length, pad_index, dtype=int)
     sample_all[0:len(sample)] = sample
-    sample_all[sequence_length:2*sequence_length] = sample_mask
+    sample_all[sequence_length:2*sequence_length] = masked
     sample_all[2*sequence_length:3*sequence_length] = label
 
     return sample_all
@@ -93,21 +92,11 @@ def get_test_sample(index):
     else:
         sample = np.resize(sample, sequence_length)
 
-    mask_idx = np.random.randint(0,sequence_length)
-
-    sample_mask = sample.copy()
-    sample_mask[mask_idx] = 14
-
-    idx = [i for i in range(0,sequence_length)]
-    non_mask_idx = [i for i in idx if (i != mask_idx)]
-
-    label  = sample.copy()
-
-    label[non_mask_idx] = ignore_index
+    sample,masked,label = masking(sample)
 
     sample_all = np.full(3*sequence_length, pad_index, dtype=int)
     sample_all[0:len(sample)] = sample
-    sample_all[sequence_length:2*sequence_length] = sample_mask
+    sample_all[sequence_length:2*sequence_length] = masked
     sample_all[2*sequence_length:3*sequence_length] = label
 
     return sample_all

From 4fd136f98d85bd773e2dd2df0ac3028cabbc55cd Mon Sep 17 00:00:00 2001
From: Tuan Nguyen Anh Tran <tran71@lassen708.coral.llnl.gov>
Date: Tue, 26 Jul 2022 16:35:12 -0700
Subject: [PATCH 4/6] Fix weights initialization. Update datareader.

---
 applications/nlp/Roberta_atom/dataset.py | 64 ++++++++++++++++++------
 python/lbann/modules/base.py             | 21 ++++----
 2 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/applications/nlp/Roberta_atom/dataset.py b/applications/nlp/Roberta_atom/dataset.py
index 483ff6bc3bf..71b27997196 100644
--- a/applications/nlp/Roberta_atom/dataset.py
+++ b/applications/nlp/Roberta_atom/dataset.py
@@ -1,22 +1,22 @@
 import numpy as np
 
-bos_index = 0
-eos_index = 2
-pad_index = 1
+bos_index = 12
+eos_index = 13
+pad_index = 0
 ignore_index = -100
-mask_index = 4
+mask_index = 14
 mask_percent = 0.15
 
-sequence_length = 48
-vocab_length = 767
-samples = np.load("/g/g92/tran71/tran71/lbann_new/applications/nlp/Roberta_zinc_base/zinc250k.npy", allow_pickle=True) 
+sequence_length = 57
+vocab_length = 600
+samples = np.load("/g/g92/tran71/ChemBERTa-77M-MLM_smile2M.npy", allow_pickle=True) 
+
 
 train_samples = samples[:int(samples.size*0.8)]
 val_samples = samples[int(samples.size*0.8):int(samples.size*0.9)]
 test_samples = samples[int(samples.size*0.9):]
 
 
-
 # Masking samples
 '''
 https://github.com/huggingface/transformers/blob/v4.20.1/src/transformers/data/data_collator.py#L805
@@ -83,23 +83,56 @@ def get_val_sample(index):
 
     return sample_all
 
-
-# Test sample access functions
+# Testing sample access functions
 def get_test_sample(index):
     sample = test_samples[index]
+    rand_size = np.random.randint(1, 4)
+
+    if len(sample) < sequence_length:
+        mask_idx = sorted(np.random.randint(1,len(sample)-1,size=rand_size))
+    else:
+        mask_idx = sorted(np.random.randint(1,sequence_length-1,size=rand_size))
+
+
+    sample_mask = sample.copy()
+
+    op = np.random.randint(0, 3)
+
+    if(op == 0): # Delete augmentation
+        for idx in mask_idx:
+            if(idx < len(sample_mask)-2): # Don't mask the last character for deletion.          
+                sample_mask[idx] = mask_index   
+
+        del_count = 0            
+        for idx in mask_idx:
+            if(idx < len(sample_mask)-1): # Don't delete EoS.
+                sample_mask = np.delete(sample_mask, idx+1+del_count)
+                del_count -= 1           
+    elif(op == 1): # Insert augmentation
+        for idx in mask_idx:
+            sample_mask = np.insert(sample_mask, idx, mask_index)     
+    else: # Replace augmentation
+        for idx in mask_idx:
+            sample_mask[idx] = mask_index    
+            
     if len(sample) < sequence_length:
         sample = np.concatenate((sample, np.full(sequence_length-len(sample), pad_index)))
+
     else:
         sample = np.resize(sample, sequence_length)
 
-    sample,masked,label = masking(sample)
+    if len(sample_mask) < sequence_length:
+        sample_mask = np.concatenate((sample_mask, np.full(sequence_length-len(sample_mask), pad_index)))
+
+    else:
+        sample_mask = np.resize(sample_mask, sequence_length)
 
     sample_all = np.full(3*sequence_length, pad_index, dtype=int)
     sample_all[0:len(sample)] = sample
-    sample_all[sequence_length:2*sequence_length] = masked
-    sample_all[2*sequence_length:3*sequence_length] = label
+    sample_all[sequence_length:2*sequence_length] = sample_mask
+    sample_all[2*sequence_length:3*sequence_length] = sample
 
-    return sample_all
+    return sample_all 
 
 def num_train_samples():
     return train_samples.shape[0]
@@ -114,6 +147,5 @@ def sample_dims():
     return (3*sequence_length+1,)
 
 def vocab_size():
-    return 767
-
+    return vocab_length
 
diff --git a/python/lbann/modules/base.py b/python/lbann/modules/base.py
index 4338c9524bb..92bae3a28e9 100644
--- a/python/lbann/modules/base.py
+++ b/python/lbann/modules/base.py
@@ -64,8 +64,8 @@ def __init__(self,
             weights (`Weights` or iterator of `Weights`): Weights in
                 fully-connected layer. There are at most two: the
                 matrix and the bias. If weights are not provided, the
-                matrix will be initialized with Lecun uniform
-                initialization and the bias with Lecun uniform.
+                matrix will be initialized with He normal
+                initialization and the bias with zeros.
             name (str): Default name is in the form 'fcmodule<index>'.
             data_layout (str): Data layout.
             parallel_strategy (dict): Data partitioning scheme.
@@ -86,8 +86,8 @@ def __init__(self,
 
         # Initialize weights
         # Note: If weights are not provided, matrix weights are
-        # initialized with Lecun uniform scheme and bias weights are
-        # initialized with Lecun uniform scheme.
+        # initialized with He normal scheme and bias weights are
+        # initialized with zeros.
         self.weights = list(make_iterable(weights))
         if len(self.weights) > 2:
             raise ValueError('`FullyConnectedModule` has '
@@ -95,11 +95,11 @@ def __init__(self,
                              'but got {0}'.format(len(self.weights)))
         if len(self.weights) == 0:
             self.weights.append(
-                 lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
+                lbann.Weights(initializer=lbann.HeNormalInitializer(),
                               name=self.name+'_matrix'))
         if self.bias and len(self.weights) == 1:
             self.weights.append(
-                lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
+                lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
                               name=self.name+'_bias'))
 
         # Initialize activation layer
@@ -158,9 +158,8 @@ def __init__(self,
         weights (`Weights` or iterator of `Weights`): Weights in
                 fully-connected layer. There are at most two: the
                 matrix and the bias. If weights are not provided, the
-                matrix will be initialized with Lecun uniform 
-                initialization and the bias Lecun uniform 
-                initialization.
+                matrix will be initialized with He normal
+                initialization and the bias with zeros.
         activation (type): Layer class for activation function.
         name (str): Default name is in the form 'channelwisefc<index>'.
         parallel_strategy (dict): Data partitioning scheme.
@@ -184,11 +183,11 @@ def __init__(self,
                          'but got {0}'.format(len(self.weights)))
     if len(self.weights) == 0:
         self.weights.append(
-             lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
+            lbann.Weights(initializer=lbann.HeNormalInitializer(),
                           name=self.name+'_matrix'))
     if self.bias and len(self.weights) == 1:
         self.weights.append(
-            lbann.Weights(initializer=lbann.LeCunUniformInitializer(),
+            lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
                           name=self.name+'_bias'))
     self.activation = None
     if activation:

From e7a922c0f1658db2ef2743aa6253479cd59b3fcf Mon Sep 17 00:00:00 2001
From: Tuan Nguyen Anh Tran <tran71@lassen708.coral.llnl.gov>
Date: Wed, 27 Jul 2022 11:34:21 -0700
Subject: [PATCH 5/6] Add evaluation script.

---
 applications/nlp/Roberta_atom/evaluation.py | 290 ++++++++++++++++++++
 1 file changed, 290 insertions(+)
 create mode 100644 applications/nlp/Roberta_atom/evaluation.py

diff --git a/applications/nlp/Roberta_atom/evaluation.py b/applications/nlp/Roberta_atom/evaluation.py
new file mode 100644
index 00000000000..9c827cd650b
--- /dev/null
+++ b/applications/nlp/Roberta_atom/evaluation.py
@@ -0,0 +1,290 @@
+import numpy as np
+import pandas as pd
+import sys
+import glob
+import torch
+import itertools
+
+from rdkit import Chem
+from rdkit import DataStructs
+from rdkit.Chem import AllChem
+from rdkit import RDLogger
+
+
+seq_length = 57
+vocab_length = 600
+  
+def detokenize(inp,vocab):
+  output = ""
+  for i in inp:
+    token = list(vocab.keys())[list(vocab.values()).index(int(i))]
+
+    if(token =='[SEP]'):
+      break	
+    if(token !='[CLS]' and token !='[PAD]'):
+      output = output+token
+
+  return output
+
+def softmax(x):
+    """Return exp(x) divided by its sum along axis 0 (softmax over axis 0)."""
+    return np.exp(x) / np.sum(np.exp(x), axis=0)
+
+
+def read_csv(fdir):
+
+	input_files = glob.glob(fdir+"*_output0.csv")
+
+	if(len(input_files)> 2):
+		ins = np.concatenate((np.loadtxt(f,delimiter=",")) for f in input_files)
+	else:
+		ins = np.loadtxt(input_files[0], delimiter=",")
+
+	return ins
+
+def get_masked_index(array,mask_value):
+	mask_index = []
+	for i in range(array.shape[0]):
+		index = np.where((array[i]==mask_value)==True)[0]
+		mask_index.append(index)
+
+	return mask_index 
+
+def process_output_topk(input_mask,output,mask_index, k=5):
+    num_samples = input_mask.shape[0]
+    process_output = []
+    for j in range(num_samples):
+        all_preds = []
+        for i in mask_index[j]:
+            preds = torch.from_numpy(output[j,vocab_length*i:vocab_length*(i+1)])
+            _,preds = torch.topk(preds, k)
+            all_preds.append(preds.numpy())
+ 
+        replace_char = list(itertools.product(*all_preds))
+
+        for chars in replace_char:
+            preds_sm = input_mask[j].copy()
+            for (idx,val) in zip(mask_index[j], chars):
+                preds_sm[idx] = val
+            process_output.append(preds_sm)
+
+    return process_output 
+
+def process_input_topk(input,mask_index, k=5):
+    process_input = []
+    num_samples = input.shape[0]
+    for i in range(num_samples):
+        for j in range(k**len(mask_index[i])):
+            process_input.append(input[i])
+         
+    return process_input
+
+
+def get_smiles_from_lbann_tensors(fdir, vocab_path):
+
+
+  ###################
+  # First input files 
+  ###################
+ 
+  input_files = glob.glob(fdir+"inps.csv")
+
+  ins = np.loadtxt(input_files[0], delimiter=",")
+  for i, f in enumerate(input_files):
+    if(i > 0) :
+      ins = np.concatenate((ins, np.loadtxt(f,delimiter=",")))
+
+  num_cols = ins.shape[1]
+  num_samples = ins.shape[0]
+
+  vocab = pd.read_csv(vocab_file, delimiter=" ", header=None, quoting=3).to_dict()[0]
+  vocab = dict([(v,k) for k,v in vocab.items()])
+
+  samples = [detokenize(i_x,vocab) for i_x in ins[:,0:]] 
+
+
+  samples = pd.DataFrame(samples, columns=['SMILES'])
+
+  samples.to_csv("gt_"+"smiles.txt", index=False)
+
+  ####################
+  # Second input files 
+  ####################
+
+  input_files = glob.glob(fdir+"preds.csv")
+
+  ins = np.loadtxt(input_files[0], delimiter=",")
+  for i, f in enumerate(input_files):
+    if(i > 0) :
+      ins = np.concatenate((ins, np.loadtxt(f,delimiter=",")))
+
+  num_cols = ins.shape[1]
+  num_samples = ins.shape[0]
+
+  vocab = pd.read_csv(vocab_file, delimiter=" ", header=None, quoting=3).to_dict()[0]
+  vocab = dict([(v,k) for k,v in vocab.items()])
+
+  samples = [detokenize(i_x,vocab) for i_x in ins[:,0:]] 
+
+  samples = pd.DataFrame(samples, columns=['SMILES'])
+
+  samples.to_csv("pred_"+"smiles.txt", index=False)
+          
+def compare_decoded_to_original_smiles(orig_smiles, decoded_smiles, output_file=None):
+    """
+    Compare decoded to original SMILES strings and output a table of Tanimoto distances, along with
+    binary flags for whether the strings are the same and whether the decoded string is valid SMILES.
+    orig_smiles and decoded_smiles are lists or arrays of strings.
+    If an output file name is provided, the table will be written to it as a CSV file.
+    Returns the table as a DataFrame.
+
+    """
+    res_df = pd.DataFrame(dict(original=orig_smiles, decoded=decoded_smiles))
+    is_valid = []
+    is_same = []
+    tani_dist = []
+    accuracy = []
+    count = 0
+    data_size = len(orig_smiles)
+    for row in res_df.itertuples():
+        count = count + 1
+        #compute char by char accuracy
+        hit = 0
+        for x, y in zip(row.original, row.decoded):
+            if x == y:
+                hit = hit+1
+        accuracy.append((hit/len(row.original))*100)
+
+        is_same.append(int(row.decoded == row.original))
+        orig_mol = Chem.MolFromSmiles(row.original)
+        if orig_mol is None:
+          #Note, input may be invalid, if original SMILE string is truncated 
+          is_valid.append('x')
+          tani_dist.append(-1)
+          continue
+        dec_mol = Chem.MolFromSmiles(row.decoded)
+        RDLogger.DisableLog('rdApp.*')
+        if dec_mol is None:
+            is_valid.append(0)
+            tani_dist.append(1)
+        else:
+            is_valid.append(1)
+            orig_fp = AllChem.GetMorganFingerprintAsBitVect(orig_mol, 2, 1024)
+            dec_fp = AllChem.GetMorganFingerprintAsBitVect(dec_mol, 2, 1024)
+            tani_sim = DataStructs.FingerprintSimilarity(orig_fp, dec_fp, metric=DataStructs.TanimotoSimilarity)
+            tani_dist.append(1.0 - tani_sim)
+    res_df['is_valid'] = is_valid
+    res_df['is_same'] = is_same
+    res_df['smile_accuracy'] = accuracy
+    res_df['tanimoto_distance'] = tani_dist
+    global_acc  = np.mean(np.array(accuracy))
+    res_df['total_avg_accuracy'] = [global_acc]*len(accuracy)
+    
+    #print("Mean global accuracy % ", global_acc)
+    #print("Validity % ", (is_valid.count(1)/data_size)*100)
+    #print("Same % ", (is_same.count(1)/data_size)*100)
+    valid_tani_dist = [ t for t in tani_dist if t >= 0 ] 
+    #print("Average tanimoto ", np.mean(np.array(valid_tani_dist)))
+    
+
+    if output_file is not None:
+        output_columns = ['original', 'decoded', 'is_valid', 'is_same', 'smile_accuracy','tanimoto_distance','total_avg_accuracy']
+        res_df.to_csv(output_file, index=False, columns=output_columns)
+    return(res_df)
+
+def read_smiles_csv(path):
+    return pd.read_csv(path,usecols=['SMILES'],
+                       squeeze=True).astype(str).tolist()
+
+def get_valid_single_string(ins):
+    for i in range(len(ins)):
+        if(ins[i,2] == 'x'):    
+            return ins[i]
+        elif(ins[i,2] == '1'):     
+            return ins[i]        
+        
+    return ins[0]
+
+def get_valid(ins,mask_index,k=5):
+    process_ins = []
+    num_samples = len(mask_index)
+    min_index = 0
+    for i in range(num_samples):
+        max_index = min_index + k**len(mask_index[i])
+        string = get_valid_single_string(ins[min_index:max_index])
+        process_ins.append(string)
+        min_index = max_index       
+  
+    return process_ins
+
+def get_metric(ins):
+ 
+    valid = ins[:,2].flatten()
+    same = ins[:,3].flatten()
+
+    total_acc = 0
+    total_same = 0
+    x_value = 0
+
+    for i in range(len(valid)):
+        if(valid[i] == '1'):
+            total_acc += 1
+        elif(valid[i] == 'x'):   
+            x_value +=1        
+        if(same[i] == 1):
+            total_same += 1                
+        
+    return total_acc/(len(valid)-x_value), total_same/(len(same)-x_value) 
+
+# Files directory
+fdir_ouput='output/'
+fdir_input='input/'
+fdir_mask='mask/'
+fdir=''
+vocab_file= 'vocab_600.txt'
+
+
+# Get masked smile
+mask = 14 
+input_masked_smile = read_csv(fdir_mask)
+mask_index = get_masked_index(input_masked_smile,mask)
+
+# Save input
+input_smile = read_csv(fdir_input)
+processed_input = process_input_topk(input_smile,mask_index)
+np.savetxt('inps.csv', processed_input, delimiter=',', fmt ='% s')
+
+# Save output
+output = read_csv(fdir_ouput)
+processed_output = process_output_topk(input_masked_smile,output,mask_index)
+np.savetxt('preds.csv', processed_output, delimiter=',', fmt ='% s')
+
+# Calculate validity
+get_smiles_from_lbann_tensors(fdir, vocab_file)
+
+orig_file = read_smiles_csv("gt_smiles.txt")
+pred_file = read_smiles_csv("pred_smiles.txt")
+diff_file = "sd"+"_smiles_metrics.csv"
+final_file = "smile.csv"
+
+print("Input/pred SMILES file sizes ", len(orig_file), " ", len(pred_file))
+
+compare_decoded_to_original_smiles(orig_file, pred_file, diff_file)
+
+ins = pd.read_csv(diff_file,delimiter=",")
+ins = ins.to_numpy()
+
+process_ins = get_valid(ins,mask_index)
+print("Input/pred SMILES diff file saved to", diff_file)
+
+np.savetxt(final_file, process_ins, delimiter=',', fmt ='% s')
+ 
+ins = pd.read_csv(final_file,delimiter=",")
+ins = ins.to_numpy()
+
+valid,same = get_metric(ins)
+
+print("Validity % ", valid*100)
+print("Same % ", same*100)
+
+

From 6fc19016a7e36dc5659e447919c68d57af030c62 Mon Sep 17 00:00:00 2001
From: Tuan Nguyen Anh Tran <tran71@lassen708.coral.llnl.gov>
Date: Wed, 27 Jul 2022 11:36:29 -0700
Subject: [PATCH 6/6] Update model config.

---
 applications/nlp/Roberta_atom/config.json | 26 +++++++++++++----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/applications/nlp/Roberta_atom/config.json b/applications/nlp/Roberta_atom/config.json
index 7d37f79227b..43d59cbaa38 100644
--- a/applications/nlp/Roberta_atom/config.json
+++ b/applications/nlp/Roberta_atom/config.json
@@ -2,21 +2,25 @@
   "architectures": [
     "RobertaForMaskedLM"
   ],
-  "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
+  "attention_probs_dropout_prob": 0.109,
+  "bos_token_id": 12,
+  "eos_token_id": 13,
+  "gradient_checkpointing": false,
   "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
+  "hidden_dropout_prob": 0.144,
+  "hidden_size": 384,
   "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 514,
+  "intermediate_size": 464,
+  "is_gpu": true,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 515,
   "model_type": "roberta",
   "num_attention_heads": 12,
-  "num_hidden_layers": 6,
-  "output_past": true,
+  "num_hidden_layers": 3,
   "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.6.1",
   "type_vocab_size": 1,
-  "vocab_size": 767
+  "use_cache": true,
+  "vocab_size": 600
 }