From 485d317b98386fe709e53acbbca1718e8909d0f2 Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Tue, 22 Oct 2019 22:08:51 +0800 Subject: [PATCH 1/6] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f6812457..d81ee4a1 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ python train.py -task ext -mode train -bert_data_path BERT_DATA_PATH -ext_dropou #### TransformerAbs (baseline) ``` -python train.py -mode train -accum_count 5 -batch_size 300 -bert_data_path BERT_DATA_PATH -dec_dropout 0.1 -log_file ../../logs/cnndm_baseline -lr 0.1 -model_path MODEL_PATH -save_checkpoint_steps 2000 -seed 777 -sep_optim false -train_steps 200000 -use_bert_emb true -use_interval true -warmup_steps 8000 -visible_gpus 0,1,2,3 -max_pos 512 -report_every 50 -enc_hidden_size 512 -enc_layers 6 -enc_ff_size 2048 -enc_dropout 0.1 -dec_layers 6 -dec_hidden_size 512 -dec_ff_size 2048 -encoder baseline -task abs +python train.py -mode train -accum_count 5 -batch_size 300 -bert_data_path BERT_DATA_PATH -dec_dropout 0.1 -log_file ../../logs/cnndm_baseline -lr 0.05 -model_path MODEL_PATH -save_checkpoint_steps 2000 -seed 777 -sep_optim false -train_steps 200000 -use_bert_emb true -use_interval true -warmup_steps 8000 -visible_gpus 0,1,2,3 -max_pos 512 -report_every 50 -enc_hidden_size 512 -enc_layers 6 -enc_ff_size 2048 -enc_dropout 0.1 -dec_layers 6 -dec_hidden_size 512 -dec_ff_size 2048 -encoder baseline -task abs ``` #### BertAbs ``` From ba17e95de8cde9d5ddaeeba01df7cace584511b2 Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Wed, 23 Oct 2019 10:21:41 +0800 Subject: [PATCH 2/6] Update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d81ee4a1..da91e652 100644 --- a/README.md +++ b/README.md @@ -149,11 +149,15 @@ python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropo ## Model Evaluation +### CNN/DM ``` python train.py -task 
abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -max_length 200 -alpha 0.95 -min_length 50 -result_path ../logs/abs_bert_cnndm ``` +### XSum +``` + python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -min_length 20 -max_length 100 -alpha 0.9 -result_path ../logs/abs_bert_cnndm +``` * `-mode` can be {`validate, test`}, where `validate` will inspect the model directory and evaluate the model for each newly saved checkpoint, `test` need to be used with `-test_from`, indicating the checkpoint you want to use * `MODEL_PATH` is the directory of saved checkpoints * use `-mode valiadte` with `-test_all`, the system will load all saved checkpoints and select the top ones to generate summaries (this will take a while) - From ce8dc017fbef7c12b1b4bd764f0c3d20911ead5e Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Tue, 19 Nov 2019 22:25:05 +0000 Subject: [PATCH 3/6] trained baseline model for CNNDM, system outputs add trained baseline model for CNNDM, add system outputs --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index da91e652..2db0ec98 100644 --- a/README.md +++ b/README.md @@ -66,11 +66,17 @@ Results on CNN/DailyMail (20/8/2019): Some codes are borrowed from ONMT(https://github.com/OpenNMT/OpenNMT-py) ## Trained Models -[CNN/DM Extractive](https://drive.google.com/open?id=1kKWoV0QCbeIuFt85beQgJ4v0lujaXobJ) +[CNN/DM BertExt](https://drive.google.com/open?id=1kKWoV0QCbeIuFt85beQgJ4v0lujaXobJ) -[CNN/DM Abstractive](https://drive.google.com/open?id=1-IKVCtc4Q-BdZpjXc4s70_fRsWnjtYLr) +[CNN/DM BertExtAbs](https://drive.google.com/open?id=1-IKVCtc4Q-BdZpjXc4s70_fRsWnjtYLr) 
-[XSum](https://drive.google.com/open?id=1H50fClyTkNprWJNh10HWdGEdDdQIkzsI) +[CNN/DM TransformerAbs](https://drive.google.com/open?id=1yLCqT__ilQ3mf5YUUCw9-UToesX5Roxy) + +[XSum BertExtAbs](https://drive.google.com/open?id=1H50fClyTkNprWJNh10HWdGEdDdQIkzsI) + +## System Outputs + +[CNN/DM and XSum](https://drive.google.com/file/d/1kYA384UEAQkvmZ-yWZAfxw7htCbCwFzC) ## Data Preparation For XSum [Pre-processed data](https://drive.google.com/open?id=1BWBN1coTWGBqrWoOfRc5dhojPHhatbYs) From 70d7a324923e7d0e229af592013bf3b60ccc82ef Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Wed, 22 Jan 2020 19:13:54 +0000 Subject: [PATCH 4/6] update Readme for summarizing raw text input --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 2db0ec98..a40ee68b 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,15 @@ **This code is for EMNLP 2019 paper [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345)** +**Updates Jan 22 2020**: Now you can **Summarize Raw Text Input!** Switch to the dev branch, and use `-text_src $RAW_SRC.TXT` to input your text file. +* use `-test_from $PT_FILE` to use your model checkpoint file. +* Format of the source text file: + * For **abstractive summarization**, each line is a document. + * If you want to do **extractive summarization**, please insert ` [CLS] [SEP] ` as your sentence boundaries. +* There are example input files in the [raw_data directory](https://github.com/nlpyang/PreSumm/tree/dev/raw_data) +* If you also have reference summaries aligned with your source input, please use `-text_tgt $RAW_TGT.TXT` to keep the order for evaluation. + + Results on CNN/DailyMail (20/8/2019): @@ -60,6 +69,8 @@ Results on CNN/DailyMail (20/8/2019): **Package Requirements**: torch==1.1.0 pytorch_transformers tensorboardX multiprocess pyrouge + + **Updates**: For encoding a text longer than 512 tokens, for example 800. Set max_pos to 800 during both preprocessing and training.
From e61cad64e3651047e35aeb8058e0dce7fdd6bd02 Mon Sep 17 00:00:00 2001 From: Alcamech Date: Thu, 23 Jan 2020 10:30:49 -0500 Subject: [PATCH 5/6] updated README, setup test_text_ext and test_text_abs --- README.md | 18 +++++++++++++++--- src/train.py | 22 +++++++++++++++++----- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f6812457..1afa73c2 100644 --- a/README.md +++ b/README.md @@ -145,9 +145,6 @@ python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropo ``` * `EXT_CKPT` is the saved `.pt` checkpoint of the extractive model. - - - ## Model Evaluation ``` python train.py -task abs -mode validate -batch_size 3000 -test_batch_size 500 -bert_data_path BERT_DATA_PATH -log_file ../logs/val_abs_bert_cnndm -model_path MODEL_PATH -sep_optim true -use_interval true -visible_gpus 1 -max_pos 512 -max_length 200 -alpha 0.95 -min_length 50 -result_path ../logs/abs_bert_cnndm @@ -156,4 +153,19 @@ python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropo * `MODEL_PATH` is the directory of saved checkpoints * use `-mode valiadte` with `-test_all`, the system will load all saved checkpoints and select the top ones to generate summaries (this will take a while) +## Raw Text Input + +### Abstractive Summarization + +``` + python train.py -task abs -mode test_text -visible_gpus 0 -test_from PATH_TO_CHECKPOINT -text_src PATH_TO_SRC -text_tgt PATH_TO_TGT -log_file ../logs/abs_bert_cnndm +``` +### Extractive Summarization + +``` + python train.py -task ext -mode test_text -visible_gpus 0 -test_from PATH_TO_CHECKPOINT -text_src PATH_TO_SRC -text_tgt PATH_TO_TGT -log_file ../logs/abs_bert_cnndm +``` + + + diff --git a/src/train.py b/src/train.py index c941269a..a3ee1487 100644 --- a/src/train.py +++ b/src/train.py @@ -5,6 +5,7 @@ from __future__ import division import argparse +import torch import os from others.logging import init_logger from train_abstractive import validate_abs, train_abs, baseline, 
test_abs, test_text_abs @@ -30,16 +31,15 @@ def str2bool(v): parser.add_argument("-task", default='ext', type=str, choices=['ext', 'abs']) parser.add_argument("-encoder", default='bert', type=str, choices=['bert', 'baseline']) parser.add_argument("-mode", default='train', type=str, choices=['train', 'validate', 'test', 'test_text']) - parser.add_argument("-bert_data_path", default='../bert_data_new/cnndm') + parser.add_argument("-text_src", default="../raw_dada/temp.raw_src") + parser.add_argument("-text_tgt", default="../raw_dada/temp.raw_tgt") + parser.add_argument("-bert_data_path", default='../bert_data/cnndm') parser.add_argument("-model_path", default='../models/') parser.add_argument("-result_path", default='../results/cnndm') parser.add_argument("-temp_dir", default='../temp') - parser.add_argument("-text_src", default='') - parser.add_argument("-text_tgt", default='') parser.add_argument("-batch_size", default=140, type=int) parser.add_argument("-test_batch_size", default=200, type=int) - parser.add_argument("-max_ndocs_in_batch", default=6, type=int) parser.add_argument("-max_pos", default=512, type=int) parser.add_argument("-use_interval", type=str2bool, nargs='?',const=True,default=True) @@ -120,6 +120,8 @@ def str2bool(v): device = "cpu" if args.visible_gpus == '-1' else "cuda" device_id = 0 if device == "cuda" else -1 + torch.cuda.empty_cache() + if (args.task == 'abs'): if (args.mode == 'train'): train_abs(args, device_id) @@ -137,6 +139,11 @@ def str2bool(v): step = 0 test_abs(args, device_id, cp, step) elif (args.mode == 'test_text'): + cp = args.test_from + try: + step = int(cp.split('.')[-2].split('_')[-1]) + except: + step = 0 test_text_abs(args) elif (args.task == 'ext'): @@ -152,4 +159,9 @@ def str2bool(v): step = 0 test_ext(args, device_id, cp, step) elif (args.mode == 'test_text'): - test_text_ext(args) + cp = args.test_from + try: + step = int(cp.split('.')[-2].split('_')[-1]) + except: + step = 0 + test_text_ext(args) \ No newline at end 
of file From 50051f8f4dfb11da9e9b7d220d1546aaad99a92f Mon Sep 17 00:00:00 2001 From: alcamech Date: Tue, 4 Feb 2020 10:12:02 -0500 Subject: [PATCH 6/6] string input feature --- README.md | 8 +++++++- src/models/data_loader.py | 25 +++++++++++++++++++++++-- src/train.py | 5 +++-- src/train_abstractive.py | 2 -- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1afa73c2..573cc111 100644 --- a/README.md +++ b/README.md @@ -156,10 +156,16 @@ python train.py -task abs -mode train -bert_data_path BERT_DATA_PATH -dec_dropo ## Raw Text Input ### Abstractive Summarization - + +Source File ``` python train.py -task abs -mode test_text -visible_gpus 0 -test_from PATH_TO_CHECKPOINT -text_src PATH_TO_SRC -text_tgt PATH_TO_TGT -log_file ../logs/abs_bert_cnndm ``` + +String Input +```python +python train.py -task abs -mode test_text -visible_gpus 0 -test_from ../models/model_step_148000.pt -text_src 'this is a string test' -input_type str -log_file ../logs/abs_bert_cnndm +``` ### Extractive Summarization ``` diff --git a/src/models/data_loader.py b/src/models/data_loader.py index 813e3b86..5165c49b 100644 --- a/src/models/data_loader.py +++ b/src/models/data_loader.py @@ -2,6 +2,7 @@ import gc import glob import random +import os.path import torch from tqdm import tqdm @@ -295,7 +296,11 @@ def load_text(args, source_fp, target_fp, device): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) sep_vid = tokenizer.vocab['[SEP]'] cls_vid = tokenizer.vocab['[CLS]'] - n_lines = len(open(source_fp).read().split('\n')) + + if os.path.isfile(source_fp): + n_lines = len(open(source_fp).read().split('\n')) + else: + n_lines = 1 def _process_src(raw): raw = raw.strip().lower() @@ -323,7 +328,7 @@ def _process_src(raw): return src, mask_src, segments_ids, clss, mask_cls - if(target_fp==''): + if(target_fp=='' and args.input_type == 'doc'): with open(source_fp) as source: for x in tqdm(source, total=n_lines): src, mask_src, 
segments_ids, clss, mask_cls = _process_src(x) @@ -341,6 +346,22 @@ def _process_src(raw): batch.batch_size=1 yield batch + elif(args.input_type == 'str'): + src, mask_src, segments_ids, clss, mask_cls = _process_src(source_fp) + segs = torch.tensor(segments_ids)[None, :].to(device) + batch = Batch() + batch.src = src + batch.tgt = None + batch.mask_src = mask_src + batch.mask_tgt = None + batch.segs = segs + batch.src_str = [[sent.replace('[SEP]','').strip() for sent in source_fp.split('[CLS]')]] + batch.tgt_str = [''] + batch.clss = clss + batch.mask_cls = mask_cls + + batch.batch_size=1 + yield batch else: with open(source_fp) as source, open(target_fp) as target: for x, y in tqdm(zip(source, target), total=n_lines): diff --git a/src/train.py b/src/train.py index a3ee1487..8fe97ffd 100644 --- a/src/train.py +++ b/src/train.py @@ -31,8 +31,9 @@ def str2bool(v): parser.add_argument("-task", default='ext', type=str, choices=['ext', 'abs']) parser.add_argument("-encoder", default='bert', type=str, choices=['bert', 'baseline']) parser.add_argument("-mode", default='train', type=str, choices=['train', 'validate', 'test', 'test_text']) - parser.add_argument("-text_src", default="../raw_dada/temp.raw_src") - parser.add_argument("-text_tgt", default="../raw_dada/temp.raw_tgt") + parser.add_argument("-text_src", default='../raw_data/temp.raw_src') + parser.add_argument("-input_type",default='doc',type=str, choices=['doc','str']) + parser.add_argument("-text_tgt", default='') parser.add_argument("-bert_data_path", default='../bert_data/cnndm') parser.add_argument("-model_path", default='../models/') parser.add_argument("-result_path", default='../results/cnndm') diff --git a/src/train_abstractive.py b/src/train_abstractive.py index fd16ac45..8f4b2f42 100644 --- a/src/train_abstractive.py +++ b/src/train_abstractive.py @@ -307,8 +307,6 @@ def train_iter_fct(): trainer.train(train_iter_fct, args.train_steps) - - def test_text_abs(args): logger.info('Loading checkpoint from 
%s' % args.test_from)