From 01de7bf2ae315872e4ae6b9ca2fae5e4a420afa6 Mon Sep 17 00:00:00 2001 From: Shivam Sharma <15UCC035@lnmiit.ac.in> Date: Sat, 23 Dec 2017 19:03:57 +0530 Subject: [PATCH 1/2] Updates 23/12/2017 --- lexdecomp/compmodel.py | 8 +++++--- lexdecomp/compmodel.pyc | Bin 0 -> 4026 bytes lexdecomp/train.py | 39 ++++++++++++++++++++++++--------------- tools/text2numpy.py | 1 + tools/word2vec2text.py | 20 ++++++++++++-------- trec-qa/code.py | 7 +++++++ 6 files changed, 49 insertions(+), 26 deletions(-) create mode 100644 lexdecomp/compmodel.pyc create mode 100644 trec-qa/code.py diff --git a/lexdecomp/compmodel.py b/lexdecomp/compmodel.py index 738120d..453a084 100644 --- a/lexdecomp/compmodel.py +++ b/lexdecomp/compmodel.py @@ -35,7 +35,7 @@ def conv_layer(in_data): embedding_size = in_data.shape[1] sequence_length = in_data.shape[2] in_channels = in_data.shape[3] - elif hasattr(in_data, 'get_shape'): # a TensorFlow placeholder + if hasattr(in_data, 'get_shape'): # a TensorFlow placeholder embedding_size = in_data.get_shape()[1].value sequence_length = in_data.get_shape()[2].value in_channels = in_data.get_shape()[3].value @@ -58,7 +58,8 @@ def conv_layer(in_data): pooled_outputs.append(pooled) # concatenating feature maps - features = tf.concat(3, pooled_outputs) + features = tf.concat(pooled_outputs,axis = 3) +# features = [pooled_outputs[0], pooled_outputs[1], pooled_outputs[2]] total_feature_maps = len(FILTER_SIZES) * NUM_FEATURE_MAPS features_flat = tf.reshape(features, [-1, total_feature_maps]) # features_flat.shape = [batch, total_feature_maps] @@ -88,7 +89,8 @@ def inference(questions, sentences, keep_prob): """ question_features = conv_layer(questions) sentence_features = conv_layer(sentences) - features = tf.concat(1, [question_features, sentence_features]) + features = tf.concat([question_features, sentence_features],axis=1) +# features = [1, question_features, sentence_features] scores = hidden_layer(features, keep_prob) return scores diff --git a/lexdecomp/compmodel.pyc b/lexdecomp/compmodel.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d32a0f11e15c1795f10b20302e82cc15fc231020 GIT binary patch literal 4026 zcmaJ^-EJGl6`oyElt@uhtT?S)x9KK8Q5ul-;}i}e1Z`wVP6gO(u~ckSsj(h%hvX{z zmz`PJD#)8EFVO-;(Z}drFMH7!=ySBaPQP=8OUZR3NjsXEJ#)^P^YfjvwSTR&p8w|u zzZr55EjtE^5JL1$8VNHZ}5iW>PO`Kxh5T{r!if=%95%rb!%vgC^rADO#EcKEs zRPk1;bpisOdy)2{B#%`>(J!Orf(d#x|W z#^ic$ds`N-qw}q-7;VeG-rjw<^I&V|9`$dT5p>| zMri~oxim%ZgSDqwF|A*F3oo7XM(5Zbp3G@mYrNnK19#E^VS z67`QsY@wuqMY&2i2 zA4#or2fKm?O(!UvZ5ZblhFFqe0xNnQOWndC+;!)D{I;C|Id%;XZL<^qj2*ZDPK2i} zT~T~O^E39^ohf^l7Bz8a>Q-;+Sky%NygCS%;ovv?+7*{z<%&CS|LnNk0tec%Ozal; z&=y#=g+V+t;Ck_UM~no8%Im)ixa{!7-vpuo7CA+^UW&i^-ChERyOJn&1G{7Q_Y)e}6isYKsXFhOq@N zCOjdo-zCSXO~JS>&Q`=^nHW1?|4BS2EmsNW@CNdxxb7NV4(}0Nx2G$t)GaY-6Rb^E z#0W}(ZdFWH5&z_9{Z9=0c^+RN`ZbUz8~)!U&|Y?~3SdND8ZVSa0(3n5?B%kM3HAfp z+yfKfAuxpaXo@%Q%f2*&@uqk5^wH7b;c@r#pm%(Fw8^JmYV~zV*c8Mn9hvdVP4BoH z9Q<;>+dX>HdtoIM%9KT_Z{lq-BjaF*EG`QbBr@0btj~bM!yI)?6E_|y6R>*vGpuIX zdr7H{*H@m3fb9kD(k+u5hA9ep;o(lwy9R!FEC*_w#i1%VQMD-|U^&lo$r;Je@NECd z@gdK?qMT(U>`)Fde-wI-~(I$CCF?&Yw%eh4A(Q_Qy~@L15if$mInJk91tM+0dx!! zz#@VGKpge~LqMD*=8R21ndJtQSt6AQzM4P^a-IGOhO5dxEaUj%K8%;5=%agJ3vO}7 zUX^(tN5s5l+92mrNDRMHDi87^>)Vu|pzu8!HfW##MJ8tUJ{z|j~M zS&jhne@Gidlt`15vHw9uptGSt%y-B^{yiG*(<=&6s?rBf_Itgf9)}HsTac=?vNV(CJj8JR?7bvLx*%^_yW3XYNj-=s%}|ZAKU^}MNomaHndw2 z&UdZnen!0Ud{?7FQ8jOOpJ8F!lvP1%Rc(CH_K+65y_rs zW5vr06-Jy*{R?JczaH*aHFQ=GZOOI#_B{1EuQHGXc>hDqU9R;HC5^vBH>PBOtP+J+ zmk9y_pr4J{pB&p(FFR$m8G0*84Bd3s8+_qE0?(ItG$o~2Tz7tCyF@ZQZE^ULkLKp5 zNQI?c;aK~_@|Wn?(96IJ_-3m>5?~0;E*+PeJ~$4b2e^fssN-@f50Q*6YY|~)%S|d1 z#R)o!vpyOIZB)*I$sa}3ly?|iNJmB&7$=d%#Vp@tEDJ8rOwN$8AIjucxrq`~_JWa$ z2_ECSN!fCXk1cv>bA^n;0)Zj3Gl#TAQkiLxr>P=0 UoOXT9Y1Ep{=Dp^6v(;?>7hWBQt^fc4 literal 0 HcmV?d00001 diff --git a/lexdecomp/train.py b/lexdecomp/train.py index a8f4a77..b379343 100644 --- a/lexdecomp/train.py +++ b/lexdecomp/train.py @@ -78,12 +78,15 @@ def run_training(training_data, dev_data, test_data, model_dir): [None, EMBEDDING_SIZE, max_sentence, IN_CHANNELS]) labels = tf.placeholder(tf.float32, [None]) keep_prob = tf.placeholder(tf.float32, name='keep_prob') - + # building the graph + print('HERE!-0') logits = compmodel.inference(questions, sentences, keep_prob) + print('HERE!-1') loss = compmodel.loss(logits, labels) + print('HERE!-2') train_op = compmodel.training(loss) - + print('HERE!') saver = tf.train.Saver() bestdev_model_file = Path(model_dir, 'best-dev_model.ckpt').as_posix() @@ -154,27 +157,33 @@ def evaluate(session, dataset, data_label, model_label): def main(): - parser = argparse.ArgumentParser( - description='Trains the sentence composition model (CNN).') - parser.add_argument('training', help='training set (.hdf5)') - parser.add_argument('dev', help='dev set (.hdf5)') - parser.add_argument('test', help='test set (.hdf5)') - parser.add_argument('model_dir', help='directory to save models') - args = parser.parse_args() - +# parser = argparse.ArgumentParser( +# description='Trains the sentence composition model (CNN).') +# parser.add_argument('training', help='training set (.hdf5)') +# parser.add_argument('dev', help='dev set (.hdf5)') +# parser.add_argument('test', help='test set (.hdf5)') +# parser.add_argument('model_dir', help='directory to save models') +# args = parser.parse_args() +# args.append('../train-filtered.hdf5') +# args.append('../dev-filtered.hdf5') +# args.append('../test-filtered.hdf5') +# args.append('../saved-model') # checking model directory - model_dir = Path(args.model_dir) +# print ('Yes1') + model_dir = Path('../saved-model') if not model_dir.exists(): model_dir.mkdir() # data files - training_data = h5py.File(args.training) - dev_data = h5py.File(args.dev) - test_data = h5py.File(args.test) +# print ('Yes2') + training_data = h5py.File('../train-filtered.hdf5') + dev_data = h5py.File('../dev-filtered.hdf5') + test_data = h5py.File('../test-filtered.hdf5') try: +# print ('Yes3') run_training(training_data, dev_data, test_data, - args.model_dir) + '../saved-model') finally: training_data.close() dev_data.close() diff --git a/tools/text2numpy.py b/tools/text2numpy.py index 9eb7f52..ef22767 100644 --- a/tools/text2numpy.py +++ b/tools/text2numpy.py @@ -1,3 +1,4 @@ +import sys import argparse import re import os diff --git a/tools/word2vec2text.py b/tools/word2vec2text.py index 6ebbb4a..5a0a31c 100644 --- a/tools/word2vec2text.py +++ b/tools/word2vec2text.py @@ -56,12 +56,16 @@ def memorymap(filename): if end == -1: break wordbytes = mvec[pos:end] - word = wordbytes.decode('utf-8', errors='replace').strip() - # reading the corresponding vector - pos = end + 1 - end = pos + byte_offset - vector = array('f', mvec[pos:end]) - if vocabulary is not None and word not in vocabulary: - continue # skip word if not in vocabulary - print(word, ' '.join(map(str, vector)), file=fout) + try: + word = wordbytes.decode('utf-8', errors='replace').strip() + # reading the corresponding vector + pos = end + 1 + end = pos + byte_offset + vector = array('f', mvec[pos:end]) + if vocabulary is not None and word not in vocabulary: + continue # skip word if not in vocabulary + else: + print(word, ' '.join(map(str, vector)), file=fout) + except: + continue print('finished') diff --git a/trec-qa/code.py b/trec-qa/code.py new file mode 100644 index 0000000..fbbf933 --- /dev/null +++ b/trec-qa/code.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +f = open('train-filtered.tsv') + +lines = f.readlines() + +lines = [line.split('\t') for line in lines] \ No newline at end of file From eb02f2541f4a3907bc7d690b64d89d995c6a2e65 Mon Sep 17 00:00:00 2001 From: Shivam Sharma <15ucc035@lnmiit.ac.in> Date: Sat, 23 Dec 2017 19:42:48 +0530 Subject: [PATCH 2/2] Update compmodel.py --- lexdecomp/compmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lexdecomp/compmodel.py b/lexdecomp/compmodel.py index 453a084..5bd7a44 100644 --- a/lexdecomp/compmodel.py +++ b/lexdecomp/compmodel.py @@ -35,7 +35,7 @@ def conv_layer(in_data): embedding_size = in_data.shape[1] sequence_length = in_data.shape[2] in_channels = in_data.shape[3] - if hasattr(in_data, 'get_shape'): # a TensorFlow placeholder + elif hasattr(in_data, 'get_shape'): # a TensorFlow placeholder embedding_size = in_data.get_shape()[1].value sequence_length = in_data.get_shape()[2].value in_channels = in_data.get_shape()[3].value