diff --git a/lexdecomp/compmodel.py b/lexdecomp/compmodel.py index 738120d..5bd7a44 100644 --- a/lexdecomp/compmodel.py +++ b/lexdecomp/compmodel.py @@ -58,7 +58,8 @@ def conv_layer(in_data): pooled_outputs.append(pooled) # concatenating feature maps - features = tf.concat(3, pooled_outputs) + features = tf.concat(pooled_outputs,axis = 3) +# features = [pooled_outputs[0], pooled_outputs[1], pooled_outputs[2]] total_feature_maps = len(FILTER_SIZES) * NUM_FEATURE_MAPS features_flat = tf.reshape(features, [-1, total_feature_maps]) # features_flat.shape = [batch, total_feature_maps] @@ -88,7 +89,8 @@ def inference(questions, sentences, keep_prob): """ question_features = conv_layer(questions) sentence_features = conv_layer(sentences) - features = tf.concat(1, [question_features, sentence_features]) + features = tf.concat([question_features, sentence_features],axis=1) +# features = [1, question_features, sentence_features] scores = hidden_layer(features, keep_prob) return scores diff --git a/lexdecomp/compmodel.pyc b/lexdecomp/compmodel.pyc new file mode 100644 index 0000000..d32a0f1 Binary files /dev/null and b/lexdecomp/compmodel.pyc differ diff --git a/lexdecomp/train.py b/lexdecomp/train.py index a8f4a77..b379343 100644 --- a/lexdecomp/train.py +++ b/lexdecomp/train.py @@ -78,12 +78,15 @@ def run_training(training_data, dev_data, test_data, model_dir): [None, EMBEDDING_SIZE, max_sentence, IN_CHANNELS]) labels = tf.placeholder(tf.float32, [None]) keep_prob = tf.placeholder(tf.float32, name='keep_prob') - + # building the graph + print('HERE!-0') logits = compmodel.inference(questions, sentences, keep_prob) + print('HERE!-1') loss = compmodel.loss(logits, labels) + print('HERE!-2') train_op = compmodel.training(loss) - + print('HERE!') saver = tf.train.Saver() bestdev_model_file = Path(model_dir, 'best-dev_model.ckpt').as_posix() @@ -154,27 +157,33 @@ def evaluate(session, dataset, data_label, model_label): def main(): - parser = argparse.ArgumentParser( - description='Trains the sentence composition model (CNN).') - parser.add_argument('training', help='training set (.hdf5)') - parser.add_argument('dev', help='dev set (.hdf5)') - parser.add_argument('test', help='test set (.hdf5)') - parser.add_argument('model_dir', help='directory to save models') - args = parser.parse_args() - +# parser = argparse.ArgumentParser( +# description='Trains the sentence composition model (CNN).') +# parser.add_argument('training', help='training set (.hdf5)') +# parser.add_argument('dev', help='dev set (.hdf5)') +# parser.add_argument('test', help='test set (.hdf5)') +# parser.add_argument('model_dir', help='directory to save models') +# args = parser.parse_args() +# args.append('../train-filtered.hdf5') +# args.append('../dev-filtered.hdf5') +# args.append('../test-filtered.hdf5') +# args.append('../saved-model') # checking model directory - model_dir = Path(args.model_dir) +# print ('Yes1') + model_dir = Path('../saved-model') if not model_dir.exists(): model_dir.mkdir() # data files - training_data = h5py.File(args.training) - dev_data = h5py.File(args.dev) - test_data = h5py.File(args.test) +# print ('Yes2') + training_data = h5py.File('../train-filtered.hdf5') + dev_data = h5py.File('../dev-filtered.hdf5') + test_data = h5py.File('../test-filtered.hdf5') try: +# print ('Yes3') run_training(training_data, dev_data, test_data, - args.model_dir) + '../saved-model') finally: training_data.close() dev_data.close() diff --git a/tools/text2numpy.py b/tools/text2numpy.py index 9eb7f52..ef22767 100644 --- a/tools/text2numpy.py +++ b/tools/text2numpy.py @@ -1,3 +1,4 @@ +import sys import argparse import re import os diff --git a/tools/word2vec2text.py b/tools/word2vec2text.py index 6ebbb4a..5a0a31c 100644 --- a/tools/word2vec2text.py +++ b/tools/word2vec2text.py @@ -56,12 +56,16 @@ def memorymap(filename): if end == -1: break wordbytes = mvec[pos:end] - word = wordbytes.decode('utf-8', errors='replace').strip() - # reading the corresponding vector - pos = end + 1 - end = pos + byte_offset - vector = array('f', mvec[pos:end]) - if vocabulary is not None and word not in vocabulary: - continue # skip word if not in vocabulary - print(word, ' '.join(map(str, vector)), file=fout) + try: + word = wordbytes.decode('utf-8', errors='replace').strip() + # reading the corresponding vector + pos = end + 1 + end = pos + byte_offset + vector = array('f', mvec[pos:end]) + if vocabulary is not None and word not in vocabulary: + continue # skip word if not in vocabulary + else: + print(word, ' '.join(map(str, vector)), file=fout) + except: + continue print('finished') diff --git a/trec-qa/code.py b/trec-qa/code.py new file mode 100644 index 0000000..fbbf933 --- /dev/null +++ b/trec-qa/code.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +f = open('train-filtered.tsv') + +lines = f.readlines() + +lines = [line.split('\t') for line in lines] \ No newline at end of file