diff --git a/.gitignore b/.gitignore index f3077b3..64c347b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,30 +1,53 @@ -Makefile -TurboParser -TurboTagger -autom4te.cache -compile -config.h -config.log -config.status -deps/AD3-2.0.2/ -deps/gflags-2.0/ -deps/glog-0.3.2/ -deps/eigen-eigen-c58038c56923/ -deps/local/ -data_local/ -ner/ -*.o -*.obj -*.tlog -stamp-h1 -.deps -vsprojects/*/x64 -vsprojects/x64/* -vsprojects/*/Debug -vsprojects/Debug/* -vsprojects/*/Release -vsprojects/Release/* -vsprojects/*/*.vcxproj.user -vsprojects/*/*.vcxproj.filters -vsprojects/*.sdf -vsprojects/*.opensdf \ No newline at end of file +Makefile +TurboParser +TurboTagger +autom4te.cache +compile +config.h +config.log +config.status +deps/AD3-2.0.2/ +deps/gflags-2.0/ +deps/glog-0.3.2/ +deps/googletest/ +deps/eigen-eigen-c58038c56923/ +deps/local/ +data_local/ +ner/ +*.o +*.obj +*.tlog +stamp-h1 +.deps +vsprojects/*.suo +vsprojects/*.user +vsprojects/*.userosscache +vsprojects/*.sln.docstates +vsprojects/*.aps +vsprojects/*.ncb +vsprojects/*.opendb +vsprojects/*.VC.db +vsprojects/*.opensdf +vsprojects/*.sdf +vsprojects/*.cachefile +vsprojects/*.psess +vsprojects/*.vsp +vsprojects/*.vspx +vsprojects/*/*.vcxproj.user +vsprojects/*/*.vcxproj.filters +vsprojects/*/x64 +vsprojects/x64/* +vsprojects/*/Debug +vsprojects/Debug/* +vsprojects/*/Release +vsprojects/Release/* +/python/tokenizers/portuguese/__pycache__ +/python/__pycache__ +/python/cython_debug +/python/build +/python/tokenizers/__pycache__ +/python/turboparser.cp35-win_amd64.pyd +/python/turboparser.cp35-win_amd64.pdb +/python/vc140.pdb +/python/turboparser.cpp +/python/test2.py diff --git a/aclocal.m4 b/aclocal.m4 index 3a9a040..e60dd8d 100644 --- a/aclocal.m4 +++ b/aclocal.m4 @@ -1,1149 +1,1149 @@ -# generated automatically by aclocal 1.14.1 -*- Autoconf -*- - -# Copyright (C) 1996-2013 Free Software Foundation, Inc. 
- -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])]) -m4_ifndef([AC_AUTOCONF_VERSION], - [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl -m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],, -[m4_warning([this file was generated for autoconf 2.69. -You have another version of autoconf. It may work, but is not guaranteed to. -If you have problems, you may need to regenerate the build system entirely. -To do so, use the procedure documented by the package, typically 'autoreconf'.])]) - -# Copyright (C) 2002-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_AUTOMAKE_VERSION(VERSION) -# ---------------------------- -# Automake X.Y traces this macro to ensure aclocal.m4 has been -# generated from the m4 files accompanying Automake X.Y. -# (This private macro should not be called outside this file.) -AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.14' -dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to -dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.14.1], [], - [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl -]) - -# _AM_AUTOCONF_VERSION(VERSION) -# ----------------------------- -# aclocal traces this macro to find the Autoconf version. -# This is a private macro too. 
Using m4_define simplifies -# the logic in aclocal, which can simply ignore this definition. -m4_define([_AM_AUTOCONF_VERSION], []) - -# AM_SET_CURRENT_AUTOMAKE_VERSION -# ------------------------------- -# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. -# This function is AC_REQUIREd by AM_INIT_AUTOMAKE. -AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.14.1])dnl -m4_ifndef([AC_AUTOCONF_VERSION], - [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl -_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) - -# AM_AUX_DIR_EXPAND -*- Autoconf -*- - -# Copyright (C) 2001-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets -# $ac_aux_dir to '$srcdir/foo'. In other projects, it is set to -# '$srcdir', '$srcdir/..', or '$srcdir/../..'. -# -# Of course, Automake must honor this variable whenever it calls a -# tool from the auxiliary directory. The problem is that $srcdir (and -# therefore $ac_aux_dir as well) can be either absolute or relative, -# depending on how configure is run. This is pretty annoying, since -# it makes $ac_aux_dir quite unusable in subdirectories: in the top -# source directory, any form will work fine, but in subdirectories a -# relative path needs to be adjusted first. -# -# $ac_aux_dir/missing -# fails when called from a subdirectory if $ac_aux_dir is relative -# $top_srcdir/$ac_aux_dir/missing -# fails if $ac_aux_dir is absolute, -# fails when called from a subdirectory in a VPATH build with -# a relative $ac_aux_dir -# -# The reason of the latter failure is that $top_srcdir and $ac_aux_dir -# are both prefixed by $srcdir. 
In an in-source build this is usually -# harmless because $srcdir is '.', but things will broke when you -# start a VPATH build or use an absolute $srcdir. -# -# So we could use something similar to $top_srcdir/$ac_aux_dir/missing, -# iff we strip the leading $srcdir from $ac_aux_dir. That would be: -# am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"` -# and then we would define $MISSING as -# MISSING="\${SHELL} $am_aux_dir/missing" -# This will work as long as MISSING is not called from configure, because -# unfortunately $(top_srcdir) has no meaning in configure. -# However there are other variables, like CC, which are often used in -# configure, and could therefore not use this "fixed" $ac_aux_dir. -# -# Another solution, used here, is to always expand $ac_aux_dir to an -# absolute PATH. The drawback is that using absolute paths prevent a -# configured tree to be moved without reconfiguration. - -AC_DEFUN([AM_AUX_DIR_EXPAND], -[dnl Rely on autoconf to set up CDPATH properly. -AC_PREREQ([2.50])dnl -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` -]) - -# AM_CONDITIONAL -*- Autoconf -*- - -# Copyright (C) 1997-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_CONDITIONAL(NAME, SHELL-CONDITION) -# ------------------------------------- -# Define a conditional. 
-AC_DEFUN([AM_CONDITIONAL], -[AC_PREREQ([2.52])dnl - m4_if([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])], - [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl -AC_SUBST([$1_TRUE])dnl -AC_SUBST([$1_FALSE])dnl -_AM_SUBST_NOTMAKE([$1_TRUE])dnl -_AM_SUBST_NOTMAKE([$1_FALSE])dnl -m4_define([_AM_COND_VALUE_$1], [$2])dnl -if $2; then - $1_TRUE= - $1_FALSE='#' -else - $1_TRUE='#' - $1_FALSE= -fi -AC_CONFIG_COMMANDS_PRE( -[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then - AC_MSG_ERROR([[conditional "$1" was never defined. -Usually this means the macro was only invoked conditionally.]]) -fi])]) - -# Copyright (C) 1999-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - - -# There are a few dirty hacks below to avoid letting 'AC_PROG_CC' be -# written in clear, in which case automake, when reading aclocal.m4, -# will think it sees a *use*, and therefore will trigger all it's -# C support machinery. Also note that it means that autoscan, seeing -# CC etc. in the Makefile, will ask for an AC_PROG_CC use... - - -# _AM_DEPENDENCIES(NAME) -# ---------------------- -# See how the compiler implements dependency checking. -# NAME is "CC", "CXX", "OBJC", "OBJCXX", "UPC", or "GJC". -# We try a few techniques and use that to set a single cache variable. -# -# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was -# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular -# dependency, and given that the user is not expected to run this macro, -# just rely on AC_PROG_CC. 
-AC_DEFUN([_AM_DEPENDENCIES], -[AC_REQUIRE([AM_SET_DEPDIR])dnl -AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl -AC_REQUIRE([AM_MAKE_INCLUDE])dnl -AC_REQUIRE([AM_DEP_TRACK])dnl - -m4_if([$1], [CC], [depcc="$CC" am_compiler_list=], - [$1], [CXX], [depcc="$CXX" am_compiler_list=], - [$1], [OBJC], [depcc="$OBJC" am_compiler_list='gcc3 gcc'], - [$1], [OBJCXX], [depcc="$OBJCXX" am_compiler_list='gcc3 gcc'], - [$1], [UPC], [depcc="$UPC" am_compiler_list=], - [$1], [GCJ], [depcc="$GCJ" am_compiler_list='gcc3 gcc'], - [depcc="$$1" am_compiler_list=]) - -AC_CACHE_CHECK([dependency style of $depcc], - [am_cv_$1_dependencies_compiler_type], -[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named 'D' -- because '-MD' means "put the output - # in D". - rm -rf conftest.dir - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. 
- mkdir sub - - am_cv_$1_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp` - fi - am__universal=false - m4_case([$1], [CC], - [case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac], - [CXX], - [case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac]) - - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with - # Solaris 10 /bin/sh. - echo '/* dummy */' > sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - # We check with '-c' and '-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle '-M -o', and we need to detect this. Also, some Intel - # versions had trouble with output in subdirs. - am__obj=sub/conftest.${OBJEXT-o} - am__minus_obj="-o $am__obj" - case $depmode in - gcc) - # This depmode causes a compiler race in universal mode. - test "$am__universal" = false || continue - ;; - nosideeffect) - # After this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested. - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) - # This compiler won't grok '-c -o', but also, the minuso test has - # not run yet. 
These depmodes are late enough in the game, and - # so weak that their functioning should not be impacted. - am__obj=conftest.${OBJEXT-o} - am__minus_obj= - ;; - none) break ;; - esac - if depmode=$depmode \ - source=sub/conftest.c object=$am__obj \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep $am__obj sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_$1_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. - rm -rf conftest.dir -else - am_cv_$1_dependencies_compiler_type=none -fi -]) -AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type]) -AM_CONDITIONAL([am__fastdep$1], [ - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_$1_dependencies_compiler_type" = gcc3]) -]) - - -# AM_SET_DEPDIR -# ------------- -# Choose a directory name for dependency files. -# This macro is AC_REQUIREd in _AM_DEPENDENCIES. 
-AC_DEFUN([AM_SET_DEPDIR], -[AC_REQUIRE([AM_SET_LEADING_DOT])dnl -AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl -]) - - -# AM_DEP_TRACK -# ------------ -AC_DEFUN([AM_DEP_TRACK], -[AC_ARG_ENABLE([dependency-tracking], [dnl -AS_HELP_STRING( - [--enable-dependency-tracking], - [do not reject slow dependency extractors]) -AS_HELP_STRING( - [--disable-dependency-tracking], - [speeds up one-time build])]) -if test "x$enable_dependency_tracking" != xno; then - am_depcomp="$ac_aux_dir/depcomp" - AMDEPBACKSLASH='\' - am__nodep='_no' -fi -AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno]) -AC_SUBST([AMDEPBACKSLASH])dnl -_AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl -AC_SUBST([am__nodep])dnl -_AM_SUBST_NOTMAKE([am__nodep])dnl -]) - -# Generate code to set up dependency tracking. -*- Autoconf -*- - -# Copyright (C) 1999-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - - -# _AM_OUTPUT_DEPENDENCY_COMMANDS -# ------------------------------ -AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], -[{ - # Older Autoconf quotes --file arguments for eval, but not when files - # are listed without --file. Let's play safe and only enable the eval - # if we detect the quoting. - case $CONFIG_FILES in - *\'*) eval set x "$CONFIG_FILES" ;; - *) set x $CONFIG_FILES ;; - esac - shift - for mf - do - # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named 'Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. 
- # Grep'ing the whole file is not good either: AIX grep has a line - # limit of 2048, but all sed's we know have understand at least 4000. - if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then - dirpart=`AS_DIRNAME("$mf")` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running 'make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "$am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. - for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do - # Make sure the directory exists. - test -f "$dirpart/$file" && continue - fdir=`AS_DIRNAME(["$file"])` - AS_MKDIR_P([$dirpart/$fdir]) - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done - done -} -])# _AM_OUTPUT_DEPENDENCY_COMMANDS - - -# AM_OUTPUT_DEPENDENCY_COMMANDS -# ----------------------------- -# This macro should only be invoked once -- use via AC_REQUIRE. -# -# This code is only required when automatic dependency tracking -# is enabled. FIXME. This creates each '.P' file that we will -# need in order to bootstrap the dependency handling code. -AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], -[AC_CONFIG_COMMANDS([depfiles], - [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], - [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"]) -]) - -# Do all the work for Automake. -*- Autoconf -*- - -# Copyright (C) 1996-2013 Free Software Foundation, Inc. 
-# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This macro actually does too much. Some checks are only needed if -# your package does certain things. But this isn't really a big deal. - -dnl Redefine AC_PROG_CC to automatically invoke _AM_PROG_CC_C_O. -m4_define([AC_PROG_CC], -m4_defn([AC_PROG_CC]) -[_AM_PROG_CC_C_O -]) - -# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE]) -# AM_INIT_AUTOMAKE([OPTIONS]) -# ----------------------------------------------- -# The call with PACKAGE and VERSION arguments is the old style -# call (pre autoconf-2.50), which is being phased out. PACKAGE -# and VERSION should now be passed to AC_INIT and removed from -# the call to AM_INIT_AUTOMAKE. -# We support both call styles for the transition. After -# the next Automake release, Autoconf can make the AC_INIT -# arguments mandatory, and then we can depend on a new Autoconf -# release and drop the old call support. -AC_DEFUN([AM_INIT_AUTOMAKE], -[AC_PREREQ([2.65])dnl -dnl Autoconf wants to disallow AM_ names. We explicitly allow -dnl the ones we care about. -m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl -AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl -AC_REQUIRE([AC_PROG_INSTALL])dnl -if test "`cd $srcdir && pwd`" != "`pwd`"; then - # Use -I$(srcdir) only when $(srcdir) != ., so that make's output - # is not polluted with repeated "-I." - AC_SUBST([am__isrc], [' -I$(srcdir)'])_AM_SUBST_NOTMAKE([am__isrc])dnl - # test to see if srcdir already configured - if test -f $srcdir/config.status; then - AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) - fi -fi - -# test whether we have cygpath -if test -z "$CYGPATH_W"; then - if (cygpath --version) >/dev/null 2>/dev/null; then - CYGPATH_W='cygpath -w' - else - CYGPATH_W=echo - fi -fi -AC_SUBST([CYGPATH_W]) - -# Define the identity of the package. 
-dnl Distinguish between old-style and new-style calls. -m4_ifval([$2], -[AC_DIAGNOSE([obsolete], - [$0: two- and three-arguments forms are deprecated.]) -m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl - AC_SUBST([PACKAGE], [$1])dnl - AC_SUBST([VERSION], [$2])], -[_AM_SET_OPTIONS([$1])dnl -dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT. -m4_if( - m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]), - [ok:ok],, - [m4_fatal([AC_INIT should be called with package and version arguments])])dnl - AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl - AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl - -_AM_IF_OPTION([no-define],, -[AC_DEFINE_UNQUOTED([PACKAGE], ["$PACKAGE"], [Name of package]) - AC_DEFINE_UNQUOTED([VERSION], ["$VERSION"], [Version number of package])])dnl - -# Some tools Automake needs. -AC_REQUIRE([AM_SANITY_CHECK])dnl -AC_REQUIRE([AC_ARG_PROGRAM])dnl -AM_MISSING_PROG([ACLOCAL], [aclocal-${am__api_version}]) -AM_MISSING_PROG([AUTOCONF], [autoconf]) -AM_MISSING_PROG([AUTOMAKE], [automake-${am__api_version}]) -AM_MISSING_PROG([AUTOHEADER], [autoheader]) -AM_MISSING_PROG([MAKEINFO], [makeinfo]) -AC_REQUIRE([AM_PROG_INSTALL_SH])dnl -AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl -AC_REQUIRE([AC_PROG_MKDIR_P])dnl -# For better backward compatibility. To be removed once Automake 1.9.x -# dies out for good. For more background, see: -# -# -AC_SUBST([mkdir_p], ['$(MKDIR_P)']) -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. 
-AC_REQUIRE([AC_PROG_AWK])dnl -AC_REQUIRE([AC_PROG_MAKE_SET])dnl -AC_REQUIRE([AM_SET_LEADING_DOT])dnl -_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])], - [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])], - [_AM_PROG_TAR([v7])])]) -_AM_IF_OPTION([no-dependencies],, -[AC_PROVIDE_IFELSE([AC_PROG_CC], - [_AM_DEPENDENCIES([CC])], - [m4_define([AC_PROG_CC], - m4_defn([AC_PROG_CC])[_AM_DEPENDENCIES([CC])])])dnl -AC_PROVIDE_IFELSE([AC_PROG_CXX], - [_AM_DEPENDENCIES([CXX])], - [m4_define([AC_PROG_CXX], - m4_defn([AC_PROG_CXX])[_AM_DEPENDENCIES([CXX])])])dnl -AC_PROVIDE_IFELSE([AC_PROG_OBJC], - [_AM_DEPENDENCIES([OBJC])], - [m4_define([AC_PROG_OBJC], - m4_defn([AC_PROG_OBJC])[_AM_DEPENDENCIES([OBJC])])])dnl -AC_PROVIDE_IFELSE([AC_PROG_OBJCXX], - [_AM_DEPENDENCIES([OBJCXX])], - [m4_define([AC_PROG_OBJCXX], - m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl -]) -AC_REQUIRE([AM_SILENT_RULES])dnl -dnl The testsuite driver may need to know about EXEEXT, so add the -dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This -dnl macro is hooked onto _AC_COMPILER_EXEEXT early, see below. -AC_CONFIG_COMMANDS_PRE(dnl -[m4_provide_if([_AM_COMPILER_EXEEXT], - [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl - -# POSIX will say in a future version that running "rm -f" with no argument -# is OK; and we want to be able to make that assumption in our Makefile -# recipes. So use an aggressive probe to check that the usage we want is -# actually supported "in the wild" to an acceptable degree. -# See automake bug#10828. -# To make any issue more visible, cause the running configure to be aborted -# by default if the 'rm' program in use doesn't match our expectations; the -# user can still override this though. -if rm -f && rm -fr && rm -rf; then : OK; else - cat >&2 <<'END' -Oops! - -Your 'rm' program seems unable to run without file operands specified -on the command line, even when the '-f' option is present. 
This is contrary -to the behaviour of most rm programs out there, and not conforming with -the upcoming POSIX standard: - -Please tell bug-automake@gnu.org about your system, including the value -of your $PATH and any error possibly output before this message. This -can help us improve future automake versions. - -END - if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then - echo 'Configuration will proceed anyway, since you have set the' >&2 - echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2 - echo >&2 - else - cat >&2 <<'END' -Aborting the configuration process, to ensure you take notice of the issue. - -You can download and install GNU coreutils to get an 'rm' implementation -that behaves properly: . - -If you want to complete the configuration process using your problematic -'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM -to "yes", and re-run configure. - -END - AC_MSG_ERROR([Your 'rm' program is bad, sorry.]) - fi -fi]) - -dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not -dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further -dnl mangled by Autoconf and run in a shell conditional statement. -m4_define([_AC_COMPILER_EXEEXT], -m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])]) - -# When config.status generates a header, we must update the stamp-h file. -# This file resides in the same directory as the config header -# that is generated. The stamp files are numbered to have different names. - -# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the -# loop where config.status creates the headers, so we can generate -# our stamp files there. -AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK], -[# Compute $1's index in $config_headers. 
-_am_arg=$1 -_am_stamp_count=1 -for _am_header in $config_headers :; do - case $_am_header in - $_am_arg | $_am_arg:* ) - break ;; - * ) - _am_stamp_count=`expr $_am_stamp_count + 1` ;; - esac -done -echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) - -# Copyright (C) 2001-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_PROG_INSTALL_SH -# ------------------ -# Define $install_sh. -AC_DEFUN([AM_PROG_INSTALL_SH], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -if test x"${install_sh}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; - *) - install_sh="\${SHELL} $am_aux_dir/install-sh" - esac -fi -AC_SUBST([install_sh])]) - -# Copyright (C) 2003-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# Check whether the underlying file-system supports filenames -# with a leading dot. For instance MS-DOS doesn't. -AC_DEFUN([AM_SET_LEADING_DOT], -[rm -rf .tst 2>/dev/null -mkdir .tst 2>/dev/null -if test -d .tst; then - am__leading_dot=. -else - am__leading_dot=_ -fi -rmdir .tst 2>/dev/null -AC_SUBST([am__leading_dot])]) - -# Check to see how 'make' treats includes. -*- Autoconf -*- - -# Copyright (C) 2001-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_MAKE_INCLUDE() -# ----------------- -# Check to see how make treats includes. 
-AC_DEFUN([AM_MAKE_INCLUDE], -[am_make=${MAKE-make} -cat > confinc << 'END' -am__doit: - @echo this is the am__doit target -.PHONY: am__doit -END -# If we don't find an include directive, just comment out the code. -AC_MSG_CHECKING([for style of include used by $am_make]) -am__include="#" -am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# Ignore all kinds of additional output from 'make'. -case `$am_make -s -f confmf 2> /dev/null` in #( -*the\ am__doit\ target*) - am__include=include - am__quote= - _am_result=GNU - ;; -esac -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - case `$am_make -s -f confmf 2> /dev/null` in #( - *the\ am__doit\ target*) - am__include=.include - am__quote="\"" - _am_result=BSD - ;; - esac -fi -AC_SUBST([am__include]) -AC_SUBST([am__quote]) -AC_MSG_RESULT([$_am_result]) -rm -f confinc confmf -]) - -# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- - -# Copyright (C) 1997-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_MISSING_PROG(NAME, PROGRAM) -# ------------------------------ -AC_DEFUN([AM_MISSING_PROG], -[AC_REQUIRE([AM_MISSING_HAS_RUN]) -$1=${$1-"${am_missing_run}$2"} -AC_SUBST($1)]) - -# AM_MISSING_HAS_RUN -# ------------------ -# Define MISSING if not defined so far and test if it is modern enough. -# If it is, set am_missing_run to use it, otherwise, to nothing. 
-AC_DEFUN([AM_MISSING_HAS_RUN], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -AC_REQUIRE_AUX_FILE([missing])dnl -if test x"${MISSING+set}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; - *) - MISSING="\${SHELL} $am_aux_dir/missing" ;; - esac -fi -# Use eval to expand $SHELL -if eval "$MISSING --is-lightweight"; then - am_missing_run="$MISSING " -else - am_missing_run= - AC_MSG_WARN(['missing' script is too old or missing]) -fi -]) - -# Helper functions for option handling. -*- Autoconf -*- - -# Copyright (C) 2001-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_MANGLE_OPTION(NAME) -# ----------------------- -AC_DEFUN([_AM_MANGLE_OPTION], -[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])]) - -# _AM_SET_OPTION(NAME) -# -------------------- -# Set option NAME. Presently that only means defining a flag for this option. -AC_DEFUN([_AM_SET_OPTION], -[m4_define(_AM_MANGLE_OPTION([$1]), [1])]) - -# _AM_SET_OPTIONS(OPTIONS) -# ------------------------ -# OPTIONS is a space-separated list of Automake options. -AC_DEFUN([_AM_SET_OPTIONS], -[m4_foreach_w([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])]) - -# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET]) -# ------------------------------------------- -# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise. -AC_DEFUN([_AM_IF_OPTION], -[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) - -# Copyright (C) 1999-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_PROG_CC_C_O -# --------------- -# Like AC_PROG_CC_C_O, but changed for automake. We rewrite AC_PROG_CC -# to automatically call this. 
-AC_DEFUN([_AM_PROG_CC_C_O], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -AC_REQUIRE_AUX_FILE([compile])dnl -AC_LANG_PUSH([C])dnl -AC_CACHE_CHECK( - [whether $CC understands -c and -o together], - [am_cv_prog_cc_c_o], - [AC_LANG_CONFTEST([AC_LANG_PROGRAM([])]) - # Make sure it works both with $CC and with simple cc. - # Following AC_PROG_CC_C_O, we do the test twice because some - # compilers refuse to overwrite an existing .o file with -o, - # though they will create one. - am_cv_prog_cc_c_o=yes - for am_i in 1 2; do - if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \ - && test -f conftest2.$ac_objext; then - : OK - else - am_cv_prog_cc_c_o=no - break - fi - done - rm -f core conftest* - unset am_i]) -if test "$am_cv_prog_cc_c_o" != yes; then - # Losing compiler, so override with the script. - # FIXME: It is wrong to rewrite CC. - # But if we don't then we get into trouble of one sort or another. - # A longer-term fix would be to have automake use am__CC in this case, - # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)" - CC="$am_aux_dir/compile $CC" -fi -AC_LANG_POP([C])]) - -# For backward compatibility. -AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])]) - -# Copyright (C) 2001-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_RUN_LOG(COMMAND) -# ------------------- -# Run COMMAND, save the exit status in ac_status, and log it. -# (This has been adapted from Autoconf's _AC_RUN_LOG macro.) -AC_DEFUN([AM_RUN_LOG], -[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD - ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD - (exit $ac_status); }]) - -# Check to make sure that the build environment is sane. -*- Autoconf -*- - -# Copyright (C) 1996-2013 Free Software Foundation, Inc. 
-# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_SANITY_CHECK -# --------------- -AC_DEFUN([AM_SANITY_CHECK], -[AC_MSG_CHECKING([whether build environment is sane]) -# Reject unsafe characters in $srcdir or the absolute working directory -# name. Accept space and tab only in the latter. -am_lf=' -' -case `pwd` in - *[[\\\"\#\$\&\'\`$am_lf]]*) - AC_MSG_ERROR([unsafe absolute working directory name]);; -esac -case $srcdir in - *[[\\\"\#\$\&\'\`$am_lf\ \ ]]*) - AC_MSG_ERROR([unsafe srcdir value: '$srcdir']);; -esac - -# Do 'set' in a subshell so we don't clobber the current shell's -# arguments. Must try -L first in case configure is actually a -# symlink; some systems play weird games with the mod time of symlinks -# (eg FreeBSD returns the mod time of the symlink's containing -# directory). -if ( - am_has_slept=no - for am_try in 1 2; do - echo "timestamp, slept: $am_has_slept" > conftest.file - set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null` - if test "$[*]" = "X"; then - # -L didn't work. - set X `ls -t "$srcdir/configure" conftest.file` - fi - if test "$[*]" != "X $srcdir/configure conftest.file" \ - && test "$[*]" != "X conftest.file $srcdir/configure"; then - - # If neither matched, then we have a broken ls. This can happen - # if, for instance, CONFIG_SHELL is bash and it inherits a - # broken ls alias from the environment. This has actually - # happened. Such a system could not be considered "sane". - AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken - alias in your environment]) - fi - if test "$[2]" = conftest.file || test $am_try -eq 2; then - break - fi - # Just in case. - sleep 1 - am_has_slept=yes - done - test "$[2]" = conftest.file - ) -then - # Ok. - : -else - AC_MSG_ERROR([newly created file is older than distributed files! 
-Check your system clock]) -fi -AC_MSG_RESULT([yes]) -# If we didn't sleep, we still need to ensure time stamps of config.status and -# generated files are strictly newer. -am_sleep_pid= -if grep 'slept: no' conftest.file >/dev/null 2>&1; then - ( sleep 1 ) & - am_sleep_pid=$! -fi -AC_CONFIG_COMMANDS_PRE( - [AC_MSG_CHECKING([that generated files are newer than configure]) - if test -n "$am_sleep_pid"; then - # Hide warnings about reused PIDs. - wait $am_sleep_pid 2>/dev/null - fi - AC_MSG_RESULT([done])]) -rm -f conftest.file -]) - -# Copyright (C) 2009-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_SILENT_RULES([DEFAULT]) -# -------------------------- -# Enable less verbose build rules; with the default set to DEFAULT -# ("yes" being less verbose, "no" or empty being verbose). -AC_DEFUN([AM_SILENT_RULES], -[AC_ARG_ENABLE([silent-rules], [dnl -AS_HELP_STRING( - [--enable-silent-rules], - [less verbose build output (undo: "make V=1")]) -AS_HELP_STRING( - [--disable-silent-rules], - [verbose build output (undo: "make V=0")])dnl -]) -case $enable_silent_rules in @%:@ ((( - yes) AM_DEFAULT_VERBOSITY=0;; - no) AM_DEFAULT_VERBOSITY=1;; - *) AM_DEFAULT_VERBOSITY=m4_if([$1], [yes], [0], [1]);; -esac -dnl -dnl A few 'make' implementations (e.g., NonStop OS and NextStep) -dnl do not support nested variable expansions. -dnl See automake bug#9928 and bug#10237. 
-am_make=${MAKE-make} -AC_CACHE_CHECK([whether $am_make supports nested variables], - [am_cv_make_support_nested_variables], - [if AS_ECHO([['TRUE=$(BAR$(V)) -BAR0=false -BAR1=true -V=1 -am__doit: - @$(TRUE) -.PHONY: am__doit']]) | $am_make -f - >/dev/null 2>&1; then - am_cv_make_support_nested_variables=yes -else - am_cv_make_support_nested_variables=no -fi]) -if test $am_cv_make_support_nested_variables = yes; then - dnl Using '$V' instead of '$(V)' breaks IRIX make. - AM_V='$(V)' - AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)' -else - AM_V=$AM_DEFAULT_VERBOSITY - AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY -fi -AC_SUBST([AM_V])dnl -AM_SUBST_NOTMAKE([AM_V])dnl -AC_SUBST([AM_DEFAULT_V])dnl -AM_SUBST_NOTMAKE([AM_DEFAULT_V])dnl -AC_SUBST([AM_DEFAULT_VERBOSITY])dnl -AM_BACKSLASH='\' -AC_SUBST([AM_BACKSLASH])dnl -_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl -]) - -# Copyright (C) 2001-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_PROG_INSTALL_STRIP -# --------------------- -# One issue with vendor 'install' (even GNU) is that you can't -# specify the program used to strip binaries. This is especially -# annoying in cross-compiling environments, where the build's strip -# is unlikely to handle the host's binaries. -# Fortunately install-sh will honor a STRIPPROG variable, so we -# always use install-sh in "make install-strip", and initialize -# STRIPPROG with the value of the STRIP variable (set by the user). -AC_DEFUN([AM_PROG_INSTALL_STRIP], -[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl -# Installed binaries are usually stripped using 'strip' when the user -# run "make install-strip". However 'strip' might not be the right -# tool to use in cross-compilation environments, therefore Automake -# will honor the 'STRIP' environment variable to overrule this program. 
-dnl Don't test for $cross_compiling = yes, because it might be 'maybe'. -if test "$cross_compiling" != no; then - AC_CHECK_TOOL([STRIP], [strip], :) -fi -INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" -AC_SUBST([INSTALL_STRIP_PROGRAM])]) - -# Copyright (C) 2006-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_SUBST_NOTMAKE(VARIABLE) -# --------------------------- -# Prevent Automake from outputting VARIABLE = @VARIABLE@ in Makefile.in. -# This macro is traced by Automake. -AC_DEFUN([_AM_SUBST_NOTMAKE]) - -# AM_SUBST_NOTMAKE(VARIABLE) -# -------------------------- -# Public sister of _AM_SUBST_NOTMAKE. -AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) - -# Check how to create a tarball. -*- Autoconf -*- - -# Copyright (C) 2004-2013 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# _AM_PROG_TAR(FORMAT) -# -------------------- -# Check how to create a tarball in format FORMAT. -# FORMAT should be one of 'v7', 'ustar', or 'pax'. -# -# Substitute a variable $(am__tar) that is a command -# writing to stdout a FORMAT-tarball containing the directory -# $tardir. -# tardir=directory && $(am__tar) > result.tar -# -# Substitute a variable $(am__untar) that extract such -# a tarball read from stdin. -# $(am__untar) < result.tar -# -AC_DEFUN([_AM_PROG_TAR], -[# Always define AMTAR for backward compatibility. Yes, it's still used -# in the wild :-( We should find a proper way to deprecate it ... -AC_SUBST([AMTAR], ['$${TAR-tar}']) - -# We'll loop over all known methods to create a tar archive until one works. 
-_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none' - -m4_if([$1], [v7], - [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'], - - [m4_case([$1], - [ustar], - [# The POSIX 1988 'ustar' format is defined with fixed-size fields. - # There is notably a 21 bits limit for the UID and the GID. In fact, - # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343 - # and bug#13588). - am_max_uid=2097151 # 2^21 - 1 - am_max_gid=$am_max_uid - # The $UID and $GID variables are not portable, so we need to resort - # to the POSIX-mandated id(1) utility. Errors in the 'id' calls - # below are definitely unexpected, so allow the users to see them - # (that is, avoid stderr redirection). - am_uid=`id -u || echo unknown` - am_gid=`id -g || echo unknown` - AC_MSG_CHECKING([whether UID '$am_uid' is supported by ustar format]) - if test $am_uid -le $am_max_uid; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - _am_tools=none - fi - AC_MSG_CHECKING([whether GID '$am_gid' is supported by ustar format]) - if test $am_gid -le $am_max_gid; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - _am_tools=none - fi], - - [pax], - [], - - [m4_fatal([Unknown tar format])]) - - AC_MSG_CHECKING([how to create a $1 tar archive]) - - # Go ahead even if we have the value already cached. We do so because we - # need to set the values for the 'am__tar' and 'am__untar' variables. - _am_tools=${am_cv_prog_tar_$1-$_am_tools} - - for _am_tool in $_am_tools; do - case $_am_tool in - gnutar) - for _am_tar in tar gnutar gtar; do - AM_RUN_LOG([$_am_tar --version]) && break - done - am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"' - am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"' - am__untar="$_am_tar -xf -" - ;; - plaintar) - # Must skip GNU tar: if it does not support --format= it doesn't create - # ustar tarball either. 
- (tar --version) >/dev/null 2>&1 && continue - am__tar='tar chf - "$$tardir"' - am__tar_='tar chf - "$tardir"' - am__untar='tar xf -' - ;; - pax) - am__tar='pax -L -x $1 -w "$$tardir"' - am__tar_='pax -L -x $1 -w "$tardir"' - am__untar='pax -r' - ;; - cpio) - am__tar='find "$$tardir" -print | cpio -o -H $1 -L' - am__tar_='find "$tardir" -print | cpio -o -H $1 -L' - am__untar='cpio -i -H $1 -d' - ;; - none) - am__tar=false - am__tar_=false - am__untar=false - ;; - esac - - # If the value was cached, stop now. We just wanted to have am__tar - # and am__untar set. - test -n "${am_cv_prog_tar_$1}" && break - - # tar/untar a dummy directory, and stop if the command works. - rm -rf conftest.dir - mkdir conftest.dir - echo GrepMe > conftest.dir/file - AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar]) - rm -rf conftest.dir - if test -s conftest.tar; then - AM_RUN_LOG([$am__untar /dev/null 2>&1 && break - fi - done - rm -rf conftest.dir - - AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool]) - AC_MSG_RESULT([$am_cv_prog_tar_$1])]) - -AC_SUBST([am__tar]) -AC_SUBST([am__untar]) -]) # _AM_PROG_TAR - +# generated automatically by aclocal 1.14.1 -*- Autoconf -*- + +# Copyright (C) 1996-2013 Free Software Foundation, Inc. + +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. 
+ +m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])]) +m4_ifndef([AC_AUTOCONF_VERSION], + [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl +m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],, +[m4_warning([this file was generated for autoconf 2.69. +You have another version of autoconf. It may work, but is not guaranteed to. +If you have problems, you may need to regenerate the build system entirely. +To do so, use the procedure documented by the package, typically 'autoreconf'.])]) + +# Copyright (C) 2002-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_AUTOMAKE_VERSION(VERSION) +# ---------------------------- +# Automake X.Y traces this macro to ensure aclocal.m4 has been +# generated from the m4 files accompanying Automake X.Y. +# (This private macro should not be called outside this file.) +AC_DEFUN([AM_AUTOMAKE_VERSION], +[am__api_version='1.14' +dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to +dnl require some minimum version. Point them to the right macro. +m4_if([$1], [1.14.1], [], + [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl +]) + +# _AM_AUTOCONF_VERSION(VERSION) +# ----------------------------- +# aclocal traces this macro to find the Autoconf version. +# This is a private macro too. Using m4_define simplifies +# the logic in aclocal, which can simply ignore this definition. +m4_define([_AM_AUTOCONF_VERSION], []) + +# AM_SET_CURRENT_AUTOMAKE_VERSION +# ------------------------------- +# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. +# This function is AC_REQUIREd by AM_INIT_AUTOMAKE. 
+AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], +[AM_AUTOMAKE_VERSION([1.14.1])dnl +m4_ifndef([AC_AUTOCONF_VERSION], + [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl +_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) + +# AM_AUX_DIR_EXPAND -*- Autoconf -*- + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets +# $ac_aux_dir to '$srcdir/foo'. In other projects, it is set to +# '$srcdir', '$srcdir/..', or '$srcdir/../..'. +# +# Of course, Automake must honor this variable whenever it calls a +# tool from the auxiliary directory. The problem is that $srcdir (and +# therefore $ac_aux_dir as well) can be either absolute or relative, +# depending on how configure is run. This is pretty annoying, since +# it makes $ac_aux_dir quite unusable in subdirectories: in the top +# source directory, any form will work fine, but in subdirectories a +# relative path needs to be adjusted first. +# +# $ac_aux_dir/missing +# fails when called from a subdirectory if $ac_aux_dir is relative +# $top_srcdir/$ac_aux_dir/missing +# fails if $ac_aux_dir is absolute, +# fails when called from a subdirectory in a VPATH build with +# a relative $ac_aux_dir +# +# The reason of the latter failure is that $top_srcdir and $ac_aux_dir +# are both prefixed by $srcdir. In an in-source build this is usually +# harmless because $srcdir is '.', but things will broke when you +# start a VPATH build or use an absolute $srcdir. +# +# So we could use something similar to $top_srcdir/$ac_aux_dir/missing, +# iff we strip the leading $srcdir from $ac_aux_dir. 
That would be: +# am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"` +# and then we would define $MISSING as +# MISSING="\${SHELL} $am_aux_dir/missing" +# This will work as long as MISSING is not called from configure, because +# unfortunately $(top_srcdir) has no meaning in configure. +# However there are other variables, like CC, which are often used in +# configure, and could therefore not use this "fixed" $ac_aux_dir. +# +# Another solution, used here, is to always expand $ac_aux_dir to an +# absolute PATH. The drawback is that using absolute paths prevent a +# configured tree to be moved without reconfiguration. + +AC_DEFUN([AM_AUX_DIR_EXPAND], +[dnl Rely on autoconf to set up CDPATH properly. +AC_PREREQ([2.50])dnl +# expand $ac_aux_dir to an absolute path +am_aux_dir=`cd $ac_aux_dir && pwd` +]) + +# AM_CONDITIONAL -*- Autoconf -*- + +# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_CONDITIONAL(NAME, SHELL-CONDITION) +# ------------------------------------- +# Define a conditional. +AC_DEFUN([AM_CONDITIONAL], +[AC_PREREQ([2.52])dnl + m4_if([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])], + [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl +AC_SUBST([$1_TRUE])dnl +AC_SUBST([$1_FALSE])dnl +_AM_SUBST_NOTMAKE([$1_TRUE])dnl +_AM_SUBST_NOTMAKE([$1_FALSE])dnl +m4_define([_AM_COND_VALUE_$1], [$2])dnl +if $2; then + $1_TRUE= + $1_FALSE='#' +else + $1_TRUE='#' + $1_FALSE= +fi +AC_CONFIG_COMMANDS_PRE( +[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then + AC_MSG_ERROR([[conditional "$1" was never defined. +Usually this means the macro was only invoked conditionally.]]) +fi])]) + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. 
+# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + + +# There are a few dirty hacks below to avoid letting 'AC_PROG_CC' be +# written in clear, in which case automake, when reading aclocal.m4, +# will think it sees a *use*, and therefore will trigger all it's +# C support machinery. Also note that it means that autoscan, seeing +# CC etc. in the Makefile, will ask for an AC_PROG_CC use... + + +# _AM_DEPENDENCIES(NAME) +# ---------------------- +# See how the compiler implements dependency checking. +# NAME is "CC", "CXX", "OBJC", "OBJCXX", "UPC", or "GJC". +# We try a few techniques and use that to set a single cache variable. +# +# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was +# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular +# dependency, and given that the user is not expected to run this macro, +# just rely on AC_PROG_CC. +AC_DEFUN([_AM_DEPENDENCIES], +[AC_REQUIRE([AM_SET_DEPDIR])dnl +AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl +AC_REQUIRE([AM_MAKE_INCLUDE])dnl +AC_REQUIRE([AM_DEP_TRACK])dnl + +m4_if([$1], [CC], [depcc="$CC" am_compiler_list=], + [$1], [CXX], [depcc="$CXX" am_compiler_list=], + [$1], [OBJC], [depcc="$OBJC" am_compiler_list='gcc3 gcc'], + [$1], [OBJCXX], [depcc="$OBJCXX" am_compiler_list='gcc3 gcc'], + [$1], [UPC], [depcc="$UPC" am_compiler_list=], + [$1], [GCJ], [depcc="$GCJ" am_compiler_list='gcc3 gcc'], + [depcc="$$1" am_compiler_list=]) + +AC_CACHE_CHECK([dependency style of $depcc], + [am_cv_$1_dependencies_compiler_type], +[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then + # We make a subdir and do the tests there. Otherwise we can end up + # making bogus files that we don't know about and never remove. 
For + # instance it was reported that on HP-UX the gcc test will end up + # making a dummy file named 'D' -- because '-MD' means "put the output + # in D". + rm -rf conftest.dir + mkdir conftest.dir + # Copy depcomp to subdir because otherwise we won't find it if we're + # using a relative directory. + cp "$am_depcomp" conftest.dir + cd conftest.dir + # We will build objects and dependencies in a subdirectory because + # it helps to detect inapplicable dependency modes. For instance + # both Tru64's cc and ICC support -MD to output dependencies as a + # side effect of compilation, but ICC will put the dependencies in + # the current directory while Tru64 will put them in the object + # directory. + mkdir sub + + am_cv_$1_dependencies_compiler_type=none + if test "$am_compiler_list" = ""; then + am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp` + fi + am__universal=false + m4_case([$1], [CC], + [case " $depcc " in #( + *\ -arch\ *\ -arch\ *) am__universal=true ;; + esac], + [CXX], + [case " $depcc " in #( + *\ -arch\ *\ -arch\ *) am__universal=true ;; + esac]) + + for depmode in $am_compiler_list; do + # Setup a source with many dependencies, because some compilers + # like to wrap large dependency lists on column 80 (with \), and + # we should not choose a depcomp mode which is confused by this. + # + # We need to recreate these files for each test, as the compiler may + # overwrite some of them when testing with obscure command lines. + # This happens at least with the AIX C compiler. + : > sub/conftest.c + for i in 1 2 3 4 5 6; do + echo '#include "conftst'$i'.h"' >> sub/conftest.c + # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with + # Solaris 10 /bin/sh. + echo '/* dummy */' > sub/conftst$i.h + done + echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf + + # We check with '-c' and '-o' for the sake of the "dashmstdout" + # mode. 
It turns out that the SunPro C++ compiler does not properly + # handle '-M -o', and we need to detect this. Also, some Intel + # versions had trouble with output in subdirs. + am__obj=sub/conftest.${OBJEXT-o} + am__minus_obj="-o $am__obj" + case $depmode in + gcc) + # This depmode causes a compiler race in universal mode. + test "$am__universal" = false || continue + ;; + nosideeffect) + # After this tag, mechanisms are not by side-effect, so they'll + # only be used when explicitly requested. + if test "x$enable_dependency_tracking" = xyes; then + continue + else + break + fi + ;; + msvc7 | msvc7msys | msvisualcpp | msvcmsys) + # This compiler won't grok '-c -o', but also, the minuso test has + # not run yet. These depmodes are late enough in the game, and + # so weak that their functioning should not be impacted. + am__obj=conftest.${OBJEXT-o} + am__minus_obj= + ;; + none) break ;; + esac + if depmode=$depmode \ + source=sub/conftest.c object=$am__obj \ + depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ + $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ + >/dev/null 2>conftest.err && + grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && + grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && + grep $am__obj sub/conftest.Po > /dev/null 2>&1 && + ${MAKE-make} -s -f confmf > /dev/null 2>&1; then + # icc doesn't choke on unknown options, it will just issue warnings + # or remarks (even with -Werror). So we grep stderr for any message + # that says an option was ignored or not supported. + # When given -MP, icc 7.0 and 7.1 complain thusly: + # icc: Command line warning: ignoring option '-M'; no argument required + # The diagnosis changed in icc 8.0: + # icc: Command line remark: option '-MP' not supported + if (grep 'ignoring option' conftest.err || + grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else + am_cv_$1_dependencies_compiler_type=$depmode + break + fi + fi + done + + cd .. 
+ rm -rf conftest.dir +else + am_cv_$1_dependencies_compiler_type=none +fi +]) +AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type]) +AM_CONDITIONAL([am__fastdep$1], [ + test "x$enable_dependency_tracking" != xno \ + && test "$am_cv_$1_dependencies_compiler_type" = gcc3]) +]) + + +# AM_SET_DEPDIR +# ------------- +# Choose a directory name for dependency files. +# This macro is AC_REQUIREd in _AM_DEPENDENCIES. +AC_DEFUN([AM_SET_DEPDIR], +[AC_REQUIRE([AM_SET_LEADING_DOT])dnl +AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl +]) + + +# AM_DEP_TRACK +# ------------ +AC_DEFUN([AM_DEP_TRACK], +[AC_ARG_ENABLE([dependency-tracking], [dnl +AS_HELP_STRING( + [--enable-dependency-tracking], + [do not reject slow dependency extractors]) +AS_HELP_STRING( + [--disable-dependency-tracking], + [speeds up one-time build])]) +if test "x$enable_dependency_tracking" != xno; then + am_depcomp="$ac_aux_dir/depcomp" + AMDEPBACKSLASH='\' + am__nodep='_no' +fi +AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno]) +AC_SUBST([AMDEPBACKSLASH])dnl +_AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl +AC_SUBST([am__nodep])dnl +_AM_SUBST_NOTMAKE([am__nodep])dnl +]) + +# Generate code to set up dependency tracking. -*- Autoconf -*- + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + + +# _AM_OUTPUT_DEPENDENCY_COMMANDS +# ------------------------------ +AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], +[{ + # Older Autoconf quotes --file arguments for eval, but not when files + # are listed without --file. Let's play safe and only enable the eval + # if we detect the quoting. + case $CONFIG_FILES in + *\'*) eval set x "$CONFIG_FILES" ;; + *) set x $CONFIG_FILES ;; + esac + shift + for mf + do + # Strip MF so we end up with the name of the file. 
+ mf=`echo "$mf" | sed -e 's/:.*$//'` + # Check whether this is an Automake generated Makefile or not. + # We used to match only the files named 'Makefile.in', but + # some people rename them; so instead we look at the file content. + # Grep'ing the first line is not enough: some people post-process + # each Makefile.in and add a new line on top of each file to say so. + # Grep'ing the whole file is not good either: AIX grep has a line + # limit of 2048, but all sed's we know have understand at least 4000. + if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then + dirpart=`AS_DIRNAME("$mf")` + else + continue + fi + # Extract the definition of DEPDIR, am__include, and am__quote + # from the Makefile without running 'make'. + DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` + test -z "$DEPDIR" && continue + am__include=`sed -n 's/^am__include = //p' < "$mf"` + test -z "$am__include" && continue + am__quote=`sed -n 's/^am__quote = //p' < "$mf"` + # Find all dependency output files, they are included files with + # $(DEPDIR) in their names. We invoke sed twice because it is the + # simplest approach to changing $(DEPDIR) to its actual value in the + # expansion. + for file in `sed -n " + s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ + sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g'`; do + # Make sure the directory exists. + test -f "$dirpart/$file" && continue + fdir=`AS_DIRNAME(["$file"])` + AS_MKDIR_P([$dirpart/$fdir]) + # echo "creating $dirpart/$file" + echo '# dummy' > "$dirpart/$file" + done + done +} +])# _AM_OUTPUT_DEPENDENCY_COMMANDS + + +# AM_OUTPUT_DEPENDENCY_COMMANDS +# ----------------------------- +# This macro should only be invoked once -- use via AC_REQUIRE. +# +# This code is only required when automatic dependency tracking +# is enabled. FIXME. This creates each '.P' file that we will +# need in order to bootstrap the dependency handling code. 
+AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], +[AC_CONFIG_COMMANDS([depfiles], + [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], + [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"]) +]) + +# Do all the work for Automake. -*- Autoconf -*- + +# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This macro actually does too much. Some checks are only needed if +# your package does certain things. But this isn't really a big deal. + +dnl Redefine AC_PROG_CC to automatically invoke _AM_PROG_CC_C_O. +m4_define([AC_PROG_CC], +m4_defn([AC_PROG_CC]) +[_AM_PROG_CC_C_O +]) + +# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE]) +# AM_INIT_AUTOMAKE([OPTIONS]) +# ----------------------------------------------- +# The call with PACKAGE and VERSION arguments is the old style +# call (pre autoconf-2.50), which is being phased out. PACKAGE +# and VERSION should now be passed to AC_INIT and removed from +# the call to AM_INIT_AUTOMAKE. +# We support both call styles for the transition. After +# the next Automake release, Autoconf can make the AC_INIT +# arguments mandatory, and then we can depend on a new Autoconf +# release and drop the old call support. +AC_DEFUN([AM_INIT_AUTOMAKE], +[AC_PREREQ([2.65])dnl +dnl Autoconf wants to disallow AM_ names. We explicitly allow +dnl the ones we care about. +m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl +AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl +AC_REQUIRE([AC_PROG_INSTALL])dnl +if test "`cd $srcdir && pwd`" != "`pwd`"; then + # Use -I$(srcdir) only when $(srcdir) != ., so that make's output + # is not polluted with repeated "-I." 
+ AC_SUBST([am__isrc], [' -I$(srcdir)'])_AM_SUBST_NOTMAKE([am__isrc])dnl + # test to see if srcdir already configured + if test -f $srcdir/config.status; then + AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) + fi +fi + +# test whether we have cygpath +if test -z "$CYGPATH_W"; then + if (cygpath --version) >/dev/null 2>/dev/null; then + CYGPATH_W='cygpath -w' + else + CYGPATH_W=echo + fi +fi +AC_SUBST([CYGPATH_W]) + +# Define the identity of the package. +dnl Distinguish between old-style and new-style calls. +m4_ifval([$2], +[AC_DIAGNOSE([obsolete], + [$0: two- and three-arguments forms are deprecated.]) +m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl + AC_SUBST([PACKAGE], [$1])dnl + AC_SUBST([VERSION], [$2])], +[_AM_SET_OPTIONS([$1])dnl +dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT. +m4_if( + m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]), + [ok:ok],, + [m4_fatal([AC_INIT should be called with package and version arguments])])dnl + AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl + AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl + +_AM_IF_OPTION([no-define],, +[AC_DEFINE_UNQUOTED([PACKAGE], ["$PACKAGE"], [Name of package]) + AC_DEFINE_UNQUOTED([VERSION], ["$VERSION"], [Version number of package])])dnl + +# Some tools Automake needs. +AC_REQUIRE([AM_SANITY_CHECK])dnl +AC_REQUIRE([AC_ARG_PROGRAM])dnl +AM_MISSING_PROG([ACLOCAL], [aclocal-${am__api_version}]) +AM_MISSING_PROG([AUTOCONF], [autoconf]) +AM_MISSING_PROG([AUTOMAKE], [automake-${am__api_version}]) +AM_MISSING_PROG([AUTOHEADER], [autoheader]) +AM_MISSING_PROG([MAKEINFO], [makeinfo]) +AC_REQUIRE([AM_PROG_INSTALL_SH])dnl +AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl +AC_REQUIRE([AC_PROG_MKDIR_P])dnl +# For better backward compatibility. To be removed once Automake 1.9.x +# dies out for good. For more background, see: +# +# +AC_SUBST([mkdir_p], ['$(MKDIR_P)']) +# We need awk for the "check" target. 
The system "awk" is bad on +# some platforms. +AC_REQUIRE([AC_PROG_AWK])dnl +AC_REQUIRE([AC_PROG_MAKE_SET])dnl +AC_REQUIRE([AM_SET_LEADING_DOT])dnl +_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])], + [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])], + [_AM_PROG_TAR([v7])])]) +_AM_IF_OPTION([no-dependencies],, +[AC_PROVIDE_IFELSE([AC_PROG_CC], + [_AM_DEPENDENCIES([CC])], + [m4_define([AC_PROG_CC], + m4_defn([AC_PROG_CC])[_AM_DEPENDENCIES([CC])])])dnl +AC_PROVIDE_IFELSE([AC_PROG_CXX], + [_AM_DEPENDENCIES([CXX])], + [m4_define([AC_PROG_CXX], + m4_defn([AC_PROG_CXX])[_AM_DEPENDENCIES([CXX])])])dnl +AC_PROVIDE_IFELSE([AC_PROG_OBJC], + [_AM_DEPENDENCIES([OBJC])], + [m4_define([AC_PROG_OBJC], + m4_defn([AC_PROG_OBJC])[_AM_DEPENDENCIES([OBJC])])])dnl +AC_PROVIDE_IFELSE([AC_PROG_OBJCXX], + [_AM_DEPENDENCIES([OBJCXX])], + [m4_define([AC_PROG_OBJCXX], + m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl +]) +AC_REQUIRE([AM_SILENT_RULES])dnl +dnl The testsuite driver may need to know about EXEEXT, so add the +dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This +dnl macro is hooked onto _AC_COMPILER_EXEEXT early, see below. +AC_CONFIG_COMMANDS_PRE(dnl +[m4_provide_if([_AM_COMPILER_EXEEXT], + [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl + +# POSIX will say in a future version that running "rm -f" with no argument +# is OK; and we want to be able to make that assumption in our Makefile +# recipes. So use an aggressive probe to check that the usage we want is +# actually supported "in the wild" to an acceptable degree. +# See automake bug#10828. +# To make any issue more visible, cause the running configure to be aborted +# by default if the 'rm' program in use doesn't match our expectations; the +# user can still override this though. +if rm -f && rm -fr && rm -rf; then : OK; else + cat >&2 <<'END' +Oops! + +Your 'rm' program seems unable to run without file operands specified +on the command line, even when the '-f' option is present. 
This is contrary +to the behaviour of most rm programs out there, and not conforming with +the upcoming POSIX standard: + +Please tell bug-automake@gnu.org about your system, including the value +of your $PATH and any error possibly output before this message. This +can help us improve future automake versions. + +END + if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then + echo 'Configuration will proceed anyway, since you have set the' >&2 + echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2 + echo >&2 + else + cat >&2 <<'END' +Aborting the configuration process, to ensure you take notice of the issue. + +You can download and install GNU coreutils to get an 'rm' implementation +that behaves properly: . + +If you want to complete the configuration process using your problematic +'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM +to "yes", and re-run configure. + +END + AC_MSG_ERROR([Your 'rm' program is bad, sorry.]) + fi +fi]) + +dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not +dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further +dnl mangled by Autoconf and run in a shell conditional statement. +m4_define([_AC_COMPILER_EXEEXT], +m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])]) + +# When config.status generates a header, we must update the stamp-h file. +# This file resides in the same directory as the config header +# that is generated. The stamp files are numbered to have different names. + +# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the +# loop where config.status creates the headers, so we can generate +# our stamp files there. +AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK], +[# Compute $1's index in $config_headers. 
+_am_arg=$1 +_am_stamp_count=1 +for _am_header in $config_headers :; do + case $_am_header in + $_am_arg | $_am_arg:* ) + break ;; + * ) + _am_stamp_count=`expr $_am_stamp_count + 1` ;; + esac +done +echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_PROG_INSTALL_SH +# ------------------ +# Define $install_sh. +AC_DEFUN([AM_PROG_INSTALL_SH], +[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl +if test x"${install_sh}" != xset; then + case $am_aux_dir in + *\ * | *\ *) + install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; + *) + install_sh="\${SHELL} $am_aux_dir/install-sh" + esac +fi +AC_SUBST([install_sh])]) + +# Copyright (C) 2003-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# Check whether the underlying file-system supports filenames +# with a leading dot. For instance MS-DOS doesn't. +AC_DEFUN([AM_SET_LEADING_DOT], +[rm -rf .tst 2>/dev/null +mkdir .tst 2>/dev/null +if test -d .tst; then + am__leading_dot=. +else + am__leading_dot=_ +fi +rmdir .tst 2>/dev/null +AC_SUBST([am__leading_dot])]) + +# Check to see how 'make' treats includes. -*- Autoconf -*- + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_MAKE_INCLUDE() +# ----------------- +# Check to see how make treats includes. 
+AC_DEFUN([AM_MAKE_INCLUDE], +[am_make=${MAKE-make} +cat > confinc << 'END' +am__doit: + @echo this is the am__doit target +.PHONY: am__doit +END +# If we don't find an include directive, just comment out the code. +AC_MSG_CHECKING([for style of include used by $am_make]) +am__include="#" +am__quote= +_am_result=none +# First try GNU make style include. +echo "include confinc" > confmf +# Ignore all kinds of additional output from 'make'. +case `$am_make -s -f confmf 2> /dev/null` in #( +*the\ am__doit\ target*) + am__include=include + am__quote= + _am_result=GNU + ;; +esac +# Now try BSD make style include. +if test "$am__include" = "#"; then + echo '.include "confinc"' > confmf + case `$am_make -s -f confmf 2> /dev/null` in #( + *the\ am__doit\ target*) + am__include=.include + am__quote="\"" + _am_result=BSD + ;; + esac +fi +AC_SUBST([am__include]) +AC_SUBST([am__quote]) +AC_MSG_RESULT([$_am_result]) +rm -f confinc confmf +]) + +# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- + +# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_MISSING_PROG(NAME, PROGRAM) +# ------------------------------ +AC_DEFUN([AM_MISSING_PROG], +[AC_REQUIRE([AM_MISSING_HAS_RUN]) +$1=${$1-"${am_missing_run}$2"} +AC_SUBST($1)]) + +# AM_MISSING_HAS_RUN +# ------------------ +# Define MISSING if not defined so far and test if it is modern enough. +# If it is, set am_missing_run to use it, otherwise, to nothing. 
+AC_DEFUN([AM_MISSING_HAS_RUN], +[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl +AC_REQUIRE_AUX_FILE([missing])dnl +if test x"${MISSING+set}" != xset; then + case $am_aux_dir in + *\ * | *\ *) + MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; + *) + MISSING="\${SHELL} $am_aux_dir/missing" ;; + esac +fi +# Use eval to expand $SHELL +if eval "$MISSING --is-lightweight"; then + am_missing_run="$MISSING " +else + am_missing_run= + AC_MSG_WARN(['missing' script is too old or missing]) +fi +]) + +# Helper functions for option handling. -*- Autoconf -*- + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# _AM_MANGLE_OPTION(NAME) +# ----------------------- +AC_DEFUN([_AM_MANGLE_OPTION], +[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])]) + +# _AM_SET_OPTION(NAME) +# -------------------- +# Set option NAME. Presently that only means defining a flag for this option. +AC_DEFUN([_AM_SET_OPTION], +[m4_define(_AM_MANGLE_OPTION([$1]), [1])]) + +# _AM_SET_OPTIONS(OPTIONS) +# ------------------------ +# OPTIONS is a space-separated list of Automake options. +AC_DEFUN([_AM_SET_OPTIONS], +[m4_foreach_w([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])]) + +# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET]) +# ------------------------------------------- +# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise. +AC_DEFUN([_AM_IF_OPTION], +[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# _AM_PROG_CC_C_O +# --------------- +# Like AC_PROG_CC_C_O, but changed for automake. We rewrite AC_PROG_CC +# to automatically call this. 
+AC_DEFUN([_AM_PROG_CC_C_O], +[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl +AC_REQUIRE_AUX_FILE([compile])dnl +AC_LANG_PUSH([C])dnl +AC_CACHE_CHECK( + [whether $CC understands -c and -o together], + [am_cv_prog_cc_c_o], + [AC_LANG_CONFTEST([AC_LANG_PROGRAM([])]) + # Make sure it works both with $CC and with simple cc. + # Following AC_PROG_CC_C_O, we do the test twice because some + # compilers refuse to overwrite an existing .o file with -o, + # though they will create one. + am_cv_prog_cc_c_o=yes + for am_i in 1 2; do + if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \ + && test -f conftest2.$ac_objext; then + : OK + else + am_cv_prog_cc_c_o=no + break + fi + done + rm -f core conftest* + unset am_i]) +if test "$am_cv_prog_cc_c_o" != yes; then + # Losing compiler, so override with the script. + # FIXME: It is wrong to rewrite CC. + # But if we don't then we get into trouble of one sort or another. + # A longer-term fix would be to have automake use am__CC in this case, + # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)" + CC="$am_aux_dir/compile $CC" +fi +AC_LANG_POP([C])]) + +# For backward compatibility. +AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])]) + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_RUN_LOG(COMMAND) +# ------------------- +# Run COMMAND, save the exit status in ac_status, and log it. +# (This has been adapted from Autoconf's _AC_RUN_LOG macro.) +AC_DEFUN([AM_RUN_LOG], +[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD + ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD + (exit $ac_status); }]) + +# Check to make sure that the build environment is sane. -*- Autoconf -*- + +# Copyright (C) 1996-2013 Free Software Foundation, Inc. 
+# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_SANITY_CHECK +# --------------- +AC_DEFUN([AM_SANITY_CHECK], +[AC_MSG_CHECKING([whether build environment is sane]) +# Reject unsafe characters in $srcdir or the absolute working directory +# name. Accept space and tab only in the latter. +am_lf=' +' +case `pwd` in + *[[\\\"\#\$\&\'\`$am_lf]]*) + AC_MSG_ERROR([unsafe absolute working directory name]);; +esac +case $srcdir in + *[[\\\"\#\$\&\'\`$am_lf\ \ ]]*) + AC_MSG_ERROR([unsafe srcdir value: '$srcdir']);; +esac + +# Do 'set' in a subshell so we don't clobber the current shell's +# arguments. Must try -L first in case configure is actually a +# symlink; some systems play weird games with the mod time of symlinks +# (eg FreeBSD returns the mod time of the symlink's containing +# directory). +if ( + am_has_slept=no + for am_try in 1 2; do + echo "timestamp, slept: $am_has_slept" > conftest.file + set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null` + if test "$[*]" = "X"; then + # -L didn't work. + set X `ls -t "$srcdir/configure" conftest.file` + fi + if test "$[*]" != "X $srcdir/configure conftest.file" \ + && test "$[*]" != "X conftest.file $srcdir/configure"; then + + # If neither matched, then we have a broken ls. This can happen + # if, for instance, CONFIG_SHELL is bash and it inherits a + # broken ls alias from the environment. This has actually + # happened. Such a system could not be considered "sane". + AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken + alias in your environment]) + fi + if test "$[2]" = conftest.file || test $am_try -eq 2; then + break + fi + # Just in case. + sleep 1 + am_has_slept=yes + done + test "$[2]" = conftest.file + ) +then + # Ok. + : +else + AC_MSG_ERROR([newly created file is older than distributed files! 
+Check your system clock]) +fi +AC_MSG_RESULT([yes]) +# If we didn't sleep, we still need to ensure time stamps of config.status and +# generated files are strictly newer. +am_sleep_pid= +if grep 'slept: no' conftest.file >/dev/null 2>&1; then + ( sleep 1 ) & + am_sleep_pid=$! +fi +AC_CONFIG_COMMANDS_PRE( + [AC_MSG_CHECKING([that generated files are newer than configure]) + if test -n "$am_sleep_pid"; then + # Hide warnings about reused PIDs. + wait $am_sleep_pid 2>/dev/null + fi + AC_MSG_RESULT([done])]) +rm -f conftest.file +]) + +# Copyright (C) 2009-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_SILENT_RULES([DEFAULT]) +# -------------------------- +# Enable less verbose build rules; with the default set to DEFAULT +# ("yes" being less verbose, "no" or empty being verbose). +AC_DEFUN([AM_SILENT_RULES], +[AC_ARG_ENABLE([silent-rules], [dnl +AS_HELP_STRING( + [--enable-silent-rules], + [less verbose build output (undo: "make V=1")]) +AS_HELP_STRING( + [--disable-silent-rules], + [verbose build output (undo: "make V=0")])dnl +]) +case $enable_silent_rules in @%:@ ((( + yes) AM_DEFAULT_VERBOSITY=0;; + no) AM_DEFAULT_VERBOSITY=1;; + *) AM_DEFAULT_VERBOSITY=m4_if([$1], [yes], [0], [1]);; +esac +dnl +dnl A few 'make' implementations (e.g., NonStop OS and NextStep) +dnl do not support nested variable expansions. +dnl See automake bug#9928 and bug#10237. 
+am_make=${MAKE-make} +AC_CACHE_CHECK([whether $am_make supports nested variables], + [am_cv_make_support_nested_variables], + [if AS_ECHO([['TRUE=$(BAR$(V)) +BAR0=false +BAR1=true +V=1 +am__doit: + @$(TRUE) +.PHONY: am__doit']]) | $am_make -f - >/dev/null 2>&1; then + am_cv_make_support_nested_variables=yes +else + am_cv_make_support_nested_variables=no +fi]) +if test $am_cv_make_support_nested_variables = yes; then + dnl Using '$V' instead of '$(V)' breaks IRIX make. + AM_V='$(V)' + AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)' +else + AM_V=$AM_DEFAULT_VERBOSITY + AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY +fi +AC_SUBST([AM_V])dnl +AM_SUBST_NOTMAKE([AM_V])dnl +AC_SUBST([AM_DEFAULT_V])dnl +AM_SUBST_NOTMAKE([AM_DEFAULT_V])dnl +AC_SUBST([AM_DEFAULT_VERBOSITY])dnl +AM_BACKSLASH='\' +AC_SUBST([AM_BACKSLASH])dnl +_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl +]) + +# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# AM_PROG_INSTALL_STRIP +# --------------------- +# One issue with vendor 'install' (even GNU) is that you can't +# specify the program used to strip binaries. This is especially +# annoying in cross-compiling environments, where the build's strip +# is unlikely to handle the host's binaries. +# Fortunately install-sh will honor a STRIPPROG variable, so we +# always use install-sh in "make install-strip", and initialize +# STRIPPROG with the value of the STRIP variable (set by the user). +AC_DEFUN([AM_PROG_INSTALL_STRIP], +[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl +# Installed binaries are usually stripped using 'strip' when the user +# run "make install-strip". However 'strip' might not be the right +# tool to use in cross-compilation environments, therefore Automake +# will honor the 'STRIP' environment variable to overrule this program. 
+dnl Don't test for $cross_compiling = yes, because it might be 'maybe'. +if test "$cross_compiling" != no; then + AC_CHECK_TOOL([STRIP], [strip], :) +fi +INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" +AC_SUBST([INSTALL_STRIP_PROGRAM])]) + +# Copyright (C) 2006-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# _AM_SUBST_NOTMAKE(VARIABLE) +# --------------------------- +# Prevent Automake from outputting VARIABLE = @VARIABLE@ in Makefile.in. +# This macro is traced by Automake. +AC_DEFUN([_AM_SUBST_NOTMAKE]) + +# AM_SUBST_NOTMAKE(VARIABLE) +# -------------------------- +# Public sister of _AM_SUBST_NOTMAKE. +AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) + +# Check how to create a tarball. -*- Autoconf -*- + +# Copyright (C) 2004-2013 Free Software Foundation, Inc. +# +# This file is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# _AM_PROG_TAR(FORMAT) +# -------------------- +# Check how to create a tarball in format FORMAT. +# FORMAT should be one of 'v7', 'ustar', or 'pax'. +# +# Substitute a variable $(am__tar) that is a command +# writing to stdout a FORMAT-tarball containing the directory +# $tardir. +# tardir=directory && $(am__tar) > result.tar +# +# Substitute a variable $(am__untar) that extract such +# a tarball read from stdin. +# $(am__untar) < result.tar +# +AC_DEFUN([_AM_PROG_TAR], +[# Always define AMTAR for backward compatibility. Yes, it's still used +# in the wild :-( We should find a proper way to deprecate it ... +AC_SUBST([AMTAR], ['$${TAR-tar}']) + +# We'll loop over all known methods to create a tar archive until one works. 
+_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none' + +m4_if([$1], [v7], + [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'], + + [m4_case([$1], + [ustar], + [# The POSIX 1988 'ustar' format is defined with fixed-size fields. + # There is notably a 21 bits limit for the UID and the GID. In fact, + # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343 + # and bug#13588). + am_max_uid=2097151 # 2^21 - 1 + am_max_gid=$am_max_uid + # The $UID and $GID variables are not portable, so we need to resort + # to the POSIX-mandated id(1) utility. Errors in the 'id' calls + # below are definitely unexpected, so allow the users to see them + # (that is, avoid stderr redirection). + am_uid=`id -u || echo unknown` + am_gid=`id -g || echo unknown` + AC_MSG_CHECKING([whether UID '$am_uid' is supported by ustar format]) + if test $am_uid -le $am_max_uid; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + _am_tools=none + fi + AC_MSG_CHECKING([whether GID '$am_gid' is supported by ustar format]) + if test $am_gid -le $am_max_gid; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + _am_tools=none + fi], + + [pax], + [], + + [m4_fatal([Unknown tar format])]) + + AC_MSG_CHECKING([how to create a $1 tar archive]) + + # Go ahead even if we have the value already cached. We do so because we + # need to set the values for the 'am__tar' and 'am__untar' variables. + _am_tools=${am_cv_prog_tar_$1-$_am_tools} + + for _am_tool in $_am_tools; do + case $_am_tool in + gnutar) + for _am_tar in tar gnutar gtar; do + AM_RUN_LOG([$_am_tar --version]) && break + done + am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"' + am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"' + am__untar="$_am_tar -xf -" + ;; + plaintar) + # Must skip GNU tar: if it does not support --format= it doesn't create + # ustar tarball either. 
+ (tar --version) >/dev/null 2>&1 && continue + am__tar='tar chf - "$$tardir"' + am__tar_='tar chf - "$tardir"' + am__untar='tar xf -' + ;; + pax) + am__tar='pax -L -x $1 -w "$$tardir"' + am__tar_='pax -L -x $1 -w "$tardir"' + am__untar='pax -r' + ;; + cpio) + am__tar='find "$$tardir" -print | cpio -o -H $1 -L' + am__tar_='find "$tardir" -print | cpio -o -H $1 -L' + am__untar='cpio -i -H $1 -d' + ;; + none) + am__tar=false + am__tar_=false + am__untar=false + ;; + esac + + # If the value was cached, stop now. We just wanted to have am__tar + # and am__untar set. + test -n "${am_cv_prog_tar_$1}" && break + + # tar/untar a dummy directory, and stop if the command works. + rm -rf conftest.dir + mkdir conftest.dir + echo GrepMe > conftest.dir/file + AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar]) + rm -rf conftest.dir + if test -s conftest.tar; then + AM_RUN_LOG([$am__untar /dev/null 2>&1 && break + fi + done + rm -rf conftest.dir + + AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool]) + AC_MSG_RESULT([$am_cv_prog_tar_$1])]) + +AC_SUBST([am__tar]) +AC_SUBST([am__untar]) +]) # _AM_PROG_TAR + diff --git a/configure b/configure index 95a0fe3..5a46f87 100755 --- a/configure +++ b/configure @@ -4286,7 +4286,7 @@ CPPFLAGS="-I${AUX_INCLUDES} -I${INCLUDES}/ad3" # Check for c++0x compile flag. 
#AX_CHECK_COMPILE_FLAG([-std=c++0x], [CXXFLAGS="$CXXFLAGS -std=c++0x"]) AX_CXX_COMPILE_STDCXX_11 -CXXFLAGS="$CXXFLAGS -std=c++11" +CXXFLAGS="$CXXFLAGS -std=c++14" # Macros # This will make ILOG use STL libs @@ -6356,4 +6356,3 @@ if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} fi - diff --git a/configure.ac b/configure.ac index d7cefe7..6e38196 100644 --- a/configure.ac +++ b/configure.ac @@ -22,7 +22,7 @@ AC_SUBST(CPPFLAGS) # Check for c++0x compile flag. #AX_CHECK_COMPILE_FLAG([-std=c++0x], [CXXFLAGS="$CXXFLAGS -std=c++0x"]) AX_CXX_COMPILE_STDCXX_11 -CXXFLAGS="$CXXFLAGS -std=c++11" +CXXFLAGS="$CXXFLAGS -std=c++14" # Macros # This will make ILOG use STL libs diff --git a/deps/AD3-2.0.2_fix_for_MSVCv40.7z b/deps/AD3-2.0.2_fix_for_MSVCv40.7z new file mode 100644 index 0000000..a72e920 Binary files /dev/null and b/deps/AD3-2.0.2_fix_for_MSVCv40.7z differ diff --git a/deps/gflags-2.0_fix_for_MSVCv40.7z b/deps/gflags-2.0_fix_for_MSVCv40.7z new file mode 100644 index 0000000..4d06e4a Binary files /dev/null and b/deps/gflags-2.0_fix_for_MSVCv40.7z differ diff --git a/deps/glog-0.3.2_fix_for_MSVCv40.7z b/deps/glog-0.3.2_fix_for_MSVCv40.7z new file mode 100644 index 0000000..8aac174 Binary files /dev/null and b/deps/glog-0.3.2_fix_for_MSVCv40.7z differ diff --git a/deps/googletest_fix_for_MSVCv40.7z b/deps/googletest_fix_for_MSVCv40.7z new file mode 100644 index 0000000..9027d0d Binary files /dev/null and b/deps/googletest_fix_for_MSVCv40.7z differ diff --git a/libturboparser/Makefile b/libturboparser/Makefile index bad93e0..f456949 100644 --- a/libturboparser/Makefile +++ b/libturboparser/Makefile @@ -1,276 +1,282 @@ -UTIL = ../src/util -CLASSIFIER = ../src/classifier -SEQUENCE = ../src/sequence -TAGGER = ../src/tagger -ENTITYRECOGNIZER = ../src/entity_recognizer 
-PARSER = ../src/parser -SEMANTICPARSER = ../src/semantic_parser -COREFERENCERESOLVER = ../src/coreference_resolver -MORPHOLOGICALTAGGER = ../src/morphological_tagger -AUXLIBS = ../deps/local/lib -AUXINCLUDES = ../deps/local/include - -OBJS = TurboParserInterface.o CoreferenceDecoder.o CoreferenceDictionary.o CoreferenceFeatures.o CoreferenceDocumentNumeric.o CoreferenceSentenceNumeric.o Mention.o CoreferenceDocument.o CoreferenceSentence.o CoreferenceOptions.o CoreferencePart.o CoreferencePipe.o CoreferenceReader.o CoreferenceWriter.o SemanticDecoder.o SemanticDictionary.o SemanticFeatures.o SemanticInstanceNumeric.o SemanticInstance.o SemanticOptions.o SemanticPart.o SemanticPipe.o SemanticReader.o SemanticWriter.o DependencyDecoder.o DependencyDictionary.o DependencyFeatures.o DependencyInstance.o DependencyInstanceNumeric.o DependencyOptions.o DependencyPart.o DependencyPipe.o DependencyReader.o DependencyWriter.o TaggerDictionary.o TaggerFeatures.o TaggerOptions.o TaggerPipe.o MorphologicalDictionary.o MorphologicalFeatures.o MorphologicalInstance.o MorphologicalInstanceNumeric.o MorphologicalOptions.o MorphologicalPipe.o MorphologicalReader.o MorphologicalWriter.o EntityDictionary.o EntityFeatures.o EntityInstance.o EntityInstanceNumeric.o EntityOptions.o EntityPipe.o EntityReader.o EntityWriter.o SequenceDecoder.o SequenceDictionary.o SequenceInstance.o SequenceInstanceNumeric.o SequenceOptions.o SequencePart.o SequencePipe.o SequenceReader.o SequenceWriter.o TokenDictionary.o Alphabet.o Dictionary.o Options.o Parameters.o Pipe.o Reader.o Writer.o AlgUtils.o SerializationUtils.o StringUtils.o TimeUtils.o -CC = g++ -DEBUG = -g -INCLUDES = -I$(UTIL)/ -I$(CLASSIFIER) -I$(SEQUENCE) -I$(TAGGER) -I$(ENTITYRECOGNIZER) -I$(PARSER) -I$(SEMANTICPARSER) -I$(COREFERENCERESOLVER) -I$(MORPHOLOGICALTAGGER) -I$(AUXINCLUDES) -LIBS = -L/usr/local/lib/ -L$(AUXLIBS) -CFLAGS = -std=c++0x -O3 -Wall -Wno-sign-compare -c -fmessage-length=0 -fPIC $(INCLUDES) -LDFLAGS = -shared 
-LFLAGS = $(LIBS) -Wl,-whole-archive -lad3 -Wl,-no-whole-archive -lgflags -lglog - -all : libturboparser.a libturboparser.so - -libturboparser.a : $(OBJS) - ar rcs libturboparser.a $(OBJS) - -libturboparser.so : $(OBJS) - $(CC) -o libturboparser.so $(OBJS) $(LDFLAGS) $(LFLAGS) - -TurboParserInterface.o: TurboParserInterface.h TurboParserInterface.cpp $(TAGGER)/TaggerPipe.h $(ENTITYRECOGNIZER)/EntityPipe.h $(PARSER)/DependencyPipe.h $(SEMANTICPARSER)/SemanticPipe.h $(COREFERENCERESOLVER)/CoreferencePipe.h $(MORPHOLOGICALTAGGER)/MorphologicalPipe.h $(UTIL)/Utils.h - $(CC) $(CFLAGS) TurboParserInterface.cpp - -##################### - -CoreferenceDecoder.o: $(COREFERENCERESOLVER)/CoreferenceDecoder.h $(COREFERENCERESOLVER)/CoreferenceDecoder.cpp $(COREFERENCERESOLVER)/CoreferencePart.h $(COREFERENCERESOLVER)/CoreferencePipe.h $(UTIL)/AlgUtils.h $(UTIL)/logval.h $(CLASSIFIER)/Decoder.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceDecoder.cpp - -CoreferenceDictionary.o: $(COREFERENCERESOLVER)/CoreferenceDictionary.h $(COREFERENCERESOLVER)/CoreferenceDictionary.cpp $(COREFERENCERESOLVER)/CoreferencePipe.h $(CLASSIFIER)/Dictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceDictionary.cpp - -CoreferenceFeatures.o: $(COREFERENCERESOLVER)/CoreferenceFeatures.h $(COREFERENCERESOLVER)/CoreferenceFeatures.cpp $(COREFERENCERESOLVER)/CoreferencePipe.h $(COREFERENCERESOLVER)/CoreferencePart.h $(COREFERENCERESOLVER)/CoreferenceFeatureTemplates.h $(CLASSIFIER)/Features.h $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.h $(CLASSIFIER)/FeatureEncoder.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceFeatures.cpp - -CoreferenceDocument.o: $(COREFERENCERESOLVER)/CoreferenceDocument.h $(COREFERENCERESOLVER)/CoreferenceDocument.cpp $(COREFERENCERESOLVER)/CoreferenceSentence.h $(CLASSIFIER)/Instance.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceDocument.cpp - -CoreferenceSentence.o: 
$(COREFERENCERESOLVER)/CoreferenceSentence.h $(COREFERENCERESOLVER)/CoreferenceSentence.cpp $(CLASSIFIER)/Instance.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceSentence.cpp - -CoreferenceDocumentNumeric.o: $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.h $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.cpp $(COREFERENCERESOLVER)/CoreferenceSentenceNumeric.h $(COREFERENCERESOLVER)/CoreferenceDocument.h $(COREFERENCERESOLVER)/CoreferenceDictionary.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.cpp - -CoreferenceSentenceNumeric.o: $(COREFERENCERESOLVER)/CoreferenceSentenceNumeric.h $(COREFERENCERESOLVER)/CoreferenceSentenceNumeric.cpp $(COREFERENCERESOLVER)/CoreferenceSentence.h $(COREFERENCERESOLVER)/Mention.h $(COREFERENCERESOLVER)/CoreferenceDictionary.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceSentenceNumeric.cpp - -Mention.o: $(COREFERENCERESOLVER)/Mention.h $(COREFERENCERESOLVER)/Mention.cpp $(COREFERENCERESOLVER)/CoreferenceDictionary.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/Mention.cpp - -CoreferenceOptions.o: $(COREFERENCERESOLVER)/CoreferenceOptions.h $(COREFERENCERESOLVER)/CoreferenceOptions.cpp $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Options.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceOptions.cpp - -CoreferencePart.o: $(COREFERENCERESOLVER)/CoreferencePart.h $(COREFERENCERESOLVER)/CoreferencePart.cpp $(CLASSIFIER)/Part.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferencePart.cpp - -CoreferencePipe.o: $(COREFERENCERESOLVER)/CoreferencePipe.h $(COREFERENCERESOLVER)/CoreferencePipe.cpp $(CLASSIFIER)/Pipe.h $(COREFERENCERESOLVER)/CoreferenceOptions.h $(COREFERENCERESOLVER)/CoreferenceReader.h $(COREFERENCERESOLVER)/CoreferenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.h $(COREFERENCERESOLVER)/CoreferenceWriter.h $(COREFERENCERESOLVER)/CoreferencePart.h $(COREFERENCERESOLVER)/CoreferenceFeatures.h $(COREFERENCERESOLVER)/CoreferenceDecoder.h - $(CC) 
$(CFLAGS) $(COREFERENCERESOLVER)/CoreferencePipe.cpp - -CoreferenceReader.o: $(COREFERENCERESOLVER)/CoreferenceReader.h $(COREFERENCERESOLVER)/CoreferenceReader.cpp $(COREFERENCERESOLVER)/CoreferenceDocument.h $(CLASSIFIER)/Reader.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceReader.cpp - -CoreferenceWriter.o: $(COREFERENCERESOLVER)/CoreferenceWriter.h $(COREFERENCERESOLVER)/CoreferenceWriter.cpp $(COREFERENCERESOLVER)/CoreferenceDocument.h $(CLASSIFIER)/Writer.h - $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceWriter.cpp - -##################### - -SemanticDecoder.o: $(SEMANTICPARSER)/SemanticDecoder.h $(SEMANTICPARSER)/SemanticDecoder.cpp $(SEMANTICPARSER)/SemanticPart.h $(SEMANTICPARSER)/SemanticPipe.h $(PARSER)/FactorTree.h $(SEMANTICPARSER)/FactorPredicateAutomaton.h $(SEMANTICPARSER)/FactorArgumentAutomaton.h $(UTIL)/AlgUtils.h $(UTIL)/logval.h $(CLASSIFIER)/Decoder.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticDecoder.cpp - -SemanticDictionary.o: $(SEMANTICPARSER)/SemanticDictionary.h $(SEMANTICPARSER)/SemanticDictionary.cpp $(SEMANTICPARSER)/SemanticPipe.h $(CLASSIFIER)/Dictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticDictionary.cpp - -SemanticFeatures.o: $(SEMANTICPARSER)/SemanticFeatures.h $(SEMANTICPARSER)/SemanticFeatures.cpp $(SEMANTICPARSER)/SemanticPipe.h $(SEMANTICPARSER)/SemanticPart.h $(SEMANTICPARSER)/SemanticFeatureTemplates.h $(CLASSIFIER)/Features.h $(SEMANTICPARSER)/SemanticInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticFeatures.cpp - -SemanticInstance.o: $(SEMANTICPARSER)/SemanticInstance.h $(SEMANTICPARSER)/SemanticInstance.cpp $(CLASSIFIER)/Instance.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticInstance.cpp - -SemanticInstanceNumeric.o: $(SEMANTICPARSER)/SemanticInstanceNumeric.h $(SEMANTICPARSER)/SemanticInstanceNumeric.cpp $(SEMANTICPARSER)/SemanticInstance.h $(SEMANTICPARSER)/SemanticDictionary.h - $(CC) $(CFLAGS) 
$(SEMANTICPARSER)/SemanticInstanceNumeric.cpp - -SemanticOptions.o: $(SEMANTICPARSER)/SemanticOptions.h $(SEMANTICPARSER)/SemanticOptions.cpp $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Options.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticOptions.cpp - -SemanticPart.o: $(SEMANTICPARSER)/SemanticPart.h $(SEMANTICPARSER)/SemanticPart.cpp $(CLASSIFIER)/Part.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticPart.cpp - -SemanticPipe.o: $(SEMANTICPARSER)/SemanticPipe.h $(SEMANTICPARSER)/SemanticPipe.cpp $(CLASSIFIER)/Pipe.h $(SEMANTICPARSER)/SemanticOptions.h $(SEMANTICPARSER)/SemanticReader.h $(SEMANTICPARSER)/SemanticDictionary.h $(SEQUENCE)/TokenDictionary.h $(SEMANTICPARSER)/SemanticInstanceNumeric.h $(SEMANTICPARSER)/SemanticWriter.h $(SEMANTICPARSER)/SemanticPart.h $(SEMANTICPARSER)/SemanticFeatures.h $(SEMANTICPARSER)/SemanticDecoder.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticPipe.cpp - -SemanticReader.o: $(SEMANTICPARSER)/SemanticReader.h $(SEMANTICPARSER)/SemanticReader.cpp $(SEMANTICPARSER)/SemanticInstance.h $(CLASSIFIER)/Reader.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticReader.cpp - -SemanticWriter.o: $(SEMANTICPARSER)/SemanticWriter.h $(SEMANTICPARSER)/SemanticWriter.cpp $(SEMANTICPARSER)/SemanticInstance.h $(CLASSIFIER)/Writer.h - $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticWriter.cpp - -##################### - -DependencyDecoder.o: $(PARSER)/DependencyDecoder.h $(PARSER)/DependencyDecoder.cpp $(PARSER)/DependencyPart.h $(PARSER)/DependencyPipe.h $(PARSER)/FactorTree.h $(PARSER)/FactorHeadAutomaton.h $(PARSER)/FactorGrandparentHeadAutomaton.h $(PARSER)/FactorTrigramHeadAutomaton.h $(PARSER)/FactorSequence.h $(UTIL)/AlgUtils.h $(UTIL)/logval.h $(CLASSIFIER)/Decoder.h - $(CC) $(CFLAGS) $(PARSER)/DependencyDecoder.cpp - -DependencyDictionary.o: $(PARSER)/DependencyDictionary.h $(PARSER)/DependencyDictionary.cpp $(PARSER)/DependencyPipe.h $(CLASSIFIER)/Dictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h - $(CC) $(CFLAGS) 
$(PARSER)/DependencyDictionary.cpp - -DependencyFeatures.o: $(PARSER)/DependencyFeatures.h $(PARSER)/DependencyFeatures.cpp $(PARSER)/DependencyPipe.h $(PARSER)/DependencyPart.h $(PARSER)/DependencyFeatureTemplates.h $(CLASSIFIER)/Features.h $(PARSER)/DependencyInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h - $(CC) $(CFLAGS) $(PARSER)/DependencyFeatures.cpp - -DependencyInstance.o: $(PARSER)/DependencyInstance.h $(PARSER)/DependencyInstance.cpp $(CLASSIFIER)/Instance.h - $(CC) $(CFLAGS) $(PARSER)/DependencyInstance.cpp - -DependencyInstanceNumeric.o: $(PARSER)/DependencyInstanceNumeric.h $(PARSER)/DependencyInstanceNumeric.cpp $(PARSER)/DependencyInstance.h $(PARSER)/DependencyDictionary.h - $(CC) $(CFLAGS) $(PARSER)/DependencyInstanceNumeric.cpp - -DependencyOptions.o: $(PARSER)/DependencyOptions.h $(PARSER)/DependencyOptions.cpp $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Options.h - $(CC) $(CFLAGS) $(PARSER)/DependencyOptions.cpp - -DependencyPart.o: $(PARSER)/DependencyPart.h $(PARSER)/DependencyPart.cpp $(CLASSIFIER)/Part.h - $(CC) $(CFLAGS) $(PARSER)/DependencyPart.cpp - -DependencyPipe.o: $(PARSER)/DependencyPipe.h $(PARSER)/DependencyPipe.cpp $(CLASSIFIER)/Pipe.h $(PARSER)/DependencyOptions.h $(PARSER)/DependencyReader.h $(PARSER)/DependencyDictionary.h $(SEQUENCE)/TokenDictionary.h $(PARSER)/DependencyInstanceNumeric.h $(PARSER)/DependencyWriter.h $(PARSER)/DependencyPart.h $(PARSER)/DependencyFeatures.h $(PARSER)/DependencyDecoder.h - $(CC) $(CFLAGS) $(PARSER)/DependencyPipe.cpp - -DependencyReader.o: $(PARSER)/DependencyReader.h $(PARSER)/DependencyReader.cpp $(PARSER)/DependencyInstance.h $(CLASSIFIER)/Reader.h - $(CC) $(CFLAGS) $(PARSER)/DependencyReader.cpp - -DependencyWriter.o: $(PARSER)/DependencyWriter.h $(PARSER)/DependencyWriter.cpp $(PARSER)/DependencyInstance.h $(CLASSIFIER)/Writer.h - $(CC) $(CFLAGS) $(PARSER)/DependencyWriter.cpp - -##################### - -TaggerDictionary.o: $(TAGGER)/TaggerDictionary.h $(TAGGER)/TaggerDictionary.cpp 
$(TAGGER)/TaggerPipe.h $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h - $(CC) $(CFLAGS) $(TAGGER)/TaggerDictionary.cpp - -TaggerFeatures.o: $(TAGGER)/TaggerFeatures.h $(TAGGER)/TaggerFeatures.cpp $(TAGGER)/TaggerPipe.h $(TAGGER)/TaggerFeatureTemplates.h $(SEQUENCE)/SequenceFeatures.h $(SEQUENCE)/SequenceInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h - $(CC) $(CFLAGS) $(TAGGER)/TaggerFeatures.cpp - -TaggerOptions.o: $(TAGGER)/TaggerOptions.h $(TAGGER)/TaggerOptions.cpp $(UTIL)/SerializationUtils.h $(SEQUENCE)/SequenceOptions.h - $(CC) $(CFLAGS) $(TAGGER)/TaggerOptions.cpp - -TaggerPipe.o: $(TAGGER)/TaggerPipe.h $(TAGGER)/TaggerPipe.cpp $(SEQUENCE)/SequencePipe.h $(TAGGER)/TaggerOptions.h $(TAGGER)/TaggerDictionary.h - $(CC) $(CFLAGS) $(TAGGER)/TaggerPipe.cpp - -##################### - -MorphologicalDictionary.o: $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.h $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.cpp $(MORPHOLOGICALTAGGER)/MorphologicalPipe.h $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h - $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.cpp - -MorphologicalFeatures.o: $(MORPHOLOGICALTAGGER)/MorphologicalFeatures.h $(MORPHOLOGICALTAGGER)/MorphologicalFeatures.cpp $(MORPHOLOGICALTAGGER)/MorphologicalPipe.h $(MORPHOLOGICALTAGGER)/MorphologicalFeatureTemplates.h $(SEQUENCE)/SequenceFeatures.h $(SEQUENCE)/SequenceInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h - $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalFeatures.cpp - -MorphologicalInstance.o: $(MORPHOLOGICALTAGGER)/MorphologicalInstance.h $(MORPHOLOGICALTAGGER)/MorphologicalInstance.cpp $(SEQUENCE)/SequenceInstance.h - $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalInstance.cpp - -MorphologicalInstanceNumeric.o: $(MORPHOLOGICALTAGGER)/MorphologicalInstanceNumeric.h $(MORPHOLOGICALTAGGER)/MorphologicalInstanceNumeric.cpp $(MORPHOLOGICALTAGGER)/MorphologicalInstance.h 
$(SEQUENCE)/SequenceInstanceNumeric.h $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.h - $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalInstanceNumeric.cpp - -MorphologicalOptions.o: $(MORPHOLOGICALTAGGER)/MorphologicalOptions.h $(MORPHOLOGICALTAGGER)/MorphologicalOptions.cpp $(UTIL)/SerializationUtils.h $(SEQUENCE)/SequenceOptions.h - $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalOptions.cpp - -MorphologicalPipe.o: $(MORPHOLOGICALTAGGER)/MorphologicalPipe.h $(MORPHOLOGICALTAGGER)/MorphologicalPipe.cpp $(SEQUENCE)/SequencePipe.h $(MORPHOLOGICALTAGGER)/MorphologicalOptions.h $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.h - $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalPipe.cpp - -MorphologicalReader.o: $(MORPHOLOGICALTAGGER)/MorphologicalReader.h $(MORPHOLOGICALTAGGER)/MorphologicalReader.cpp $(MORPHOLOGICALTAGGER)/MorphologicalInstance.h $(SEQUENCE)/SequenceReader.h - $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalReader.cpp - -MorphologicalWriter.o: $(MORPHOLOGICALTAGGER)/MorphologicalWriter.h $(MORPHOLOGICALTAGGER)/MorphologicalWriter.cpp $(MORPHOLOGICALTAGGER)/MorphologicalInstance.h $(SEQUENCE)/SequenceWriter.h - $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalWriter.cpp - -##################### - -EntityDictionary.o: $(ENTITYRECOGNIZER)/EntityDictionary.h $(ENTITYRECOGNIZER)/EntityDictionary.cpp $(ENTITYRECOGNIZER)/EntityPipe.h $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h - $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityDictionary.cpp - -EntityFeatures.o: $(ENTITYRECOGNIZER)/EntityFeatures.h $(ENTITYRECOGNIZER)/EntityFeatures.cpp $(ENTITYRECOGNIZER)/EntityPipe.h $(ENTITYRECOGNIZER)/EntityFeatureTemplates.h $(SEQUENCE)/SequenceFeatures.h $(SEQUENCE)/SequenceInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h - $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityFeatures.cpp - -EntityOptions.o: $(ENTITYRECOGNIZER)/EntityOptions.h $(ENTITYRECOGNIZER)/EntityOptions.cpp $(UTIL)/SerializationUtils.h 
$(SEQUENCE)/SequenceOptions.h - $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityOptions.cpp - -EntityPipe.o: $(ENTITYRECOGNIZER)/EntityPipe.h $(ENTITYRECOGNIZER)/EntityPipe.cpp $(SEQUENCE)/SequencePipe.h $(ENTITYRECOGNIZER)/EntityOptions.h $(ENTITYRECOGNIZER)/EntityDictionary.h - $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityPipe.cpp - -EntityReader.o: $(ENTITYRECOGNIZER)/EntityReader.h $(ENTITYRECOGNIZER)/EntityReader.cpp $(ENTITYRECOGNIZER)/EntityInstance.h $(SEQUENCE)/SequenceReader.h - $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityReader.cpp - -EntityWriter.o: $(ENTITYRECOGNIZER)/EntityWriter.h $(ENTITYRECOGNIZER)/EntityWriter.cpp $(ENTITYRECOGNIZER)/EntityInstance.h $(SEQUENCE)/SequenceWriter.h - $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityWriter.cpp - -EntityInstance.o: $(ENTITYRECOGNIZER)/EntityInstance.h $(ENTITYRECOGNIZER)/EntityInstance.cpp $(SEQUENCE)/SequenceInstance.h - $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityInstance.cpp - -EntityInstanceNumeric.o: $(ENTITYRECOGNIZER)/EntityInstanceNumeric.h $(ENTITYRECOGNIZER)/EntityInstanceNumeric.cpp $(ENTITYRECOGNIZER)/EntityInstance.h $(SEQUENCE)/SequenceInstanceNumeric.h $(ENTITYRECOGNIZER)/EntityDictionary.h - $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityInstanceNumeric.cpp - -##################### - -SequenceDecoder.o: $(SEQUENCE)/SequenceDecoder.h $(SEQUENCE)/SequenceDecoder.cpp $(SEQUENCE)/SequencePart.h $(SEQUENCE)/SequencePipe.h $(CLASSIFIER)/Decoder.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequenceDecoder.cpp - -SequenceDictionary.o: $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/SequenceDictionary.cpp $(SEQUENCE)/SequencePipe.h $(CLASSIFIER)/Dictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequenceDictionary.cpp - -SequenceInstance.o: $(SEQUENCE)/SequenceInstance.h $(SEQUENCE)/SequenceInstance.cpp $(CLASSIFIER)/Instance.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequenceInstance.cpp - -SequenceInstanceNumeric.o: $(SEQUENCE)/SequenceInstanceNumeric.h $(SEQUENCE)/SequenceInstanceNumeric.cpp 
$(SEQUENCE)/SequenceInstance.h $(SEQUENCE)/SequenceDictionary.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequenceInstanceNumeric.cpp - -SequenceOptions.o: $(SEQUENCE)/SequenceOptions.h $(SEQUENCE)/SequenceOptions.cpp $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Options.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequenceOptions.cpp - -SequencePart.o: $(SEQUENCE)/SequencePart.h $(SEQUENCE)/SequencePart.cpp $(CLASSIFIER)/Part.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequencePart.cpp - -SequencePipe.o: $(SEQUENCE)/SequencePipe.h $(SEQUENCE)/SequencePipe.cpp $(CLASSIFIER)/Pipe.h $(SEQUENCE)/SequenceOptions.h $(SEQUENCE)/SequenceReader.h $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(SEQUENCE)/SequenceInstanceNumeric.h $(SEQUENCE)/SequenceWriter.h $(SEQUENCE)/SequencePart.h $(SEQUENCE)/SequenceFeatures.h $(SEQUENCE)/SequenceDecoder.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequencePipe.cpp - -SequenceReader.o: $(SEQUENCE)/SequenceReader.h $(SEQUENCE)/SequenceReader.cpp $(SEQUENCE)/SequenceInstance.h $(CLASSIFIER)/Reader.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequenceReader.cpp - -SequenceWriter.o: $(SEQUENCE)/SequenceWriter.h $(SEQUENCE)/SequenceWriter.cpp $(SEQUENCE)/SequenceInstance.h $(CLASSIFIER)/Writer.h - $(CC) $(CFLAGS) $(SEQUENCE)/SequenceWriter.cpp - -TokenDictionary.o: $(SEQUENCE)/TokenDictionary.h $(SEQUENCE)/TokenDictionary.cpp $(CLASSIFIER)/Pipe.h $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Dictionary.h $(CLASSIFIER)/Alphabet.h $(SEQUENCE)/SequenceReader.h $(PARSER)/DependencyReader.h - $(CC) $(CFLAGS) $(SEQUENCE)/TokenDictionary.cpp - -##################### - -Alphabet.o: $(CLASSIFIER)/Alphabet.h $(CLASSIFIER)/Alphabet.cpp $(UTIL)/SerializationUtils.h $(UTIL)/Utils.h - $(CC) $(CFLAGS) $(CLASSIFIER)/Alphabet.cpp - -Dictionary.o: $(CLASSIFIER)/Dictionary.h $(CLASSIFIER)/Dictionary.cpp - $(CC) $(CFLAGS) $(CLASSIFIER)/Dictionary.cpp - -Options.o: $(CLASSIFIER)/Options.h $(CLASSIFIER)/Options.cpp - $(CC) $(CFLAGS) $(CLASSIFIER)/Options.cpp - -Parameters.o: $(CLASSIFIER)/Parameters.h 
$(CLASSIFIER)/Parameters.cpp $(CLASSIFIER)/Features.h $(CLASSIFIER)/SparseParameterVector.h $(CLASSIFIER)/SparseLabeledParameterVector.h $(UTIL)/Utils.h - $(CC) $(CFLAGS) $(CLASSIFIER)/Parameters.cpp - -Pipe.o: $(CLASSIFIER)/Pipe.h $(CLASSIFIER)/Pipe.cpp $(CLASSIFIER)/Dictionary.h $(CLASSIFIER)/Features.h $(CLASSIFIER)/Part.h $(CLASSIFIER)/Reader.h $(CLASSIFIER)/Writer.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Decoder.h $(CLASSIFIER)/Parameters.h $(UTIL)/AlgUtils.h - $(CC) $(CFLAGS) $(CLASSIFIER)/Pipe.cpp - -Reader.o: $(CLASSIFIER)/Reader.h $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/Instance.h $(UTIL)/Utils.h - $(CC) $(CFLAGS) $(CLASSIFIER)/Reader.cpp - -Writer.o: $(CLASSIFIER)/Writer.h $(CLASSIFIER)/Writer.cpp $(CLASSIFIER)/Instance.h $(UTIL)/Utils.h - $(CC) $(CFLAGS) $(CLASSIFIER)/Writer.cpp - -##################### - -AlgUtils.o: $(UTIL)/AlgUtils.h $(UTIL)/AlgUtils.cpp - $(CC) $(CFLAGS) $(UTIL)/AlgUtils.cpp - -SerializationUtils.o: $(UTIL)/SerializationUtils.h $(UTIL)/SerializationUtils.cpp - $(CC) $(CFLAGS) $(UTIL)/SerializationUtils.cpp - -StringUtils.o: $(UTIL)/StringUtils.h $(UTIL)/StringUtils.cpp - $(CC) $(CFLAGS) $(UTIL)/StringUtils.cpp - -TimeUtils.o: $(UTIL)/TimeUtils.h $(UTIL)/TimeUtils.cpp - $(CC) $(CFLAGS) $(UTIL)/TimeUtils.cpp - -##################### - -clean: - rm -f *.o *~ libturboparser.a libturboparser.so +UTIL = ../src/util +CLASSIFIER = ../src/classifier +SEQUENCE = ../src/sequence +TAGGER = ../src/tagger +ENTITYRECOGNIZER = ../src/entity_recognizer +PARSER = ../src/parser +SEMANTICPARSER = ../src/semantic_parser +COREFERENCERESOLVER = ../src/coreference_resolver +MORPHOLOGICALTAGGER = ../src/morphological_tagger +AUXLIBS = ../deps/local/lib +AUXINCLUDES = ../deps/local/include + +OBJS = TurboParserInterface.o CoreferenceDecoder.o CoreferenceDictionary.o CoreferenceFeatures.o CoreferenceDocumentNumeric.o CoreferenceSentenceNumeric.o Mention.o CoreferenceDocument.o CoreferenceSentence.o CoreferenceOptions.o CoreferencePart.o CoreferencePipe.o 
CoreferenceReader.o CoreferenceWriter.o SemanticDecoder.o SemanticDictionary.o SemanticFeatures.o SemanticInstanceNumeric.o SemanticInstance.o SemanticOptions.o SemanticPart.o SemanticPipe.o SemanticReader.o SemanticWriter.o DependencyDecoder.o DependencyDictionary.o DependencyFeatures.o DependencyInstance.o DependencyInstanceNumeric.o DependencyOptions.o DependencyPart.o DependencyPipe.o DependencyReader.o DependencyWriter.o TaggerDictionary.o TaggerFeatures.o TaggerOptions.o TaggerPipe.o MorphologicalDictionary.o MorphologicalFeatures.o MorphologicalInstance.o MorphologicalInstanceNumeric.o MorphologicalOptions.o MorphologicalPipe.o MorphologicalReader.o MorphologicalWriter.o EntityDictionary.o EntityFeatures.o EntityInstance.o EntityInstanceNumeric.o EntityOptions.o EntityPipe.o EntityReader.o EntityWriter.o EntityDecoder.o SequenceDecoder.o SequenceDictionary.o SequenceInstance.o SequenceInstanceNumeric.o SequenceOptions.o SequencePart.o SequencePipe.o SequenceReader.o SequenceWriter.o TokenDictionary.o Alphabet.o Dictionary.o Options.o Parameters.o Pipe.o Reader.o Writer.o AlgUtils.o SerializationUtils.o StringUtils.o TimeUtils.o +CC = g++ +DEBUG = -g +INCLUDES = -I$(UTIL)/ -I$(CLASSIFIER) -I$(SEQUENCE) -I$(TAGGER) -I$(ENTITYRECOGNIZER) -I$(PARSER) -I$(SEMANTICPARSER) -I$(COREFERENCERESOLVER) -I$(MORPHOLOGICALTAGGER) -I$(AUXINCLUDES) +LIBS = -L/usr/local/lib/ -L$(AUXLIBS) + +BYPASSINIT_GLOG_D = + +CFLAGS = -std=gnu++14 -std=c++14 -O3 -Wall -Wno-sign-compare -c -fmessage-length=0 -fPIC $(BYPASSINIT_GLOG_D) $(INCLUDES) +LDFLAGS = -shared +LFLAGS = $(LIBS) -Wl,-whole-archive -lad3 -Wl,-no-whole-archive -lgflags -lglog + +all : libturboparser.a libturboparser.so + +libturboparser.a : $(OBJS) + ar rcs libturboparser.a $(OBJS) + +libturboparser.so : $(OBJS) + $(CC) -o libturboparser.so $(OBJS) $(LDFLAGS) $(LFLAGS) + +TurboParserInterface.o: TurboParserInterface.h TurboParserInterface.cpp $(TAGGER)/TaggerPipe.h $(ENTITYRECOGNIZER)/EntityPipe.h 
$(PARSER)/DependencyPipe.h $(SEMANTICPARSER)/SemanticPipe.h $(COREFERENCERESOLVER)/CoreferencePipe.h $(MORPHOLOGICALTAGGER)/MorphologicalPipe.h $(UTIL)/Utils.h + $(CC) $(CFLAGS) TurboParserInterface.cpp + +##################### + +CoreferenceDecoder.o: $(COREFERENCERESOLVER)/CoreferenceDecoder.h $(COREFERENCERESOLVER)/CoreferenceDecoder.cpp $(COREFERENCERESOLVER)/CoreferencePart.h $(COREFERENCERESOLVER)/CoreferencePipe.h $(UTIL)/AlgUtils.h $(UTIL)/logval.h $(CLASSIFIER)/Decoder.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceDecoder.cpp + +CoreferenceDictionary.o: $(COREFERENCERESOLVER)/CoreferenceDictionary.h $(COREFERENCERESOLVER)/CoreferenceDictionary.cpp $(COREFERENCERESOLVER)/CoreferencePipe.h $(CLASSIFIER)/Dictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceDictionary.cpp + +CoreferenceFeatures.o: $(COREFERENCERESOLVER)/CoreferenceFeatures.h $(COREFERENCERESOLVER)/CoreferenceFeatures.cpp $(COREFERENCERESOLVER)/CoreferencePipe.h $(COREFERENCERESOLVER)/CoreferencePart.h $(COREFERENCERESOLVER)/CoreferenceFeatureTemplates.h $(CLASSIFIER)/Features.h $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.h $(CLASSIFIER)/FeatureEncoder.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceFeatures.cpp + +CoreferenceDocument.o: $(COREFERENCERESOLVER)/CoreferenceDocument.h $(COREFERENCERESOLVER)/CoreferenceDocument.cpp $(COREFERENCERESOLVER)/CoreferenceSentence.h $(CLASSIFIER)/Instance.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceDocument.cpp + +CoreferenceSentence.o: $(COREFERENCERESOLVER)/CoreferenceSentence.h $(COREFERENCERESOLVER)/CoreferenceSentence.cpp $(CLASSIFIER)/Instance.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceSentence.cpp + +CoreferenceDocumentNumeric.o: $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.h $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.cpp $(COREFERENCERESOLVER)/CoreferenceSentenceNumeric.h $(COREFERENCERESOLVER)/CoreferenceDocument.h 
$(COREFERENCERESOLVER)/CoreferenceDictionary.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.cpp + +CoreferenceSentenceNumeric.o: $(COREFERENCERESOLVER)/CoreferenceSentenceNumeric.h $(COREFERENCERESOLVER)/CoreferenceSentenceNumeric.cpp $(COREFERENCERESOLVER)/CoreferenceSentence.h $(COREFERENCERESOLVER)/Mention.h $(COREFERENCERESOLVER)/CoreferenceDictionary.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceSentenceNumeric.cpp + +Mention.o: $(COREFERENCERESOLVER)/Mention.h $(COREFERENCERESOLVER)/Mention.cpp $(COREFERENCERESOLVER)/CoreferenceDictionary.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/Mention.cpp + +CoreferenceOptions.o: $(COREFERENCERESOLVER)/CoreferenceOptions.h $(COREFERENCERESOLVER)/CoreferenceOptions.cpp $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Options.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceOptions.cpp + +CoreferencePart.o: $(COREFERENCERESOLVER)/CoreferencePart.h $(COREFERENCERESOLVER)/CoreferencePart.cpp $(CLASSIFIER)/Part.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferencePart.cpp + +CoreferencePipe.o: $(COREFERENCERESOLVER)/CoreferencePipe.h $(COREFERENCERESOLVER)/CoreferencePipe.cpp $(CLASSIFIER)/Pipe.h $(COREFERENCERESOLVER)/CoreferenceOptions.h $(COREFERENCERESOLVER)/CoreferenceReader.h $(COREFERENCERESOLVER)/CoreferenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(COREFERENCERESOLVER)/CoreferenceDocumentNumeric.h $(COREFERENCERESOLVER)/CoreferenceWriter.h $(COREFERENCERESOLVER)/CoreferencePart.h $(COREFERENCERESOLVER)/CoreferenceFeatures.h $(COREFERENCERESOLVER)/CoreferenceDecoder.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferencePipe.cpp + +CoreferenceReader.o: $(COREFERENCERESOLVER)/CoreferenceReader.h $(COREFERENCERESOLVER)/CoreferenceReader.cpp $(COREFERENCERESOLVER)/CoreferenceDocument.h $(CLASSIFIER)/Reader.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceReader.cpp + +CoreferenceWriter.o: $(COREFERENCERESOLVER)/CoreferenceWriter.h $(COREFERENCERESOLVER)/CoreferenceWriter.cpp 
$(COREFERENCERESOLVER)/CoreferenceDocument.h $(CLASSIFIER)/Writer.h + $(CC) $(CFLAGS) $(COREFERENCERESOLVER)/CoreferenceWriter.cpp + +##################### + +SemanticDecoder.o: $(SEMANTICPARSER)/SemanticDecoder.h $(SEMANTICPARSER)/SemanticDecoder.cpp $(SEMANTICPARSER)/SemanticPart.h $(SEMANTICPARSER)/SemanticPipe.h $(PARSER)/FactorTree.h $(SEMANTICPARSER)/FactorPredicateAutomaton.h $(SEMANTICPARSER)/FactorArgumentAutomaton.h $(UTIL)/AlgUtils.h $(UTIL)/logval.h $(CLASSIFIER)/Decoder.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticDecoder.cpp + +SemanticDictionary.o: $(SEMANTICPARSER)/SemanticDictionary.h $(SEMANTICPARSER)/SemanticDictionary.cpp $(SEMANTICPARSER)/SemanticPipe.h $(CLASSIFIER)/Dictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticDictionary.cpp + +SemanticFeatures.o: $(SEMANTICPARSER)/SemanticFeatures.h $(SEMANTICPARSER)/SemanticFeatures.cpp $(SEMANTICPARSER)/SemanticPipe.h $(SEMANTICPARSER)/SemanticPart.h $(SEMANTICPARSER)/SemanticFeatureTemplates.h $(CLASSIFIER)/Features.h $(SEMANTICPARSER)/SemanticInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticFeatures.cpp + +SemanticInstance.o: $(SEMANTICPARSER)/SemanticInstance.h $(SEMANTICPARSER)/SemanticInstance.cpp $(CLASSIFIER)/Instance.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticInstance.cpp + +SemanticInstanceNumeric.o: $(SEMANTICPARSER)/SemanticInstanceNumeric.h $(SEMANTICPARSER)/SemanticInstanceNumeric.cpp $(SEMANTICPARSER)/SemanticInstance.h $(SEMANTICPARSER)/SemanticDictionary.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticInstanceNumeric.cpp + +SemanticOptions.o: $(SEMANTICPARSER)/SemanticOptions.h $(SEMANTICPARSER)/SemanticOptions.cpp $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Options.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticOptions.cpp + +SemanticPart.o: $(SEMANTICPARSER)/SemanticPart.h $(SEMANTICPARSER)/SemanticPart.cpp $(CLASSIFIER)/Part.h + $(CC) $(CFLAGS) 
$(SEMANTICPARSER)/SemanticPart.cpp + +SemanticPipe.o: $(SEMANTICPARSER)/SemanticPipe.h $(SEMANTICPARSER)/SemanticPipe.cpp $(CLASSIFIER)/Pipe.h $(SEMANTICPARSER)/SemanticOptions.h $(SEMANTICPARSER)/SemanticReader.h $(SEMANTICPARSER)/SemanticDictionary.h $(SEQUENCE)/TokenDictionary.h $(SEMANTICPARSER)/SemanticInstanceNumeric.h $(SEMANTICPARSER)/SemanticWriter.h $(SEMANTICPARSER)/SemanticPart.h $(SEMANTICPARSER)/SemanticFeatures.h $(SEMANTICPARSER)/SemanticDecoder.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticPipe.cpp + +SemanticReader.o: $(SEMANTICPARSER)/SemanticReader.h $(SEMANTICPARSER)/SemanticReader.cpp $(SEMANTICPARSER)/SemanticInstance.h $(CLASSIFIER)/Reader.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticReader.cpp + +SemanticWriter.o: $(SEMANTICPARSER)/SemanticWriter.h $(SEMANTICPARSER)/SemanticWriter.cpp $(SEMANTICPARSER)/SemanticInstance.h $(CLASSIFIER)/Writer.h + $(CC) $(CFLAGS) $(SEMANTICPARSER)/SemanticWriter.cpp + +##################### + +DependencyDecoder.o: $(PARSER)/DependencyDecoder.h $(PARSER)/DependencyDecoder.cpp $(PARSER)/DependencyPart.h $(PARSER)/DependencyPipe.h $(PARSER)/FactorTree.h $(PARSER)/FactorHeadAutomaton.h $(PARSER)/FactorGrandparentHeadAutomaton.h $(PARSER)/FactorTrigramHeadAutomaton.h $(PARSER)/FactorSequence.h $(UTIL)/AlgUtils.h $(UTIL)/logval.h $(CLASSIFIER)/Decoder.h + $(CC) $(CFLAGS) $(PARSER)/DependencyDecoder.cpp + +DependencyDictionary.o: $(PARSER)/DependencyDictionary.h $(PARSER)/DependencyDictionary.cpp $(PARSER)/DependencyPipe.h $(CLASSIFIER)/Dictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h + $(CC) $(CFLAGS) $(PARSER)/DependencyDictionary.cpp + +DependencyFeatures.o: $(PARSER)/DependencyFeatures.h $(PARSER)/DependencyFeatures.cpp $(PARSER)/DependencyPipe.h $(PARSER)/DependencyPart.h $(PARSER)/DependencyFeatureTemplates.h $(CLASSIFIER)/Features.h $(PARSER)/DependencyInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h + $(CC) $(CFLAGS) $(PARSER)/DependencyFeatures.cpp + +DependencyInstance.o: 
$(PARSER)/DependencyInstance.h $(PARSER)/DependencyInstance.cpp $(CLASSIFIER)/Instance.h + $(CC) $(CFLAGS) $(PARSER)/DependencyInstance.cpp + +DependencyInstanceNumeric.o: $(PARSER)/DependencyInstanceNumeric.h $(PARSER)/DependencyInstanceNumeric.cpp $(PARSER)/DependencyInstance.h $(PARSER)/DependencyDictionary.h + $(CC) $(CFLAGS) $(PARSER)/DependencyInstanceNumeric.cpp + +DependencyOptions.o: $(PARSER)/DependencyOptions.h $(PARSER)/DependencyOptions.cpp $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Options.h + $(CC) $(CFLAGS) $(PARSER)/DependencyOptions.cpp + +DependencyPart.o: $(PARSER)/DependencyPart.h $(PARSER)/DependencyPart.cpp $(CLASSIFIER)/Part.h + $(CC) $(CFLAGS) $(PARSER)/DependencyPart.cpp + +DependencyPipe.o: $(PARSER)/DependencyPipe.h $(PARSER)/DependencyPipe.cpp $(CLASSIFIER)/Pipe.h $(PARSER)/DependencyOptions.h $(PARSER)/DependencyReader.h $(PARSER)/DependencyDictionary.h $(SEQUENCE)/TokenDictionary.h $(PARSER)/DependencyInstanceNumeric.h $(PARSER)/DependencyWriter.h $(PARSER)/DependencyPart.h $(PARSER)/DependencyFeatures.h $(PARSER)/DependencyDecoder.h + $(CC) $(CFLAGS) $(PARSER)/DependencyPipe.cpp + +DependencyReader.o: $(PARSER)/DependencyReader.h $(PARSER)/DependencyReader.cpp $(PARSER)/DependencyInstance.h $(CLASSIFIER)/Reader.h + $(CC) $(CFLAGS) $(PARSER)/DependencyReader.cpp + +DependencyWriter.o: $(PARSER)/DependencyWriter.h $(PARSER)/DependencyWriter.cpp $(PARSER)/DependencyInstance.h $(CLASSIFIER)/Writer.h + $(CC) $(CFLAGS) $(PARSER)/DependencyWriter.cpp + +##################### + +TaggerDictionary.o: $(TAGGER)/TaggerDictionary.h $(TAGGER)/TaggerDictionary.cpp $(TAGGER)/TaggerPipe.h $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h + $(CC) $(CFLAGS) $(TAGGER)/TaggerDictionary.cpp + +TaggerFeatures.o: $(TAGGER)/TaggerFeatures.h $(TAGGER)/TaggerFeatures.cpp $(TAGGER)/TaggerPipe.h $(TAGGER)/TaggerFeatureTemplates.h $(SEQUENCE)/SequenceFeatures.h $(SEQUENCE)/SequenceInstanceNumeric.h 
$(CLASSIFIER)/FeatureEncoder.h + $(CC) $(CFLAGS) $(TAGGER)/TaggerFeatures.cpp + +TaggerOptions.o: $(TAGGER)/TaggerOptions.h $(TAGGER)/TaggerOptions.cpp $(UTIL)/SerializationUtils.h $(SEQUENCE)/SequenceOptions.h + $(CC) $(CFLAGS) $(TAGGER)/TaggerOptions.cpp + +TaggerPipe.o: $(TAGGER)/TaggerPipe.h $(TAGGER)/TaggerPipe.cpp $(SEQUENCE)/SequencePipe.h $(TAGGER)/TaggerOptions.h $(TAGGER)/TaggerDictionary.h + $(CC) $(CFLAGS) $(TAGGER)/TaggerPipe.cpp + +##################### + +MorphologicalDictionary.o: $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.h $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.cpp $(MORPHOLOGICALTAGGER)/MorphologicalPipe.h $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h + $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.cpp + +MorphologicalFeatures.o: $(MORPHOLOGICALTAGGER)/MorphologicalFeatures.h $(MORPHOLOGICALTAGGER)/MorphologicalFeatures.cpp $(MORPHOLOGICALTAGGER)/MorphologicalPipe.h $(MORPHOLOGICALTAGGER)/MorphologicalFeatureTemplates.h $(SEQUENCE)/SequenceFeatures.h $(SEQUENCE)/SequenceInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h + $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalFeatures.cpp + +MorphologicalInstance.o: $(MORPHOLOGICALTAGGER)/MorphologicalInstance.h $(MORPHOLOGICALTAGGER)/MorphologicalInstance.cpp $(SEQUENCE)/SequenceInstance.h + $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalInstance.cpp + +MorphologicalInstanceNumeric.o: $(MORPHOLOGICALTAGGER)/MorphologicalInstanceNumeric.h $(MORPHOLOGICALTAGGER)/MorphologicalInstanceNumeric.cpp $(MORPHOLOGICALTAGGER)/MorphologicalInstance.h $(SEQUENCE)/SequenceInstanceNumeric.h $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.h + $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalInstanceNumeric.cpp + +MorphologicalOptions.o: $(MORPHOLOGICALTAGGER)/MorphologicalOptions.h $(MORPHOLOGICALTAGGER)/MorphologicalOptions.cpp $(UTIL)/SerializationUtils.h $(SEQUENCE)/SequenceOptions.h + $(CC) $(CFLAGS) 
$(MORPHOLOGICALTAGGER)/MorphologicalOptions.cpp + +MorphologicalPipe.o: $(MORPHOLOGICALTAGGER)/MorphologicalPipe.h $(MORPHOLOGICALTAGGER)/MorphologicalPipe.cpp $(SEQUENCE)/SequencePipe.h $(MORPHOLOGICALTAGGER)/MorphologicalOptions.h $(MORPHOLOGICALTAGGER)/MorphologicalDictionary.h + $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalPipe.cpp + +MorphologicalReader.o: $(MORPHOLOGICALTAGGER)/MorphologicalReader.h $(MORPHOLOGICALTAGGER)/MorphologicalReader.cpp $(MORPHOLOGICALTAGGER)/MorphologicalInstance.h $(SEQUENCE)/SequenceReader.h + $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalReader.cpp + +MorphologicalWriter.o: $(MORPHOLOGICALTAGGER)/MorphologicalWriter.h $(MORPHOLOGICALTAGGER)/MorphologicalWriter.cpp $(MORPHOLOGICALTAGGER)/MorphologicalInstance.h $(SEQUENCE)/SequenceWriter.h + $(CC) $(CFLAGS) $(MORPHOLOGICALTAGGER)/MorphologicalWriter.cpp + +##################### + +EntityDictionary.o: $(ENTITYRECOGNIZER)/EntityDictionary.h $(ENTITYRECOGNIZER)/EntityDictionary.cpp $(ENTITYRECOGNIZER)/EntityPipe.h $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityDictionary.cpp + +EntityFeatures.o: $(ENTITYRECOGNIZER)/EntityFeatures.h $(ENTITYRECOGNIZER)/EntityFeatures.cpp $(ENTITYRECOGNIZER)/EntityPipe.h $(ENTITYRECOGNIZER)/EntityFeatureTemplates.h $(SEQUENCE)/SequenceFeatures.h $(SEQUENCE)/SequenceInstanceNumeric.h $(CLASSIFIER)/FeatureEncoder.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityFeatures.cpp + +EntityOptions.o: $(ENTITYRECOGNIZER)/EntityOptions.h $(ENTITYRECOGNIZER)/EntityOptions.cpp $(UTIL)/SerializationUtils.h $(SEQUENCE)/SequenceOptions.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityOptions.cpp + +EntityPipe.o: $(ENTITYRECOGNIZER)/EntityPipe.h $(ENTITYRECOGNIZER)/EntityPipe.cpp $(SEQUENCE)/SequencePipe.h $(ENTITYRECOGNIZER)/EntityOptions.h $(ENTITYRECOGNIZER)/EntityDictionary.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityPipe.cpp + +EntityReader.o: 
$(ENTITYRECOGNIZER)/EntityReader.h $(ENTITYRECOGNIZER)/EntityReader.cpp $(ENTITYRECOGNIZER)/EntityInstance.h $(SEQUENCE)/SequenceReader.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityReader.cpp + +EntityWriter.o: $(ENTITYRECOGNIZER)/EntityWriter.h $(ENTITYRECOGNIZER)/EntityWriter.cpp $(ENTITYRECOGNIZER)/EntityInstance.h $(SEQUENCE)/SequenceWriter.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityWriter.cpp + +EntityInstance.o: $(ENTITYRECOGNIZER)/EntityInstance.h $(ENTITYRECOGNIZER)/EntityInstance.cpp $(SEQUENCE)/SequenceInstance.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityInstance.cpp + +EntityInstanceNumeric.o: $(ENTITYRECOGNIZER)/EntityInstanceNumeric.h $(ENTITYRECOGNIZER)/EntityInstanceNumeric.cpp $(ENTITYRECOGNIZER)/EntityInstance.h $(SEQUENCE)/SequenceInstanceNumeric.h $(ENTITYRECOGNIZER)/EntityDictionary.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityInstanceNumeric.cpp + +EntityDecoder.o: $(ENTITYRECOGNIZER)/EntityDecoder.h $(ENTITYRECOGNIZER)/EntityDecoder.cpp $(SEQUENCE)/SequencePart.h $(ENTITYRECOGNIZER)/EntityPipe.h $(SEQUENCE)/SequenceDecoder.h + $(CC) $(CFLAGS) $(ENTITYRECOGNIZER)/EntityDecoder.cpp + +##################### + +SequenceDecoder.o: $(SEQUENCE)/SequenceDecoder.h $(SEQUENCE)/SequenceDecoder.cpp $(SEQUENCE)/SequencePart.h $(SEQUENCE)/SequencePipe.h $(CLASSIFIER)/Decoder.h + $(CC) $(CFLAGS) $(SEQUENCE)/SequenceDecoder.cpp + +SequenceDictionary.o: $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/SequenceDictionary.cpp $(SEQUENCE)/SequencePipe.h $(CLASSIFIER)/Dictionary.h $(SEQUENCE)/TokenDictionary.h $(UTIL)/SerializationUtils.h + $(CC) $(CFLAGS) $(SEQUENCE)/SequenceDictionary.cpp + +SequenceInstance.o: $(SEQUENCE)/SequenceInstance.h $(SEQUENCE)/SequenceInstance.cpp $(CLASSIFIER)/Instance.h + $(CC) $(CFLAGS) $(SEQUENCE)/SequenceInstance.cpp + +SequenceInstanceNumeric.o: $(SEQUENCE)/SequenceInstanceNumeric.h $(SEQUENCE)/SequenceInstanceNumeric.cpp $(SEQUENCE)/SequenceInstance.h $(SEQUENCE)/SequenceDictionary.h + $(CC) $(CFLAGS) 
$(SEQUENCE)/SequenceInstanceNumeric.cpp + +SequenceOptions.o: $(SEQUENCE)/SequenceOptions.h $(SEQUENCE)/SequenceOptions.cpp $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Options.h + $(CC) $(CFLAGS) $(SEQUENCE)/SequenceOptions.cpp + +SequencePart.o: $(SEQUENCE)/SequencePart.h $(SEQUENCE)/SequencePart.cpp $(CLASSIFIER)/Part.h + $(CC) $(CFLAGS) $(SEQUENCE)/SequencePart.cpp + +SequencePipe.o: $(SEQUENCE)/SequencePipe.h $(SEQUENCE)/SequencePipe.cpp $(CLASSIFIER)/Pipe.h $(SEQUENCE)/SequenceOptions.h $(SEQUENCE)/SequenceReader.h $(SEQUENCE)/SequenceDictionary.h $(SEQUENCE)/TokenDictionary.h $(SEQUENCE)/SequenceInstanceNumeric.h $(SEQUENCE)/SequenceWriter.h $(SEQUENCE)/SequencePart.h $(SEQUENCE)/SequenceFeatures.h $(SEQUENCE)/SequenceDecoder.h + $(CC) $(CFLAGS) $(SEQUENCE)/SequencePipe.cpp + +SequenceReader.o: $(SEQUENCE)/SequenceReader.h $(SEQUENCE)/SequenceReader.cpp $(SEQUENCE)/SequenceInstance.h $(CLASSIFIER)/Reader.h + $(CC) $(CFLAGS) $(SEQUENCE)/SequenceReader.cpp + +SequenceWriter.o: $(SEQUENCE)/SequenceWriter.h $(SEQUENCE)/SequenceWriter.cpp $(SEQUENCE)/SequenceInstance.h $(CLASSIFIER)/Writer.h + $(CC) $(CFLAGS) $(SEQUENCE)/SequenceWriter.cpp + +TokenDictionary.o: $(SEQUENCE)/TokenDictionary.h $(SEQUENCE)/TokenDictionary.cpp $(CLASSIFIER)/Pipe.h $(UTIL)/SerializationUtils.h $(CLASSIFIER)/Dictionary.h $(CLASSIFIER)/Alphabet.h $(SEQUENCE)/SequenceReader.h $(PARSER)/DependencyReader.h + $(CC) $(CFLAGS) $(SEQUENCE)/TokenDictionary.cpp + +##################### + +Alphabet.o: $(CLASSIFIER)/Alphabet.h $(CLASSIFIER)/Alphabet.cpp $(UTIL)/SerializationUtils.h $(UTIL)/Utils.h + $(CC) $(CFLAGS) $(CLASSIFIER)/Alphabet.cpp + +Dictionary.o: $(CLASSIFIER)/Dictionary.h $(CLASSIFIER)/Dictionary.cpp + $(CC) $(CFLAGS) $(CLASSIFIER)/Dictionary.cpp + +Options.o: $(CLASSIFIER)/Options.h $(CLASSIFIER)/Options.cpp + $(CC) $(CFLAGS) $(CLASSIFIER)/Options.cpp + +Parameters.o: $(CLASSIFIER)/Parameters.h $(CLASSIFIER)/Parameters.cpp $(CLASSIFIER)/Features.h $(CLASSIFIER)/SparseParameterVector.h 
$(CLASSIFIER)/SparseLabeledParameterVector.h $(UTIL)/Utils.h + $(CC) $(CFLAGS) $(CLASSIFIER)/Parameters.cpp + +Pipe.o: $(CLASSIFIER)/Pipe.h $(CLASSIFIER)/Pipe.cpp $(CLASSIFIER)/Dictionary.h $(CLASSIFIER)/Features.h $(CLASSIFIER)/Part.h $(CLASSIFIER)/Reader.h $(CLASSIFIER)/Writer.h $(CLASSIFIER)/Options.h $(CLASSIFIER)/Decoder.h $(CLASSIFIER)/Parameters.h $(UTIL)/AlgUtils.h + $(CC) $(CFLAGS) $(CLASSIFIER)/Pipe.cpp + +Reader.o: $(CLASSIFIER)/Reader.h $(CLASSIFIER)/Reader.cpp $(CLASSIFIER)/Instance.h $(UTIL)/Utils.h + $(CC) $(CFLAGS) $(CLASSIFIER)/Reader.cpp + +Writer.o: $(CLASSIFIER)/Writer.h $(CLASSIFIER)/Writer.cpp $(CLASSIFIER)/Instance.h $(UTIL)/Utils.h + $(CC) $(CFLAGS) $(CLASSIFIER)/Writer.cpp + +##################### + +AlgUtils.o: $(UTIL)/AlgUtils.h $(UTIL)/AlgUtils.cpp + $(CC) $(CFLAGS) $(UTIL)/AlgUtils.cpp + +SerializationUtils.o: $(UTIL)/SerializationUtils.h $(UTIL)/SerializationUtils.cpp + $(CC) $(CFLAGS) $(UTIL)/SerializationUtils.cpp + +StringUtils.o: $(UTIL)/StringUtils.h $(UTIL)/StringUtils.cpp + $(CC) $(CFLAGS) $(UTIL)/StringUtils.cpp + +TimeUtils.o: $(UTIL)/TimeUtils.h $(UTIL)/TimeUtils.cpp + $(CC) $(CFLAGS) $(UTIL)/TimeUtils.cpp + +##################### + +clean: + rm -f *.o *~ libturboparser.a libturboparser.so diff --git a/libturboparser/TurboParserInterface.cpp b/libturboparser/TurboParserInterface.cpp index 7c903f8..fcfb724 100644 --- a/libturboparser/TurboParserInterface.cpp +++ b/libturboparser/TurboParserInterface.cpp @@ -1,387 +1,402 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "TurboParserInterface.h" - -namespace TurboParserInterface { -TurboTaggerWorker::TurboTaggerWorker() { - tagger_options_ = new TaggerOptions; - tagger_options_->Initialize(); - - tagger_pipe_ = new TaggerPipe(tagger_options_); - tagger_pipe_->Initialize(); -} - -TurboTaggerWorker::~TurboTaggerWorker() { - LOG(INFO) << "Deleting tagger pipe."; - delete tagger_pipe_; - LOG(INFO) << "Deleting 
tagger options."; - delete tagger_options_; -} - -void TurboTaggerWorker::LoadTaggerModel(const std::string &file_model) { - tagger_options_->SetModelFilePath(file_model); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - tagger_pipe_->LoadModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TurboTaggerWorker::Tag(const std::string &file_test, - const std::string &file_prediction) { - tagger_options_->SetTestFilePath(file_test); - tagger_options_->SetOutputFilePath(file_prediction); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - tagger_pipe_->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TurboTaggerWorker::TagSentence(SequenceInstance *sentence) { - tagger_pipe_->ClassifyInstance(sentence); -} - -TurboEntityRecognizerWorker::TurboEntityRecognizerWorker() { - entity_options_ = new EntityOptions; - entity_options_->Initialize(); - - entity_pipe_ = new EntityPipe(entity_options_); - entity_pipe_->Initialize(); -} - -TurboEntityRecognizerWorker::~TurboEntityRecognizerWorker() { - LOG(INFO) << "Deleting entity recognizer pipe."; - delete entity_pipe_; - LOG(INFO) << "Deleting entity recognizer options."; - delete entity_options_; -} - -void TurboEntityRecognizerWorker::LoadEntityRecognizerModel( - const std::string &file_model) { - entity_options_->SetModelFilePath(file_model); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - entity_pipe_->LoadModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." 
<< endl; -} - -void TurboEntityRecognizerWorker::Tag(const std::string &file_test, - const std::string &file_prediction) { - entity_options_->SetTestFilePath(file_test); - entity_options_->SetOutputFilePath(file_prediction); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - entity_pipe_->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TurboEntityRecognizerWorker::TagSentence(EntityInstance *sentence) { - entity_pipe_->ClassifyInstance(sentence); -} - -TurboParserWorker::TurboParserWorker() { - parser_options_ = new DependencyOptions; - parser_options_->Initialize(); - - parser_pipe_ = new DependencyPipe(parser_options_); - parser_pipe_->Initialize(); -} - -TurboParserWorker::~TurboParserWorker() { - LOG(INFO) << "Deleting parser pipe."; - delete parser_pipe_; - LOG(INFO) << "Deleting parser options."; - delete parser_options_; -} - -void TurboParserWorker::LoadParserModel(const std::string &file_model) { - parser_options_->SetModelFilePath(file_model); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - parser_pipe_->LoadModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TurboParserWorker::Parse(const std::string &file_test, - const std::string &file_prediction) { - parser_options_->SetTestFilePath(file_test); - parser_options_->SetOutputFilePath(file_prediction); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - parser_pipe_->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." 
<< endl; -} - -void TurboParserWorker::ParseSentence(DependencyInstance *sentence) { - parser_pipe_->ClassifyInstance(sentence); -} - -TurboSemanticParserWorker::TurboSemanticParserWorker() { - semantic_options_ = new SemanticOptions; - semantic_options_->Initialize(); - - semantic_pipe_ = new SemanticPipe(semantic_options_); - semantic_pipe_->Initialize(); -} - -TurboSemanticParserWorker::~TurboSemanticParserWorker() { - LOG(INFO) << "Deleting semantic pipe."; - delete semantic_pipe_; - LOG(INFO) << "Deleting semantic options."; - delete semantic_options_; -} - -void TurboSemanticParserWorker::LoadSemanticParserModel( - const std::string &file_model) { - semantic_options_->SetModelFilePath(file_model); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - LOG(INFO) << "Loading model file " << file_model; - - semantic_pipe_->LoadModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TurboSemanticParserWorker::ParseSemanticDependencies( - const std::string &file_test, - const std::string &file_prediction) { - semantic_options_->SetTestFilePath(file_test); - semantic_options_->SetOutputFilePath(file_prediction); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - semantic_pipe_->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." 
<< endl; -} - -void TurboSemanticParserWorker::ParseSemanticDependenciesFromSentence( - SemanticInstance *sentence) { - semantic_pipe_->ClassifyInstance(sentence); -} - -TurboCoreferenceResolverWorker::TurboCoreferenceResolverWorker() { - coreference_options_ = new CoreferenceOptions; - coreference_options_->Initialize(); - - coreference_pipe_ = new CoreferencePipe(coreference_options_); - coreference_pipe_->Initialize(); -} - -TurboCoreferenceResolverWorker::~TurboCoreferenceResolverWorker() { - LOG(INFO) << "Deleting coreference pipe."; - delete coreference_pipe_; - LOG(INFO) << "Deleting coreference options."; - delete coreference_options_; -} - -void TurboCoreferenceResolverWorker::LoadCoreferenceResolverModel( - const std::string &file_model) { - coreference_options_->SetModelFilePath(file_model); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - LOG(INFO) << "Loading model file " << file_model; - - coreference_pipe_->LoadModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TurboCoreferenceResolverWorker::ResolveCoreferences( - const std::string &file_test, - const std::string &file_prediction) { - coreference_options_->SetTestFilePath(file_test); - coreference_options_->SetOutputFilePath(file_prediction); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - coreference_pipe_->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." 
<< endl; -} - -void TurboCoreferenceResolverWorker::ResolveCoreferencesFromDocument( - CoreferenceDocument *document) { - coreference_pipe_->ClassifyInstance(document); -} - -TurboMorphologicalTaggerWorker::TurboMorphologicalTaggerWorker() { - morphological_tagger_options_ = new MorphologicalOptions; - morphological_tagger_options_->Initialize(); - - morphological_tagger_pipe_ = - new MorphologicalPipe(morphological_tagger_options_); - morphological_tagger_pipe_->Initialize(); -} - -TurboMorphologicalTaggerWorker::~TurboMorphologicalTaggerWorker() { - LOG(INFO) << "Deleting tagger pipe."; - delete morphological_tagger_pipe_; - LOG(INFO) << "Deleting tagger options."; - delete morphological_tagger_options_; -} - -void TurboMorphologicalTaggerWorker::LoadMorphologicalTaggerModel( - const std::string - &file_model) { - morphological_tagger_options_->SetModelFilePath(file_model); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - morphological_tagger_pipe_->LoadModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TurboMorphologicalTaggerWorker::Tag(const std::string &file_test, - const std::string &file_prediction) { - morphological_tagger_options_->SetTestFilePath(file_test); - morphological_tagger_options_->SetOutputFilePath(file_prediction); - - int time; - timeval start, end; - gettimeofday(&start, NULL); - - morphological_tagger_pipe_->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TurboMorphologicalTaggerWorker::TagSentence( - MorphologicalInstance *sentence) { - morphological_tagger_pipe_->ClassifyInstance(sentence); -} - -TurboParserInterface::TurboParserInterface() { - argc_ = 0; - argv_ = NULL; - BuildArgumentList(); - - // Initialize Google's logging library. 
- google::InitGoogleLogging(argv_[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc_, &argv_, false); - -#ifdef _WIN32 - google::LogToStderr(); -#endif -} - -TurboParserInterface::~TurboParserInterface() { - LOG(INFO) << "Deleting tagger workers."; - DeleteAllTaggers(); - - LOG(INFO) << "Deleting morphological tagger workers."; - DeleteAllMorphologicalTaggers(); - - LOG(INFO) << "Deleting entity recognizer workers."; - DeleteAllEntityRecognizers(); - - LOG(INFO) << "Deleting parser workers."; - DeleteAllParsers(); - - LOG(INFO) << "Deleting semantic parser workers."; - DeleteAllSemanticParsers(); - - LOG(INFO) << "Deleting coreference resolver workers."; - DeleteAllCoreferenceResolvers(); - - LOG(INFO) << "Clearing argument list."; - ClearArgumentList(); - - LOG(INFO) << "Done."; -} -} // namespace TurboParserInterface. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "TurboParserInterface.h" + +namespace TurboParserInterface { +TurboTaggerWorker::TurboTaggerWorker() { + tagger_options_ = new TaggerOptions; + tagger_options_->Initialize(); + + tagger_pipe_ = new TaggerPipe(tagger_options_); + tagger_pipe_->Initialize(); +} + +TurboTaggerWorker::~TurboTaggerWorker() { + LOG(INFO) << "Deleting tagger pipe."; + delete tagger_pipe_; + LOG(INFO) << "Deleting tagger options."; + delete tagger_options_; +} + +void TurboTaggerWorker::LoadTaggerModel(const std::string &file_model) { + tagger_options_->SetModelFilePath(file_model); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + LOG(INFO) << "Loading model file " << file_model; + + tagger_pipe_->LoadModelFile(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." 
<< endl; +} + +void TurboTaggerWorker::Tag(const std::string &file_test, + const std::string &file_prediction) { + tagger_options_->SetTestFilePath(file_test); + tagger_options_->SetOutputFilePath(file_prediction); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + tagger_pipe_->Run(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." << endl; +} + +void TurboTaggerWorker::TagSentence(SequenceInstance *sentence) { + if (sentence->size() == 0) + return; + tagger_pipe_->ClassifyInstance(sentence); +} + +TurboEntityRecognizerWorker::TurboEntityRecognizerWorker() { + entity_options_ = new EntityOptions; + entity_options_->Initialize(); + + entity_pipe_ = new EntityPipe(entity_options_); + entity_pipe_->Initialize(); +} + +TurboEntityRecognizerWorker::~TurboEntityRecognizerWorker() { + LOG(INFO) << "Deleting entity recognizer pipe."; + delete entity_pipe_; + LOG(INFO) << "Deleting entity recognizer options."; + delete entity_options_; +} + +void TurboEntityRecognizerWorker::LoadEntityRecognizerModel( + const std::string &file_model) { + entity_options_->SetModelFilePath(file_model); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + LOG(INFO) << "Loading model file " << file_model; + + entity_pipe_->LoadModelFile(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." << endl; +} + +void TurboEntityRecognizerWorker::Tag(const std::string &file_test, + const std::string &file_prediction) { + entity_options_->SetTestFilePath(file_test); + entity_options_->SetOutputFilePath(file_prediction); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + entity_pipe_->Run(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." 
<< endl; +} + +void TurboEntityRecognizerWorker::TagSentence(EntityInstance *sentence) { + if (sentence->size() == 0) + return; + entity_pipe_->ClassifyInstance(sentence); +} + +TurboParserWorker::TurboParserWorker() { + parser_options_ = new DependencyOptions; + parser_options_->Initialize(); + + parser_pipe_ = new DependencyPipe(parser_options_); + parser_pipe_->Initialize(); +} + +TurboParserWorker::~TurboParserWorker() { + LOG(INFO) << "Deleting parser pipe."; + delete parser_pipe_; + LOG(INFO) << "Deleting parser options."; + delete parser_options_; +} + +void TurboParserWorker::LoadParserModel(const std::string &file_model) { + parser_options_->SetModelFilePath(file_model); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + LOG(INFO) << "Loading model file " << file_model; + + parser_pipe_->LoadModelFile(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." << endl; +} + +void TurboParserWorker::Parse(const std::string &file_test, + const std::string &file_prediction) { + parser_options_->SetTestFilePath(file_test); + parser_options_->SetOutputFilePath(file_prediction); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + parser_pipe_->Run(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." 
<< endl; +} + +void TurboParserWorker::ParseSentence(DependencyInstance *sentence) { + if (sentence->size() == 0) + return; + parser_pipe_->ClassifyInstance(sentence); +} + +TurboSemanticParserWorker::TurboSemanticParserWorker() { + semantic_options_ = new SemanticOptions; + semantic_options_->Initialize(); + + semantic_pipe_ = new SemanticPipe(semantic_options_); + semantic_pipe_->Initialize(); +} + +TurboSemanticParserWorker::~TurboSemanticParserWorker() { + LOG(INFO) << "Deleting semantic pipe."; + delete semantic_pipe_; + LOG(INFO) << "Deleting semantic options."; + delete semantic_options_; +} + +void TurboSemanticParserWorker::LoadSemanticParserModel( + const std::string &file_model) { + semantic_options_->SetModelFilePath(file_model); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + LOG(INFO) << "Loading model file " << file_model; + + semantic_pipe_->LoadModelFile(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." << endl; +} + +void TurboSemanticParserWorker::ParseSemanticDependencies( + const std::string &file_test, + const std::string &file_prediction) { + semantic_options_->SetTestFilePath(file_test); + semantic_options_->SetOutputFilePath(file_prediction); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + semantic_pipe_->Run(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." 
<< endl; +} + +void TurboSemanticParserWorker::ParseSemanticDependenciesFromSentence( + SemanticInstance *sentence) { + if (sentence->size() == 0) + return; + semantic_pipe_->ClassifyInstance(sentence); +} + +TurboCoreferenceResolverWorker::TurboCoreferenceResolverWorker() { + coreference_options_ = new CoreferenceOptions; + coreference_options_->Initialize(); + + coreference_pipe_ = new CoreferencePipe(coreference_options_); + coreference_pipe_->Initialize(); +} + +TurboCoreferenceResolverWorker::~TurboCoreferenceResolverWorker() { + LOG(INFO) << "Deleting coreference pipe."; + delete coreference_pipe_; + LOG(INFO) << "Deleting coreference options."; + delete coreference_options_; +} + +void TurboCoreferenceResolverWorker::LoadCoreferenceResolverModel( + const std::string &file_model) { + coreference_options_->SetModelFilePath(file_model); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + LOG(INFO) << "Loading model file " << file_model; + + coreference_pipe_->LoadModelFile(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." << endl; +} + +void TurboCoreferenceResolverWorker::ResolveCoreferences( + const std::string &file_test, + const std::string &file_prediction) { + coreference_options_->SetTestFilePath(file_test); + coreference_options_->SetOutputFilePath(file_prediction); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + coreference_pipe_->Run(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." 
<< endl; +} + +void TurboCoreferenceResolverWorker::ResolveCoreferencesFromDocument( + CoreferenceDocument *document) { + if (document->GetNumSentences() == 0) + return; + coreference_pipe_->ClassifyInstance(document); +} + +TurboMorphologicalTaggerWorker::TurboMorphologicalTaggerWorker() { + morphological_tagger_options_ = new MorphologicalOptions; + morphological_tagger_options_->Initialize(); + + morphological_tagger_pipe_ = + new MorphologicalPipe(morphological_tagger_options_); + morphological_tagger_pipe_->Initialize(); +} + +TurboMorphologicalTaggerWorker::~TurboMorphologicalTaggerWorker() { + LOG(INFO) << "Deleting tagger pipe."; + delete morphological_tagger_pipe_; + LOG(INFO) << "Deleting tagger options."; + delete morphological_tagger_options_; +} + +void TurboMorphologicalTaggerWorker::LoadMorphologicalTaggerModel( + const std::string + &file_model) { + morphological_tagger_options_->SetModelFilePath(file_model); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + LOG(INFO) << "Loading model file " << file_model; + + morphological_tagger_pipe_->LoadModelFile(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." << endl; +} + +void TurboMorphologicalTaggerWorker::Tag(const std::string &file_test, + const std::string &file_prediction) { + morphological_tagger_options_->SetTestFilePath(file_test); + morphological_tagger_options_->SetOutputFilePath(file_prediction); + + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + morphological_tagger_pipe_->Run(); + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Took " << time << " sec." 
<< endl; +} + +void TurboMorphologicalTaggerWorker::TagSentence( + MorphologicalInstance *sentence) { + if (sentence->size() == 0) + return; + morphological_tagger_pipe_->ClassifyInstance(sentence); +} + +TurboParserInterface::TurboParserInterface() { + argc_ = 0; + argv_ = NULL; + BuildArgumentList(); + +#ifndef BYPASSINIT_GLOG + std::cout << "Init glog from libturboparser..." << std::endl; + InitGlog(argv_[0]); +#else + std::cout << "Bypassing glog in libturboparser..." << std::endl; +#endif + + // Initialize Google's logging library. + //google::InitGoogleLogging(argv_[0]); + + // Parse command line flags. + //google::ParseCommandLineFlags(&argc_, &argv_, true); + +//#ifdef _WIN32 +// google::LogToStderr(); +//#endif +} + +TurboParserInterface::~TurboParserInterface() { + LOG(INFO) << "Deleting tagger workers."; + DeleteAllTaggers(); + + LOG(INFO) << "Deleting morphological tagger workers."; + DeleteAllMorphologicalTaggers(); + + LOG(INFO) << "Deleting entity recognizer workers."; + DeleteAllEntityRecognizers(); + + LOG(INFO) << "Deleting parser workers."; + DeleteAllParsers(); + + LOG(INFO) << "Deleting semantic parser workers."; + DeleteAllSemanticParsers(); + + LOG(INFO) << "Deleting coreference resolver workers."; + DeleteAllCoreferenceResolvers(); + + LOG(INFO) << "Clearing argument list."; + ClearArgumentList(); + + LOG(INFO) << "Done."; +} +} // namespace TurboParserInterface. 
diff --git a/libturboparser/TurboParserInterface.h b/libturboparser/TurboParserInterface.h index 8d475aa..5df8187 100644 --- a/libturboparser/TurboParserInterface.h +++ b/libturboparser/TurboParserInterface.h @@ -1,225 +1,225 @@ -#include -#include -#include "TaggerPipe.h" -#include "EntityPipe.h" -#include "DependencyPipe.h" -#include "SemanticPipe.h" -#include "CoreferencePipe.h" -#include "MorphologicalPipe.h" - -namespace TurboParserInterface { -class TurboTaggerWorker { -public: - TurboTaggerWorker(); - virtual ~TurboTaggerWorker(); - - void LoadTaggerModel(const std::string &file_model); - - void Tag(const std::string &file_test, - const std::string &file_prediction); - - void TagSentence(SequenceInstance *sentence); - -private: - TaggerOptions *tagger_options_; - TaggerPipe *tagger_pipe_; -}; - -class TurboEntityRecognizerWorker { -public: - TurboEntityRecognizerWorker(); - virtual ~TurboEntityRecognizerWorker(); - - void LoadEntityRecognizerModel(const std::string &file_model); - - void Tag(const std::string &file_test, - const std::string &file_prediction); - - void TagSentence(EntityInstance *sentence); - -private: - EntityOptions *entity_options_; - EntityPipe *entity_pipe_; -}; - -class TurboParserWorker { -public: - TurboParserWorker(); - virtual ~TurboParserWorker(); - - void LoadParserModel(const std::string &file_model); - - void Parse(const std::string &file_test, - const std::string &file_prediction); - - void ParseSentence(DependencyInstance *sentence); - -private: - DependencyOptions *parser_options_; - DependencyPipe *parser_pipe_; -}; - -class TurboSemanticParserWorker { -public: - TurboSemanticParserWorker(); - virtual ~TurboSemanticParserWorker(); - - void LoadSemanticParserModel(const std::string &file_model); - - void ParseSemanticDependencies(const std::string &file_test, - const std::string &file_prediction); - - void ParseSemanticDependenciesFromSentence(SemanticInstance *sentence); - -private: - SemanticOptions *semantic_options_; - 
SemanticPipe *semantic_pipe_; -}; - -class TurboCoreferenceResolverWorker { -public: - TurboCoreferenceResolverWorker(); - virtual ~TurboCoreferenceResolverWorker(); - - void LoadCoreferenceResolverModel(const std::string &file_model); - - void ResolveCoreferences(const std::string &file_test, - const std::string &file_prediction); - - void ResolveCoreferencesFromDocument(CoreferenceDocument *document); - -private: - CoreferenceOptions *coreference_options_; - CoreferencePipe *coreference_pipe_; -}; - -class TurboMorphologicalTaggerWorker { -public: - TurboMorphologicalTaggerWorker(); - virtual ~TurboMorphologicalTaggerWorker(); - - void LoadMorphologicalTaggerModel(const std::string &file_model); - - void Tag(const std::string &file_test, - const std::string &file_prediction); - - void TagSentence(MorphologicalInstance *sentence); - -private: - MorphologicalOptions *morphological_tagger_options_; - MorphologicalPipe *morphological_tagger_pipe_; -}; - -class TurboParserInterface { -public: - TurboParserInterface(); - virtual ~TurboParserInterface(); - - void ClearArgumentList() { - for (int i = 0; i < argc_; ++i) { - if (argv_[i]) free(argv_[i]); - } - delete[] argv_; - argc_ = 0; - } - - void BuildArgumentList() { - argc_ = 2; - argv_ = new char*[argc_]; - argv_[0] = strdup("TurboParser"); - argv_[1] = strdup("--logtostderr"); - } - - TurboTaggerWorker *CreateTagger() { - TurboTaggerWorker *tagger = new TurboTaggerWorker(); - taggers_.push_back(tagger); - return tagger; - } - - TurboEntityRecognizerWorker *CreateEntityRecognizer() { - TurboEntityRecognizerWorker *entity_recognizer = - new TurboEntityRecognizerWorker(); - entity_recognizers_.push_back(entity_recognizer); - return entity_recognizer; - } - - TurboParserWorker *CreateParser() { - TurboParserWorker *parser = new TurboParserWorker(); - parsers_.push_back(parser); - return parser; - } - - TurboSemanticParserWorker *CreateSemanticParser() { - TurboSemanticParserWorker *semantic_parser = - new 
TurboSemanticParserWorker(); - semantic_parsers_.push_back(semantic_parser); - return semantic_parser; - } - - TurboCoreferenceResolverWorker *CreateCoreferenceResolver() { - TurboCoreferenceResolverWorker *coreference_resolver = - new TurboCoreferenceResolverWorker(); - coreference_resolvers_.push_back(coreference_resolver); - return coreference_resolver; - } - - TurboMorphologicalTaggerWorker *CreateMorphologicalTagger() { - TurboMorphologicalTaggerWorker *morphological_tagger = - new TurboMorphologicalTaggerWorker(); - morphological_taggers_.push_back(morphological_tagger); - return morphological_tagger; - } - - void DeleteAllTaggers() { - for (int i = 0; i < taggers_.size(); ++i) { - delete taggers_[i]; - } - taggers_.clear(); - } - - void DeleteAllEntityRecognizers() { - for (int i = 0; i < entity_recognizers_.size(); ++i) { - delete entity_recognizers_[i]; - } - entity_recognizers_.clear(); - } - - void DeleteAllParsers() { - for (int i = 0; i < parsers_.size(); ++i) { - delete parsers_[i]; - } - parsers_.clear(); - } - - void DeleteAllSemanticParsers() { - for (int i = 0; i < semantic_parsers_.size(); ++i) { - delete semantic_parsers_[i]; - } - semantic_parsers_.clear(); - } - - void DeleteAllCoreferenceResolvers() { - for (int i = 0; i < coreference_resolvers_.size(); ++i) { - delete coreference_resolvers_[i]; - } - coreference_resolvers_.clear(); - } - - void DeleteAllMorphologicalTaggers() { - for (int i = 0; i < morphological_taggers_.size(); ++i) { - delete morphological_taggers_[i]; - } - morphological_taggers_.clear(); - } - -private: - int argc_; - char** argv_; - std::vector taggers_; - std::vector parsers_; - std::vector semantic_parsers_; - std::vector entity_recognizers_; - std::vector coreference_resolvers_; - std::vector morphological_taggers_; -}; -} // namespace TurboParserInterface. 
+#include +#include +#include "TaggerPipe.h" +#include "EntityPipe.h" +#include "DependencyPipe.h" +#include "SemanticPipe.h" +#include "CoreferencePipe.h" +#include "MorphologicalPipe.h" + +namespace TurboParserInterface { +class TurboTaggerWorker { +public: + TurboTaggerWorker(); + virtual ~TurboTaggerWorker(); + + void LoadTaggerModel(const std::string &file_model); + + void Tag(const std::string &file_test, + const std::string &file_prediction); + + void TagSentence(SequenceInstance *sentence); + +private: + TaggerOptions *tagger_options_; + TaggerPipe *tagger_pipe_; +}; + +class TurboEntityRecognizerWorker { +public: + TurboEntityRecognizerWorker(); + virtual ~TurboEntityRecognizerWorker(); + + void LoadEntityRecognizerModel(const std::string &file_model); + + void Tag(const std::string &file_test, + const std::string &file_prediction); + + void TagSentence(EntityInstance *sentence); + +private: + EntityOptions *entity_options_; + EntityPipe *entity_pipe_; +}; + +class TurboParserWorker { +public: + TurboParserWorker(); + virtual ~TurboParserWorker(); + + void LoadParserModel(const std::string &file_model); + + void Parse(const std::string &file_test, + const std::string &file_prediction); + + void ParseSentence(DependencyInstance *sentence); + +private: + DependencyOptions *parser_options_; + DependencyPipe *parser_pipe_; +}; + +class TurboSemanticParserWorker { +public: + TurboSemanticParserWorker(); + virtual ~TurboSemanticParserWorker(); + + void LoadSemanticParserModel(const std::string &file_model); + + void ParseSemanticDependencies(const std::string &file_test, + const std::string &file_prediction); + + void ParseSemanticDependenciesFromSentence(SemanticInstance *sentence); + +private: + SemanticOptions *semantic_options_; + SemanticPipe *semantic_pipe_; +}; + +class TurboCoreferenceResolverWorker { +public: + TurboCoreferenceResolverWorker(); + virtual ~TurboCoreferenceResolverWorker(); + + void LoadCoreferenceResolverModel(const std::string 
&file_model); + + void ResolveCoreferences(const std::string &file_test, + const std::string &file_prediction); + + void ResolveCoreferencesFromDocument(CoreferenceDocument *document); + +private: + CoreferenceOptions *coreference_options_; + CoreferencePipe *coreference_pipe_; +}; + +class TurboMorphologicalTaggerWorker { +public: + TurboMorphologicalTaggerWorker(); + virtual ~TurboMorphologicalTaggerWorker(); + + void LoadMorphologicalTaggerModel(const std::string &file_model); + + void Tag(const std::string &file_test, + const std::string &file_prediction); + + void TagSentence(MorphologicalInstance *sentence); + +private: + MorphologicalOptions *morphological_tagger_options_; + MorphologicalPipe *morphological_tagger_pipe_; +}; + +class TurboParserInterface { +public: + TurboParserInterface(); + virtual ~TurboParserInterface(); + + void ClearArgumentList() { + for (int i = 0; i < argc_; ++i) { + if (argv_[i]) free(argv_[i]); + } + delete[] argv_; + argc_ = 0; + } + + void BuildArgumentList() { + argc_ = 2; + argv_ = new char*[argc_]; + argv_[0] = strdup("TurboParser"); + argv_[1] = strdup("--logtostderr"); + } + + TurboTaggerWorker *CreateTagger() { + TurboTaggerWorker *tagger = new TurboTaggerWorker(); + taggers_.push_back(tagger); + return tagger; + } + + TurboEntityRecognizerWorker *CreateEntityRecognizer() { + TurboEntityRecognizerWorker *entity_recognizer = + new TurboEntityRecognizerWorker(); + entity_recognizers_.push_back(entity_recognizer); + return entity_recognizer; + } + + TurboParserWorker *CreateParser() { + TurboParserWorker *parser = new TurboParserWorker(); + parsers_.push_back(parser); + return parser; + } + + TurboSemanticParserWorker *CreateSemanticParser() { + TurboSemanticParserWorker *semantic_parser = + new TurboSemanticParserWorker(); + semantic_parsers_.push_back(semantic_parser); + return semantic_parser; + } + + TurboCoreferenceResolverWorker *CreateCoreferenceResolver() { + TurboCoreferenceResolverWorker *coreference_resolver = + 
new TurboCoreferenceResolverWorker(); + coreference_resolvers_.push_back(coreference_resolver); + return coreference_resolver; + } + + TurboMorphologicalTaggerWorker *CreateMorphologicalTagger() { + TurboMorphologicalTaggerWorker *morphological_tagger = + new TurboMorphologicalTaggerWorker(); + morphological_taggers_.push_back(morphological_tagger); + return morphological_tagger; + } + + void DeleteAllTaggers() { + for (int i = 0; i < taggers_.size(); ++i) { + delete taggers_[i]; + } + taggers_.clear(); + } + + void DeleteAllEntityRecognizers() { + for (int i = 0; i < entity_recognizers_.size(); ++i) { + delete entity_recognizers_[i]; + } + entity_recognizers_.clear(); + } + + void DeleteAllParsers() { + for (int i = 0; i < parsers_.size(); ++i) { + delete parsers_[i]; + } + parsers_.clear(); + } + + void DeleteAllSemanticParsers() { + for (int i = 0; i < semantic_parsers_.size(); ++i) { + delete semantic_parsers_[i]; + } + semantic_parsers_.clear(); + } + + void DeleteAllCoreferenceResolvers() { + for (int i = 0; i < coreference_resolvers_.size(); ++i) { + delete coreference_resolvers_[i]; + } + coreference_resolvers_.clear(); + } + + void DeleteAllMorphologicalTaggers() { + for (int i = 0; i < morphological_taggers_.size(); ++i) { + delete morphological_taggers_[i]; + } + morphological_taggers_.clear(); + } + +private: + int argc_; + char** argv_; + std::vector taggers_; + std::vector parsers_; + std::vector semantic_parsers_; + std::vector entity_recognizers_; + std::vector coreference_resolvers_; + std::vector morphological_taggers_; +}; +} // namespace TurboParserInterface diff --git a/morphological_tagger/scripts/train_test_morphological_tagger.sh b/morphological_tagger/scripts/train_test_morphological_tagger.sh old mode 100755 new mode 100644 diff --git a/python/nlp_document.py b/python/nlp_document.py index bd1ca4b..3713cc8 100644 --- a/python/nlp_document.py +++ b/python/nlp_document.py @@ -1,68 +1,68 @@ -import turboparser as tp -from nlp_sentence import 
NLPSentence - -class NLPDocument(dict): - def __init__(self, sentences): - self['sentences'] = sentences - - def compute_coreferences(self, worker): - sentences = self['sentences'] - coreference_sentences = [] - for sentence in sentences: - sentence['coreference_spans'] = None - words = sentence['words'] - tags = sentence['tags'] - lemmas = sentence['lemmas'] - # TurboParser assumes 1-based indexing. - heads = [h+1 for h in sentence['heads']] - deprels = sentence['dependency_relations'] - # For now, don't use this (must be coded as spans). - entity_tags = sentence['entity_tags'] - feats = [[] for word in words] - predicate_names = [] - predicate_indices = [] - argument_roles = [] - argument_indices = [] - speakers = ['-' for word in words] - # TurboParser requires pre-appending a dummy root symbol. - words_with_root = ['_root_'] + words - lemmas_with_root = ['_root_'] + lemmas - tags_with_root = ['_root_'] + tags - feats_with_root = [['_root_']] + feats - deprels_with_root = ['_root_'] + deprels - heads_with_root = [-1] + heads - speakers_with_root = ['__'] + speakers - p_entity_spans = [] - p_constituent_spans = [] - p_coreference_spans = [] - coreference_sentence = tp.PCoreferenceSentence() - coreference_sentence.initialize('', \ - words_with_root, \ - lemmas_with_root, \ - tags_with_root, \ - tags_with_root, \ - feats_with_root, \ - deprels_with_root, \ - heads_with_root, \ - predicate_names, \ - predicate_indices, \ - argument_roles, \ - argument_indices, \ - speakers_with_root, \ - p_entity_spans, \ - p_constituent_spans, \ - p_coreference_spans) - coreference_sentences.append(coreference_sentence) - - coreference_document = tp.PCoreferenceDocument() - coreference_document.initialize('', 0, coreference_sentences) - worker.coreference_resolver.resolve_coreferences_from_document( \ - coreference_document) - - for i, sentence in enumerate(sentences): - coreference_sentence = coreference_document.get_sentence(i) - coreference_spans = 
coreference_sentence.get_coreference_spans() - # Convert back to 0-based indexing. - sentence['coreference_spans'] = \ - [(span.start()-1, span.end()-1, span.name()) \ - for span in coreference_spans] +import turboparser as tp +from nlp_sentence import NLPSentence + +class NLPDocument(dict): + def __init__(self, sentences): + self['sentences'] = sentences + + def compute_coreferences(self, worker): + sentences = self['sentences'] + coreference_sentences = [] + for sentence in sentences: + sentence['coreference_spans'] = None + words = sentence['words'] + tags = sentence['tags'] + lemmas = sentence['lemmas'] + # TurboParser assumes 1-based indexing. + heads = [h+1 for h in sentence['heads']] + deprels = sentence['dependency_relations'] + # For now, don't use this (must be coded as spans). + entity_tags = sentence['entity_tags'] + feats = [[] for word in words] + predicate_names = [] + predicate_indices = [] + argument_roles = [] + argument_indices = [] + speakers = ['-' for word in words] + # TurboParser requires pre-appending a dummy root symbol. 
+ words_with_root = ['_root_'] + words + lemmas_with_root = ['_root_'] + lemmas + tags_with_root = ['_root_'] + tags + feats_with_root = [['_root_']] + feats + deprels_with_root = ['_root_'] + deprels + heads_with_root = [-1] + heads + speakers_with_root = ['__'] + speakers + p_entity_spans = [] + p_constituent_spans = [] + p_coreference_spans = [] + coreference_sentence = tp.PCoreferenceSentence() + coreference_sentence.initialize('', \ + words_with_root, \ + lemmas_with_root, \ + tags_with_root, \ + tags_with_root, \ + feats_with_root, \ + deprels_with_root, \ + heads_with_root, \ + predicate_names, \ + predicate_indices, \ + argument_roles, \ + argument_indices, \ + speakers_with_root, \ + p_entity_spans, \ + p_constituent_spans, \ + p_coreference_spans) + coreference_sentences.append(coreference_sentence) + + coreference_document = tp.PCoreferenceDocument() + coreference_document.initialize('', 0, coreference_sentences) + worker.coreference_resolver.resolve_coreferences_from_document( \ + coreference_document) + + for i, sentence in enumerate(sentences): + coreference_sentence = coreference_document.get_sentence(i) + coreference_spans = coreference_sentence.get_coreference_spans() + # Convert back to 0-based indexing. + sentence['coreference_spans'] = \ + [(span.start()-1, span.end()-1, span.name()) \ + for span in coreference_spans] diff --git a/python/nlp_pipeline.py b/python/nlp_pipeline.py index 3a820b5..d48c6bf 100644 --- a/python/nlp_pipeline.py +++ b/python/nlp_pipeline.py @@ -1,3 +1,4 @@ +from __future__ import print_function import nltk from tokenizer.universal_word_tokenizer import UniversalWordTokenizer import lemmatizer @@ -8,6 +9,8 @@ from span import Span import os import pdb +import sys +from builtins import range class NLPPipelineWorker: def __init__(self, pipeline, language): @@ -20,7 +23,7 @@ def __init__(self, pipeline, language): self.coreference_resolver = None if language not in pipeline.models: - print 'Error: no model for language %s.' 
% language + print('Error: no model for language %s.' % language) raise NotImplementedError if 'splitter' in pipeline.models[language]: @@ -61,7 +64,11 @@ class NLPPipeline: def __init__(self): # Load the initialization file. configuration_filepath = os.path.dirname(os.path.realpath(__file__)) + \ - os.sep + 'nlp_pipeline.config' + os.sep + if os.name == 'nt': + configuration_filepath += 'nlp_pipeline.win.config' + else: + configuration_filepath += 'nlp_pipeline.config' self.models = {} self.load_configuration_file(configuration_filepath) self.turbo_interface = tp.PTurboParser() @@ -84,7 +91,7 @@ def load_configuration_file(self, filepath): continue if language == '': language = line - print 'Loading information for %s' % language + print('Loading information for %s' % language) self.models[language] = {} else: pair = line.split('=') @@ -183,15 +190,20 @@ def parse_semantic_dependencies(self, tokenized_sentence, tags, lemmas, #sent['heads'] = [h-1 for h in heads] sent['dependency_relations'] = deprels sent.compute_semantic_dependencies(worker) - num_predicates = len(sent['predicate_names']) - predicates = ['_' for token in tokenized_sentence] - argument_lists = [['_' for k in xrange(num_predicates)] \ - for token in tokenized_sentence] - for k in xrange(num_predicates): + num_predicates = len(sent['predicate_names']) + if sys.version_info[0] == 2: + predicates = ['_' for token in tokenized_sentence] + argument_lists = [['_' for k in xrange(num_predicates)] \ + for token in tokenized_sentence] + if sys.version_info[0] == 3: + predicates = [b'_' for token in tokenized_sentence] + argument_lists = [[b'_' for k in range(num_predicates)] \ + for token in tokenized_sentence] + for k in range(num_predicates): name = sent['predicate_names'][k] p = sent['predicate_indices'][k] predicates[p] = name - for l in xrange(len(sent['argument_roles'][k])): + for l in range(len(sent['argument_roles'][k])): role = sent['argument_roles'][k][l] a = sent['argument_indices'][k][l] 
argument_lists[a][k] = role @@ -222,7 +234,10 @@ def resolve_coreferences(self, all_tokenized_sentences, all_tags, for sent in doc['sentences']: spans = [] for (start, end, name) in sent['coreference_spans']: - span = Span(start, end, name) + if sys.version_info[0] == 2: + span = Span(start, end, name) + if sys.version_info[0] == 3: + span = Span(start, end, name.decode(encoding='UTF-8') ) spans.append(span) coref_info = nlp_utils.construct_coreference_info_from_spans( \ spans, len(sent['words'])) @@ -238,10 +253,19 @@ def parse_conll(self, text, language): tags, lemmas, feats = self.tag(tokenized_sentence, language) heads, deprels = self.parse(tokenized_sentence, tags, lemmas, language) - for i, token in enumerate(tokenized_sentence): - conll_str += str(i+1) + '\t' + token + '\t' + lemmas[i] + \ - '\t' + tags[i] + '\t' + tags[i] + '\t' + \ - feats[i] + '\t' + str(heads[i]+1) + '\t' + \ - deprels[i] + '\n' + if sys.version_info[0] == 2: + for i, token in enumerate(tokenized_sentence): + conll_str += str(i+1) + '\t' + token + '\t' + lemmas[i] + \ + '\t' + tags[i] + '\t' + tags[i] + '\t' + \ + feats[i] + '\t' + str(heads[i]+1) + '\t' + \ + deprels[i] + '\n' + if sys.version_info[0] == 3: + for i, token in enumerate(tokenized_sentence): + conll_str += str(i+1) + '\t' + token + '\t' + lemmas[i] + \ + '\t' + tags[i].decode(encoding='UTF-8') + '\t' + \ + tags[i].decode(encoding='UTF-8') + '\t' + \ + feats[i] + '\t' + \ + str(heads[i]+1) + '\t' + \ + deprels[i].decode(encoding='UTF-8') + '\n' conll_str += '\n' return conll_str diff --git a/python/nlp_pipeline.win.config b/python/nlp_pipeline.win.config new file mode 100644 index 0000000..867a07d --- /dev/null +++ b/python/nlp_pipeline.win.config @@ -0,0 +1,57 @@ +PT +splitter="tokenizers/punkt/portuguese.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\portuguese_floresta_v2.0_nomwe_auto\\portuguese_floresta_v2.0_nomwe_auto_tagger.model" 
+parser="C:\\Corpora\\TurboModels\\all_models\\portuguese_floresta_v2.0_nomwe_auto\\portuguese_floresta_v2.0_nomwe_auto_parser_pruned-true_model-standard.model" +lemmatizer="C:\\Corpora\\TurboModels\\all_models\\portuguese_floresta_v2.0_nomwe_auto\\portuguese_floresta_v2.0_nomwe_auto_lemmatizer.model" + +ES +splitter="tokenizers/punkt/spanish.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\spanish_conll2009_v2.0_nomwe_auto\\spanish_conll2009_v2.0_nomwe_auto_tagger.model" +entity_recognizer="C:\\Corpora\\TurboModels\\all_models\\ner\\models\\spanish\\spanish_entity_recognizer.model" +parser="C:\\Corpora\\TurboModels\\all_models\\spanish_conll2009_v2.0_nomwe_auto\\spanish_conll2009_v2.0_nomwe_auto_parser_pruned-true_model-standard.model" +semantic_parser="C:\\Corpora\\TurboModels\\all_models\\srl\\models\\spanish_conll2009_v2.0_nomwe_auto\\spanish_conll2009_v2.0_nomwe_auto_semantic_parser_conll2008_pruned-false_model-basic_syntax-true_C-0.01_fp-0.4_fn-0.6.model" +coreference_resolver="C:\\Corpora\\TurboModels\\all_models\\coreference_models\\spanish\\spanish_coreference_resolver.model" +lemmatizer="C:\\Corpora\\TurboModels\\all_models\\spanish_conll2009_v2.0_nomwe_auto\\spanish_conll2009_v2.0_nomwe_auto_lemmatizer.model" + +EN +splitter="tokenizers/punkt/english.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\english_proj\\english_proj_tagger.model" +parser="C:\\Corpora\\TurboModels\\all_models\\english_proj\\english_proj_parser_pruned-true_model-standard.model" + +EN-Nonprojective +splitter="tokenizers/punkt/english.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\english_proj\\english_proj_tagger.model" +entity_recognizer="C:\\Corpora\\TurboModels\\all_models\\ner\\models\\english\\english_entity_recognizer.DN.model" +parser="C:\\Corpora\\TurboModels\\all_models\\english\\english_parser_pruned-true_model-standard.model" 
+semantic_parser="C:\\Corpora\\TurboModels\\all_models\\srl\\models\\english\\english_semantic_parser_conll2008_pruned-false_model-basic_syntax-true_C-0.01_fp-0.4_fn-0.6.model" +coreference_resolver="C:\\Corpora\\TurboModels\\all_models\\coreference_models\\english_ontonotes_conll2012\\english_ontonotes_conll2012_coreference_resolver.model" +lemmatizer="C:\\Corpora\\TurboModels\\all_models\\english\\english_lemmatizer.model" +morphological_tagger="C:\\Corpora\\TurboModels\\all_models\\morphological_tagger\\models\\UD_English\\en-ud_morphological_tagger.model" + +PT-BR-Universal +splitter="tokenizers/punkt/portuguese.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\brazilian_portuguese_universal\\brazilian_portuguese_universal_tagger.model" +parser="C:\\Corpora\\TurboModels\\all_models\\brazilian_portuguese_universal\\brazilian_portuguese_universal_parser_pruned-true_model-standard.model" +morphological_tagger="C:\\Corpora\\TurboModels\\all_models\\morphological_tagger\\models\\UD_Portuguese\\pt-ud_morphological_tagger.model" + +ES-Universal +splitter="tokenizers/punkt/spanish.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\spanish_universal\\spanish_universal_tagger.model" +parser="C:\\Corpora\\TurboModels\\all_models\\spanish_universal\\spanish_universal_parser_pruned-true_model-standard.model" +morphological_tagger="C:\\Corpora\\TurboModels\\all_models\\morphological_tagger\\models\\UD_Spanish\\es-ud_morphological_tagger.model" + +FR-Universal +splitter="tokenizers/punkt/french.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\french_universal\\french_universal_tagger.model" +parser="C:\\Corpora\\TurboModels\\all_models\\french_universal\\french_universal_parser_pruned-true_model-standard.model" + +IT-Universal +splitter="tokenizers/punkt/italian.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\italian_universal\\italian_universal_tagger.model" 
+parser="C:\\Corpora\\TurboModels\\all_models\\italian_universal\\italian_universal_parser_pruned-true_model-standard.model" +morphological_tagger="C:\\Corpora\\TurboModels\\all_models\\morphological_tagger\\models\\UD_Italian\\it-ud_morphological_tagger.model" + +DE-Universal +splitter="tokenizers/punkt/german.pickle" +tagger="C:\\Corpora\\TurboModels\\all_models\\german_universal\\german_universal_tagger.model" +parser="C:\\Corpora\\TurboModels\\all_models\\german_universal\\german_universal_parser_pruned-true_model-standard.model" diff --git a/python/nlp_sentence.py b/python/nlp_sentence.py index 03e5584..a97cefe 100644 --- a/python/nlp_sentence.py +++ b/python/nlp_sentence.py @@ -1,129 +1,153 @@ -import turboparser as tp - -class NLPSentence(dict): - def __init__(self): - pass - - def compute_morphology(self, worker): - words = self['words'] - - # Compute POS tags. - self['tags'] = None - tags = ['_' for word in words] - sequence_instance = tp.PSequenceInstance() - sequence_instance.initialize(words, tags) - worker.tagger.tag_sentence(sequence_instance) - tags = [sequence_instance.get_tag(i) for i in xrange(len(words))] - self['tags'] = tags - - # Compute lemmas. - self['lemmas'] = None - if worker.lemmatizer != None: - lemmas = worker.lemmatizer.lemmatize_sentence(words, tags) - self['lemmas'] = lemmas - else: - lemmas = ['_' for word in words] - - # Compute morphological tags. - self['morphological_tags'] = None - if worker.morphological_tagger != None: - feats = ['_' for word in words] - morphological_instance = tp.PMorphologicalInstance() - morphological_instance.initialize(words, lemmas, tags, feats) - worker.morphological_tagger.tag_sentence(morphological_instance) - feats = [morphological_instance.get_tag(i) \ - for i in xrange(len(words))] - self['morphological_tags'] = [feat.split('|') if feat != '_' \ - else [] \ - for feat in feats] - - def compute_entities(self, worker): - words = self['words'] - tags = self['tags'] - # For now, use entity BIO tags. 
Later we should move to entity spans. - self['entity_tags'] = None - entity_tags = ['_' for word in words] - entity_instance = tp.PEntityInstance() - entity_instance.initialize(words, tags, entity_tags) - worker.entity_recognizer.tag_sentence(entity_instance) - entity_tags = [entity_instance.get_tag(i) for i in xrange(len(words))] - self['entity_tags'] = entity_tags - - def compute_syntactic_dependencies(self, worker): - words = self['words'] - tags = self['tags'] - lemmas = self['lemmas'] - feats = [[] for word in words] - # TurboParser assumes 1-based indexing. - heads = [0 for word in words] - deprels = ['_' for word in words] - # TurboParser requires pre-appending a dummy root symbol. - words_with_root = ['_root_'] + words - lemmas_with_root = ['_root_'] + lemmas - tags_with_root = ['_root_'] + tags - feats_with_root = [['_root_']] + feats - deprels_with_root = ['_root_'] + deprels - heads_with_root = [-1] + heads - dependency_instance = tp.PDependencyInstance() - dependency_instance.initialize(words_with_root, lemmas_with_root, \ - tags_with_root, tags_with_root, \ - feats_with_root, deprels_with_root, \ - heads_with_root) - worker.parser.parse_sentence(dependency_instance) - # Convert back to 0-based indexing. Words attached to the root will get - # head = -1. - self['heads'] = [dependency_instance.get_head(i+1)-1 \ - for i in xrange(len(words))] - self['dependency_relations'] = \ - [dependency_instance.get_dependency_relation(i+1) \ - for i in xrange(len(words))] - - def compute_semantic_dependencies(self, worker): - words = self['words'] - tags = self['tags'] - lemmas = self['lemmas'] - # TurboParser assumes 1-based indexing. - heads = [h+1 for h in self['heads']] - deprels = self['dependency_relations'] - feats = [[] for word in words] - predicate_names = [] - predicate_indices = [] - argument_roles = [] - argument_indices = [] - # TurboParser requires pre-appending a dummy root symbol. 
- words_with_root = ['_root_'] + words - lemmas_with_root = ['_root_'] + lemmas - tags_with_root = ['_root_'] + tags - feats_with_root = [['_root_']] + feats - deprels_with_root = ['_root_'] + deprels - heads_with_root = [-1] + heads - semantic_instance = tp.PSemanticInstance() - semantic_instance.initialize('', words_with_root, lemmas_with_root, \ - tags_with_root, tags_with_root, \ - feats_with_root, deprels_with_root, \ - heads_with_root, predicate_names, \ - predicate_indices, argument_roles, \ - argument_indices) - worker.semantic_parser.parse_semantic_dependencies_from_sentence( \ - semantic_instance) - - num_predicates = semantic_instance.get_num_predicates() - predicate_names = [semantic_instance.get_predicate_name(k) \ - for k in xrange(num_predicates)] - # Convert back to 0-based indexing. - predicate_indices = [semantic_instance.get_predicate_index(k)-1 \ - for k in xrange(num_predicates)] - argument_roles = \ - [[semantic_instance.get_argument_role(k, l) \ - for l in xrange(semantic_instance.get_num_arguments_predicate(k))] \ - for k in xrange(num_predicates)] - # Convert to back 0-based indexing. - argument_indices = \ - [[semantic_instance.get_argument_index(k, l)-1 \ - for l in xrange(semantic_instance.get_num_arguments_predicate(k))] \ - for k in xrange(num_predicates)] - - self['predicate_names'] = predicate_names - self['predicate_indices'] = predicate_indices - self['argument_roles'] = argument_roles - self['argument_indices'] = argument_indices +import turboparser as tp +from builtins import range +import sys + +class NLPSentence(dict): + def __init__(self): + pass + + def compute_morphology(self, worker): + words = self['words'] + + # Compute POS tags. 
+ self['tags'] = None + tags = ['_' for word in words] + sequence_instance = tp.PSequenceInstance() + sequence_instance.initialize(words, tags) + worker.tagger.tag_sentence(sequence_instance) + tags = [sequence_instance.get_tag(i) for i in range(len(words))] + self['tags'] = tags + + # Compute lemmas. + self['lemmas'] = None + if worker.lemmatizer != None: + if sys.version_info[0] == 2: + lemmas = worker.lemmatizer.lemmatize_sentence(words, tags) + if sys.version_info[0] == 3: + lemmas = worker.lemmatizer.lemmatize_sentence(words, + [t.decode(encoding="UTF-8") for t in tags]) + self['lemmas'] = lemmas + else: + lemmas = ['_' for word in words] + + # Compute morphological tags. + self['morphological_tags'] = None + if worker.morphological_tagger != None: + feats = ['_' for word in words] + morphological_instance = tp.PMorphologicalInstance() + if sys.version_info[0] == 2: + morphological_instance.initialize(words, lemmas, tags, feats) + if sys.version_info[0] == 3: + morphological_instance.initialize(words, + lemmas, + [t.decode(encoding="UTF-8") for t in tags], + feats) + + worker.morphological_tagger.tag_sentence(morphological_instance) + feats = [morphological_instance.get_tag(i) \ + for i in range(len(words))] + if sys.version_info[0] == 3: + feats = [f.decode(encoding="UTF-8") for f in feats] + self['morphological_tags'] = [feat.split('|') if feat != '_' \ + else [] \ + for feat in feats] + + def compute_entities(self, worker): + words = self['words'] + tags = self['tags'] + # For now, use entity BIO tags. Later we should move to entity spans. 
+ self['entity_tags'] = None + entity_tags = ['_' for word in words] + entity_instance = tp.PEntityInstance() + entity_instance.initialize(words, tags, entity_tags) + worker.entity_recognizer.tag_sentence(entity_instance) + entity_tags = [entity_instance.get_tag(i) for i in range(len(words))] + self['entity_tags'] = entity_tags + + def compute_syntactic_dependencies(self, worker): + words = self['words'] + tags = self['tags'] + lemmas = self['lemmas'] + feats = [[] for word in words] + # TurboParser assumes 1-based indexing. + heads = [0 for word in words] + deprels = ['_' for word in words] + # TurboParser requires pre-appending a dummy root symbol. + words_with_root = ['_root_'] + words + lemmas_with_root = ['_root_'] + lemmas + tags_with_root = ['_root_'] + tags + feats_with_root = [['_root_']] + feats + deprels_with_root = ['_root_'] + deprels + heads_with_root = [-1] + heads + dependency_instance = tp.PDependencyInstance() + dependency_instance.initialize(words_with_root, \ + lemmas_with_root, \ + tags_with_root, \ + tags_with_root, \ + feats_with_root, \ + deprels_with_root, \ + heads_with_root) + worker.parser.parse_sentence(dependency_instance) + # Convert back to 0-based indexing. Words attached to the root will get + # head = -1. + self['heads'] = [dependency_instance.get_head(i+1)-1 \ + for i in range(len(words))] + self['dependency_relations'] = \ + [dependency_instance.get_dependency_relation(i+1) \ + for i in range(len(words))] + + def compute_semantic_dependencies(self, worker): + words = self['words'] + tags = self['tags'] + lemmas = self['lemmas'] + # TurboParser assumes 1-based indexing. + heads = [h+1 for h in self['heads']] + deprels = self['dependency_relations'] + feats = [[] for word in words] + predicate_names = [] + predicate_indices = [] + argument_roles = [] + argument_indices = [] + # TurboParser requires pre-appending a dummy root symbol. 
+ words_with_root = ['_root_'] + words + lemmas_with_root = ['_root_'] + lemmas + tags_with_root = ['_root_'] + tags + feats_with_root = [['_root_']] + feats + deprels_with_root = ['_root_'] + deprels + heads_with_root = [-1] + heads + semantic_instance = tp.PSemanticInstance() + semantic_instance.initialize('', \ + words_with_root, \ + lemmas_with_root, \ + tags_with_root, \ + tags_with_root, \ + feats_with_root, \ + deprels_with_root, \ + heads_with_root, \ + predicate_names, \ + predicate_indices, \ + argument_roles, \ + argument_indices) + worker.semantic_parser.parse_semantic_dependencies_from_sentence( \ + semantic_instance) + + num_predicates = semantic_instance.get_num_predicates() + predicate_names = [semantic_instance.get_predicate_name(k) \ + for k in range(num_predicates)] + # Convert back to 0-based indexing. + predicate_indices = [semantic_instance.get_predicate_index(k)-1 \ + for k in range(num_predicates)] + argument_roles = \ + [[semantic_instance.get_argument_role(k, l) \ + for l in range(semantic_instance.get_num_arguments_predicate(k))] \ + for k in range(num_predicates)] + # Convert to back 0-based indexing. 
+ argument_indices = \ + [[semantic_instance.get_argument_index(k, l)-1 \ + for l in range(semantic_instance.get_num_arguments_predicate(k))] \ + for k in range(num_predicates)] + + self['predicate_names'] = predicate_names + self['predicate_indices'] = predicate_indices + self['argument_roles'] = argument_roles + self['argument_indices'] = argument_indices diff --git a/python/nlp_utils.py b/python/nlp_utils.py index 3c2543f..7751112 100644 --- a/python/nlp_utils.py +++ b/python/nlp_utils.py @@ -1,65 +1,66 @@ -from span import * - -def construct_coreference_spans_from_text(span_lines): - left_bracket = '(' - right_bracket = ')' - characters_to_ignore = '*-' - name = '' - span_names_stack = [] - span_start_stack = [] - spans = [] - - for i in xrange(len(span_lines)): - line = span_lines[i] - fields = line.split('|') - for field in fields: - if field[0] == left_bracket and field[-1] == right_bracket: - start_position = i - end_position = i - name = field[1:-1] - span = Span(start_position, end_position, name) - spans.append(span) - elif field[0] == left_bracket: - start_position = i - end_position = -1 - name = field[1:] - span = Span(start_position, end_position, name) - spans.append(span) - elif field[-1] == right_bracket: - name = field[:-1] - selected_span = None - for span in reversed(spans): - if span.name == name and span.end == -1: - assert selected_span == None, pdb.set_trace() - selected_span = span - break - assert selected_span != None, pdb.set_trace() - selected_span.end = i - - for span in spans: - assert span.end != -1 - - return spans - -def construct_coreference_info_from_spans(spans, num_words): - coreference_span_descriptions = [''] * num_words - for span in spans: - if span.start == span.end: # Single-word mention. 
- if coreference_span_descriptions[span.start] != '': - coreference_span_descriptions[span.start] += '|' - coreference_span_descriptions[span.start] += '(' + span.name + ')' - else: - desc = coreference_span_descriptions[span.start] - if desc != '': - desc = '|' + desc - coreference_span_descriptions[span.start] = '(' + span.name + desc - desc = coreference_span_descriptions[span.end] - if desc != '': - desc += '|' - coreference_span_descriptions[span.end] = desc + span.name + ')' - - for j in xrange(num_words): - if coreference_span_descriptions[j] == '': - coreference_span_descriptions[j] = '_' - - return coreference_span_descriptions +from span import * +from builtins import range + +def construct_coreference_spans_from_text(span_lines): + left_bracket = '(' + right_bracket = ')' + characters_to_ignore = '*-' + name = '' + span_names_stack = [] + span_start_stack = [] + spans = [] + + for i in range(len(span_lines)): + line = span_lines[i] + fields = line.split('|') + for field in fields: + if field[0] == left_bracket and field[-1] == right_bracket: + start_position = i + end_position = i + name = field[1:-1] + span = Span(start_position, end_position, name) + spans.append(span) + elif field[0] == left_bracket: + start_position = i + end_position = -1 + name = field[1:] + span = Span(start_position, end_position, name) + spans.append(span) + elif field[-1] == right_bracket: + name = field[:-1] + selected_span = None + for span in reversed(spans): + if span.name == name and span.end == -1: + assert selected_span == None, pdb.set_trace() + selected_span = span + break + assert selected_span != None, pdb.set_trace() + selected_span.end = i + + for span in spans: + assert span.end != -1 + + return spans + +def construct_coreference_info_from_spans(spans, num_words): + coreference_span_descriptions = [''] * num_words + for span in spans: + if span.start == span.end: # Single-word mention. 
+ if coreference_span_descriptions[span.start] != '': + coreference_span_descriptions[span.start] += '|' + coreference_span_descriptions[span.start] += '(' + span.name + ')' + else: + desc = coreference_span_descriptions[span.start] + if desc != '': + desc = '|' + desc + coreference_span_descriptions[span.start] = '(' + span.name + desc + desc = coreference_span_descriptions[span.end] + if desc != '': + desc += '|' + coreference_span_descriptions[span.end] = desc + span.name + ')' + + for j in range(num_words): + if coreference_span_descriptions[j] == '': + coreference_span_descriptions[j] = '_' + + return coreference_span_descriptions diff --git a/python/setup.py b/python/setup.py index 8cfee8f..db22a31 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,13 +1,63 @@ -from distutils.core import setup -from distutils.extension import Extension -from Cython.Distutils import build_ext - -src = "../libturboparser/" - -setup(cmdclass={'build_ext': build_ext}, - ext_modules=[Extension("turboparser", ["turbo_parser.pyx"], - language="c++", - extra_compile_args=["-std=c++0x"], - include_dirs=["../src/morphological_tagger", "../src/coreference_resolver", "../src/semantic_parser", "../src/parser", "../src/entity_recognizer/", "../src/tagger/", "../src/sequence/", "../src/classifier/", "../src/util", "../deps/local/include/"], - library_dirs=[src, "../deps/local/lib/"], - libraries=["turboparser", "gflags", "glog", "ad3"])]) +from distutils.core import setup +from distutils.extension import Extension +from Cython.Distutils import build_ext +from Cython.Build import cythonize +import os + +if os.name == 'nt': + ext_modules=[Extension("turboparser", + ["turboparser.pyx"], + language="c++", + extra_compile_args=["/Zi", "/Od", "/DGOOGLE_GLOG_DLL_DECL=", "/DGFLAGS_DLL_DECL="], + extra_link_args=['/DEBUG'], + include_dirs=["..\\src\\util", + "..\\src\\classifier", + "..\\src\\sequence", + "..\\src\\entity_recognizer", + "..\\src\\morphological_tagger", + "..\\src\\parser", + 
"..\\src\\tagger", + "..\\src\\semantic_parser", + "..\\src\\coreference_resolver", + "..\\deps\\AD3-2.0.2\\ad3", + "..\\deps\\AD3-2.0.2", + "..\\deps\\glog-0.3.2\\src\\windows", + "..\\deps\\gflags-2.0\\src\\windows", + "..\\deps\\eigen-eigen-c58038c56923", + "..\\deps\\googletest\\src"], + library_dirs=["..\\vsprojects\\x64\\Release", + "..\\deps\\glog-0.3.2\\x64\\Release", + "..\\deps\\gflags-2.0\\x64\\Release", + "..\\deps\\AD3-2.0.2\\vsprojects\\x64\\Release", + "..\\deps\\googletest\\msvc\\x64\\Release"], + extra_objects=["libturboparser.lib", + "AD3_140mdx64.lib", + "libgflags_140mdx64.lib", + "libglog_static_140mdx64.lib", + "gtest-md_140mdx64.lib"])] + setup(cmdclass={'build_ext': build_ext}, + ext_modules = cythonize(ext_modules, gdb_debug=True) + ) +else: + ext_modules=[Extension("turboparser", + ["turbo_parser.pyx"], + language="c++", + extra_compile_args=["-std=c++0x"], + include_dirs=["../src/morphological_tagger", + "../src/coreference_resolver", + "../src/semantic_parser", + "../src/parser", + "../src/entity_recognizer/", + "../src/tagger/", + "../src/sequence/", + "../src/classifier/", + "../src/util", + "../deps/local/include/"], + library_dirs=["../libturboparser/", + "../deps/local/lib/"], + libraries=["turboparser", + "gflags", + "glog", + "ad3"])] + setup(cmdclass={'build_ext': build_ext}, + ext_modules = ext_modules) diff --git a/python/span.py b/python/span.py index ac50649..6c3d8a7 100644 --- a/python/span.py +++ b/python/span.py @@ -1,48 +1,49 @@ -import numpy as np - -class Span: - def __init__(self, start, end, name=''): - self.start = start - self.end = end - self.name = name - - def length(self): - return self.end - self.start + 1 - - def contains_index(self, i): - return self.start <= i and self.end >= i - - def lies_inside_span(self, span): - return self.start >= span.start and self.end <= span.end - - def lies_inside_any_of_spans(self, spans): - for span in spans: - if self.lies_inside_span(span): - return True - return False - - 
def closest_span(self, spans, min_dist=False): - distances = [] - for span in spans: - distances.append(self.distance(span, min_dist)) - span_index = np.argmin(distances) - return spans[span_index] - - def distance(self, span, min_dist=False): - if min_dist: - start_interval = min(self.end, span.end) - end_interval = max(self.start, span.start) - return max(0, end_interval-start_interval) - else: - return abs(self.start-span.start) + abs(self.end-span.end) - - def is_equal(self, span): - return self.start == span.start and self.end == span.end - - def overlaps(self, span): - start_interval = max(self.start, span.start) - end_interval = min(self.end, span.end) - return start_interval <= end_interval - - def print_span(self): - print self.name, self.start, self.end +from __future__ import print_function +import numpy as np + +class Span: + def __init__(self, start, end, name=''): + self.start = start + self.end = end + self.name = name + + def length(self): + return self.end - self.start + 1 + + def contains_index(self, i): + return self.start <= i and self.end >= i + + def lies_inside_span(self, span): + return self.start >= span.start and self.end <= span.end + + def lies_inside_any_of_spans(self, spans): + for span in spans: + if self.lies_inside_span(span): + return True + return False + + def closest_span(self, spans, min_dist=False): + distances = [] + for span in spans: + distances.append(self.distance(span, min_dist)) + span_index = np.argmin(distances) + return spans[span_index] + + def distance(self, span, min_dist=False): + if min_dist: + start_interval = min(self.end, span.end) + end_interval = max(self.start, span.start) + return max(0, end_interval-start_interval) + else: + return abs(self.start-span.start) + abs(self.end-span.end) + + def is_equal(self, span): + return self.start == span.start and self.end == span.end + + def overlaps(self, span): + start_interval = max(self.start, span.start) + end_interval = min(self.end, span.end) + return 
start_interval <= end_interval + + def print_span(self): + print (self.name, self.start, self.end) diff --git a/python/test.py b/python/test.py new file mode 100644 index 0000000..fe1f978 --- /dev/null +++ b/python/test.py @@ -0,0 +1,15 @@ +import nlp_pipeline +pipe = nlp_pipeline.NLPPipeline() + +text = 'I solved the problem with statistics.' +language = 'EN-Nonprojective' +conll_str = pipe.parse_conll(text, language ) +print(conll_str) + +text = 'Lisbon is the capital and the largest city of Portugal.' +sentences = pipe.split_sentences(text, language) +for sentence in sentences: + tokenized_sentence = pipe.tokenize(sentence, language) + tags, lemmas, feats = pipe.tag(tokenized_sentence, language) + entity_tags = pipe.recognize_entities(tokenized_sentence,tags, language) + print(entity_tags) \ No newline at end of file diff --git a/python/turbo_parser.pyx b/python/turboparser.pyx similarity index 97% rename from python/turbo_parser.pyx rename to python/turboparser.pyx index 3f3b460..5920c65 100644 --- a/python/turbo_parser.pyx +++ b/python/turboparser.pyx @@ -1,534 +1,534 @@ -from libcpp.string cimport string -from libcpp.vector cimport vector -from libcpp cimport bool - -import pdb - -# Get the classes from the c++ headers. 
- -cdef extern from "../src/sequence/SequenceInstance.h": - cdef cppclass SequenceInstance: - SequenceInstance() - void Initialize(vector[string] forms, vector[string] tags) - string GetTag(int i) - -cdef extern from "../src/morphological_tagger/MorphologicalInstance.h": - cdef cppclass MorphologicalInstance: - MorphologicalInstance() - void Initialize(vector[string] forms, vector[string] lemmas, \ - vector[string] pos, vector[string] tags) - string GetTag(int i) - -cdef extern from "../src/entity_recognizer/EntityInstance.h": - cdef cppclass EntityInstance: - EntityInstance() - void Initialize(vector[string] forms, vector[string] pos, \ - vector[string] tags) - string GetTag(int i) - -cdef extern from "../src/parser/DependencyInstance.h": - cdef cppclass DependencyInstance: - DependencyInstance() - void Initialize(vector[string] forms, vector[string] lemmas, \ - vector[string] cpos, vector[string] pos, \ - vector[vector[string]] feats, vector[string] deprels, \ - vector[int] heads) - int GetHead(int i) - string GetDependencyRelation(int i) - -cdef extern from "../src/semantic_parser/SemanticInstance.h": - cdef cppclass SemanticInstance: - SemanticInstance() - void Initialize(string name, vector[string] forms, vector[string] lemmas, \ - vector[string] cpos, vector[string] pos, \ - vector[vector[string]] feats, vector[string] deprels, \ - vector[int] heads, vector[string] predicate_names, \ - vector[int] predicate_indices, \ - vector[vector[string]] argument_roles, \ - vector[vector[int]] argument_indices) - int GetNumPredicates() - string GetPredicateName(int k) - int GetPredicateIndex(int k) - int GetNumArgumentsPredicate(int k) - string GetArgumentRole(int k, int l) - int GetArgumentIndex(int k, int l) - -cdef extern from "../src/entity_recognizer/EntitySpan.h": - ctypedef NamedSpan EntitySpan - cdef cppclass NamedSpan: - NamedSpan(int start, int end, string name) - int start() - int end() - string name() - -cdef extern from 
"../src/coreference_resolver/CoreferenceSentence.h": - cdef cppclass CoreferenceSentence: - CoreferenceSentence() - void Initialize(string name, vector[string] forms, vector[string] lemmas, \ - vector[string] cpos, vector[string] pos, \ - vector[vector[string]] feats, vector[string] deprels, \ - vector[int] heads, vector[string] predicate_names, \ - vector[int] predicate_indices, \ - vector[vector[string]] argument_roles, \ - vector[vector[int]] argument_indices, \ - vector[string] speakers, \ - vector[NamedSpan*] entity_spans, \ - vector[NamedSpan*] constituent_spans, \ - vector[NamedSpan*] coreference_spans) - vector[NamedSpan*] GetCoreferenceSpans() - -cdef extern from "../src/coreference_resolver/CoreferenceDocument.h": - cdef cppclass CoreferenceDocument: - CoreferenceDocument() - void Initialize(string name, int part_number, \ - vector[CoreferenceSentence*] sentences) - int GetNumSentences() - CoreferenceSentence *GetSentence(int i) - -cdef extern from "../libturboparser/TurboParserInterface.h" namespace "TurboParserInterface": - cdef cppclass TurboTaggerWorker: - TurboTaggerWorker() - void LoadTaggerModel(string file_model) - void Tag(string file_test, string file_prediction) - void TagSentence(SequenceInstance *sentence) - - cdef cppclass TurboMorphologicalTaggerWorker: - TurboMorphologicalTaggerWorker() - void LoadMorphologicalTaggerModel(string file_model) - void Tag(string file_test, string file_prediction) - void TagSentence(MorphologicalInstance *sentence) - - cdef cppclass TurboEntityRecognizerWorker: - TurboEntityRecognizerWorker() - void LoadEntityRecognizerModel(string file_model) - void Tag(string file_test, string file_prediction) - void TagSentence(EntityInstance *sentence) - - cdef cppclass TurboParserWorker: - TurboParserWorker() - void LoadParserModel(string file_model) - void Parse(string file_test, string file_prediction) - void ParseSentence(DependencyInstance *sentence) - - cdef cppclass TurboSemanticParserWorker: - 
TurboSemanticParserWorker() - void LoadSemanticParserModel(string file_model) - void ParseSemanticDependencies(string file_test, string file_prediction) - void ParseSemanticDependenciesFromSentence(SemanticInstance *sentence) - - cdef cppclass TurboCoreferenceResolverWorker: - TurboCoreferenceResolverWorker() - void LoadCoreferenceResolverModel(string file_model) - void ResolveCoreferences(string file_test, string file_prediction) - void ResolveCoreferencesFromDocument(CoreferenceDocument *document) - - cdef cppclass TurboParserInterface: - TurboParserInterface() - TurboTaggerWorker* CreateTagger() - TurboMorphologicalTaggerWorker* CreateMorphologicalTagger() - TurboEntityRecognizerWorker* CreateEntityRecognizer() - TurboParserWorker* CreateParser() - TurboSemanticParserWorker* CreateSemanticParser() - TurboCoreferenceResolverWorker* CreateCoreferenceResolver() - - -# Wrap them into python extension types. - -cdef class PTurboParser: - cdef TurboParserInterface *thisptr - cdef bool allocate - def __cinit__(self, allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new TurboParserInterface() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def create_tagger(self): - tagger = PTurboTaggerWorker(allocate=False) - tagger.thisptr = self.thisptr.CreateTagger() - return tagger - - def create_morphological_tagger(self): - morphological_tagger = PTurboMorphologicalTaggerWorker(allocate=False) - morphological_tagger.thisptr = self.thisptr.CreateMorphologicalTagger() - return morphological_tagger - - def create_entity_recognizer(self): - entity_recognizer = PTurboEntityRecognizerWorker(allocate=False) - entity_recognizer.thisptr = self.thisptr.CreateEntityRecognizer() - return entity_recognizer - - def create_parser(self): - parser = PTurboParserWorker(allocate=False) - parser.thisptr = self.thisptr.CreateParser() - return parser - - def create_semantic_parser(self): - semantic_parser = PTurboSemanticParserWorker(allocate=False) - 
semantic_parser.thisptr = self.thisptr.CreateSemanticParser() - return semantic_parser - - def create_coreference_resolver(self): - coreference_resolver = PTurboCoreferenceResolverWorker(allocate=False) - coreference_resolver.thisptr = self.thisptr.CreateCoreferenceResolver() - return coreference_resolver - -cdef class PSequenceInstance: - cdef SequenceInstance *thisptr - cdef bool allocate - def __cinit__(self, allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new SequenceInstance() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def initialize(self, vector[string] forms, vector[string] tags): - self.thisptr.Initialize(forms, tags) - - def get_tag(self, i): - return self.thisptr.GetTag(i) - -cdef class PMorphologicalInstance: - cdef MorphologicalInstance *thisptr - cdef bool allocate - def __cinit__(self, allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new MorphologicalInstance() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def initialize(self, vector[string] forms, vector[string] lemmas, \ - vector[string] pos, vector[string] tags): - self.thisptr.Initialize(forms, lemmas, pos, tags) - - def get_tag(self, i): - return self.thisptr.GetTag(i) - -cdef class PEntityInstance: - cdef EntityInstance *thisptr - cdef bool allocate - def __cinit__(self, allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new EntityInstance() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def initialize(self, vector[string] forms, vector[string] pos, \ - vector[string] tags): - self.thisptr.Initialize(forms, pos, tags) - - def get_tag(self, i): - return self.thisptr.GetTag(i) - -cdef class PDependencyInstance: - cdef DependencyInstance *thisptr - cdef bool allocate - def __cinit__(self, allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new DependencyInstance() - - def __dealloc__(self): - if self.allocate: - del 
self.thisptr - - def initialize(self, vector[string] forms, vector[string] lemmas, \ - vector[string] cpos, vector[string] pos, \ - vector[vector[string]] feats, vector[string] deprels, \ - vector[int] heads): - self.thisptr.Initialize(forms, lemmas, cpos, pos, feats, deprels, heads) - - def get_head(self, i): - return self.thisptr.GetHead(i) - - def get_dependency_relation(self, i): - return self.thisptr.GetDependencyRelation(i) - -cdef class PSemanticInstance: - cdef SemanticInstance *thisptr - cdef bool allocate - def __cinit__(self, allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new SemanticInstance() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def initialize(self, string name, vector[string] forms, \ - vector[string] lemmas, \ - vector[string] cpos, vector[string] pos, \ - vector[vector[string]] feats, vector[string] deprels, \ - vector[int] heads, vector[string] predicate_names, \ - vector[int] predicate_indices, \ - vector[vector[string]] argument_roles, \ - vector[vector[int]] argument_indices): - self.thisptr.Initialize(name, forms, lemmas, cpos, pos, feats, \ - deprels, heads, predicate_names, \ - predicate_indices, argument_roles, \ - argument_indices) - - def get_num_predicates(self): - return self.thisptr.GetNumPredicates() - - def get_predicate_name(self, k): - return self.thisptr.GetPredicateName(k) - - def get_predicate_index(self, k): - return self.thisptr.GetPredicateIndex(k) - - def get_num_arguments_predicate(self, k): - return self.thisptr.GetNumArgumentsPredicate(k) - - def get_argument_role(self, k, l): - return self.thisptr.GetArgumentRole(k, l) - - def get_argument_index(self, k, l): - return self.thisptr.GetArgumentIndex(k, l) - -cdef class PNamedSpan: - cdef NamedSpan *thisptr - cdef bool allocate - def __cinit__(self, start=-1, end=-1, name='', allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new NamedSpan(start, end, name) - - def __dealloc__(self): - if 
self.allocate: - del self.thisptr - - def start(self): - return self.thisptr.start() - - def end(self): - return self.thisptr.end() - - def name(self): - return self.thisptr.name() - -cdef class PEntitySpan(PNamedSpan): - def __cinit__(self, start=-1, end=-1, name='', allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new EntitySpan(start, end, name) - -cdef class PCoreferenceSentence: - cdef CoreferenceSentence *thisptr - cdef bool allocate - def __cinit__(self, allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new CoreferenceSentence() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def initialize(self, string name, vector[string] forms, \ - vector[string] lemmas, \ - vector[string] cpos, vector[string] pos, \ - vector[vector[string]] feats, vector[string] deprels, \ - vector[int] heads, vector[string] predicate_names, \ - vector[int] predicate_indices, \ - vector[vector[string]] argument_roles, \ - vector[vector[int]] argument_indices, \ - vector[string] speakers, \ - p_entity_spans, \ - p_constituent_spans, \ - p_coreference_spans): - cdef vector[NamedSpan*] entity_spans - for p_span in p_entity_spans: - entity_spans.push_back((p_span).thisptr) - cdef vector[NamedSpan*] constituent_spans - for p_span in p_constituent_spans: - constituent_spans.push_back((p_span).thisptr) - cdef vector[NamedSpan*] coreference_spans - for p_span in p_coreference_spans: - coreference_spans.push_back((p_span).thisptr) - self.thisptr.Initialize(name, forms, lemmas, cpos, pos, feats, \ - deprels, heads, predicate_names, \ - predicate_indices, argument_roles, \ - argument_indices, speakers, entity_spans, \ - constituent_spans, coreference_spans) - - def get_coreference_spans(self): - cdef vector[NamedSpan*] coreference_spans = \ - self.thisptr.GetCoreferenceSpans() - p_coreference_spans = [] - for span in coreference_spans: - p_span = PNamedSpan(allocate=False) - p_span.thisptr = span - 
p_coreference_spans.append(p_span) - return p_coreference_spans - -cdef class PCoreferenceDocument: - cdef CoreferenceDocument *thisptr - cdef bool allocate - def __cinit__(self, allocate=True): - self.allocate = allocate - if allocate: - self.thisptr = new CoreferenceDocument() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def initialize(self, string name, int part_number, \ - p_sentences): - cdef vector[CoreferenceSentence*] sentences - for p_sentence in p_sentences: - sentences.push_back((p_sentence).thisptr) - self.thisptr.Initialize(name, part_number, sentences) - - def get_num_sentences(self): - self.thisptr.GetNumSentences() - - def get_sentence(self, int i): - cdef CoreferenceSentence *sentence = self.thisptr.GetSentence(i) - p_sentence = PCoreferenceSentence(allocate=False) - p_sentence.thisptr = sentence - return p_sentence - -cdef class PTurboTaggerWorker: - cdef TurboTaggerWorker *thisptr - cdef bool allocate - def __cinit__(self, allocate=False): - self.allocate = allocate - if allocate: - self.thisptr = new TurboTaggerWorker() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def load_tagger_model(self, file_model): - self.thisptr.LoadTaggerModel(file_model) - - def tag(self, file_test, file_prediction): - self.thisptr.Tag(file_test, file_prediction) - - def tag_sentence(self, sequence_instance): - self.thisptr.TagSentence((sequence_instance).thisptr) - -cdef class PTurboMorphologicalTaggerWorker: - cdef TurboMorphologicalTaggerWorker *thisptr - cdef bool allocate - def __cinit__(self, allocate=False): - self.allocate = allocate - if allocate: - self.thisptr = new TurboMorphologicalTaggerWorker() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def load_morphological_tagger_model(self, file_model): - self.thisptr.LoadMorphologicalTaggerModel(file_model) - - def tag(self, file_test, file_prediction): - self.thisptr.Tag(file_test, file_prediction) - - def tag_sentence(self, 
sequence_instance): - self.thisptr.TagSentence( \ - (sequence_instance).thisptr) - -cdef class PTurboEntityRecognizerWorker: - cdef TurboEntityRecognizerWorker *thisptr - cdef bool allocate - def __cinit__(self, allocate=False): - self.allocate = allocate - if allocate: - self.thisptr = new TurboEntityRecognizerWorker() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def load_entity_recognizer_model(self, file_model): - self.thisptr.LoadEntityRecognizerModel(file_model) - - def tag(self, file_test, file_prediction): - self.thisptr.Tag(file_test, file_prediction) - - def tag_sentence(self, entity_instance): - self.thisptr.TagSentence((entity_instance).thisptr) - -cdef class PTurboParserWorker: - cdef TurboParserWorker *thisptr - cdef bool allocate - def __cinit__(self, allocate=False): - self.allocate = allocate - if allocate: - self.thisptr = new TurboParserWorker() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def load_parser_model(self, file_model): - self.thisptr.LoadParserModel(file_model) - - def parse(self, file_test, file_prediction): - self.thisptr.Parse(file_test, file_prediction) - - def parse_sentence(self, dependency_instance): - self.thisptr.ParseSentence( \ - (dependency_instance).thisptr) - -cdef class PTurboSemanticParserWorker: - cdef TurboSemanticParserWorker *thisptr - cdef bool allocate - def __cinit__(self, allocate=False): - self.allocate = allocate - if allocate: - self.thisptr = new TurboSemanticParserWorker() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def load_semantic_parser_model(self, file_model): - self.thisptr.LoadSemanticParserModel(file_model) - - def parse_semantic_dependencies(self, file_test, file_prediction): - self.thisptr.ParseSemanticDependencies(file_test, file_prediction) - - def parse_semantic_dependencies_from_sentence(self, semantic_instance): - self.thisptr.ParseSemanticDependenciesFromSentence( \ - (semantic_instance).thisptr) - -cdef class 
PTurboCoreferenceResolverWorker: - cdef TurboCoreferenceResolverWorker *thisptr - cdef bool allocate - def __cinit__(self, allocate=False): - self.allocate = allocate - if allocate: - self.thisptr = new TurboCoreferenceResolverWorker() - - def __dealloc__(self): - if self.allocate: - del self.thisptr - - def load_coreference_resolver_model(self, file_model): - self.thisptr.LoadCoreferenceResolverModel(file_model) - - def resolve_coreferences(self, file_test, file_prediction): - self.thisptr.ResolveCoreferences(file_test, file_prediction) - - def resolve_coreferences_from_document(self, coreference_document): - self.thisptr.ResolveCoreferencesFromDocument( \ - (coreference_document).thisptr) +from libcpp.string cimport string +from libcpp.vector cimport vector +from libcpp cimport bool + +import pdb + +# Get the classes from the c++ headers. + +cdef extern from "../src/sequence/SequenceInstance.h": + cdef cppclass SequenceInstance: + SequenceInstance() + void Initialize(vector[string] forms, vector[string] tags) + string GetTag(int i) + +cdef extern from "../src/morphological_tagger/MorphologicalInstance.h": + cdef cppclass MorphologicalInstance: + MorphologicalInstance() + void Initialize(vector[string] forms, vector[string] lemmas, \ + vector[string] pos, vector[string] tags) + string GetTag(int i) + +cdef extern from "../src/entity_recognizer/EntityInstance.h": + cdef cppclass EntityInstance: + EntityInstance() + void Initialize(vector[string] forms, vector[string] pos, \ + vector[string] tags) + string GetTag(int i) + +cdef extern from "../src/parser/DependencyInstance.h": + cdef cppclass DependencyInstance: + DependencyInstance() + void Initialize(vector[string] forms, vector[string] lemmas, \ + vector[string] cpos, vector[string] pos, \ + vector[vector[string]] feats, vector[string] deprels, \ + vector[int] heads) + int GetHead(int i) + string GetDependencyRelation(int i) + +cdef extern from "../src/semantic_parser/SemanticInstance.h": + cdef cppclass 
SemanticInstance: + SemanticInstance() + void Initialize(string name, vector[string] forms, vector[string] lemmas, \ + vector[string] cpos, vector[string] pos, \ + vector[vector[string]] feats, vector[string] deprels, \ + vector[int] heads, vector[string] predicate_names, \ + vector[int] predicate_indices, \ + vector[vector[string]] argument_roles, \ + vector[vector[int]] argument_indices) + int GetNumPredicates() + string GetPredicateName(int k) + int GetPredicateIndex(int k) + int GetNumArgumentsPredicate(int k) + string GetArgumentRole(int k, int l) + int GetArgumentIndex(int k, int l) + +cdef extern from "../src/entity_recognizer/EntitySpan.h": + ctypedef NamedSpan EntitySpan + cdef cppclass NamedSpan: + NamedSpan(int start, int end, string name) + int start() + int end() + string name() + +cdef extern from "../src/coreference_resolver/CoreferenceSentence.h": + cdef cppclass CoreferenceSentence: + CoreferenceSentence() + void Initialize(string name, vector[string] forms, vector[string] lemmas, \ + vector[string] cpos, vector[string] pos, \ + vector[vector[string]] feats, vector[string] deprels, \ + vector[int] heads, vector[string] predicate_names, \ + vector[int] predicate_indices, \ + vector[vector[string]] argument_roles, \ + vector[vector[int]] argument_indices, \ + vector[string] speakers, \ + vector[NamedSpan*] entity_spans, \ + vector[NamedSpan*] constituent_spans, \ + vector[NamedSpan*] coreference_spans) + vector[NamedSpan*] GetCoreferenceSpans() + +cdef extern from "../src/coreference_resolver/CoreferenceDocument.h": + cdef cppclass CoreferenceDocument: + CoreferenceDocument() + void Initialize(string name, int part_number, \ + vector[CoreferenceSentence*] sentences) + int GetNumSentences() + CoreferenceSentence *GetSentence(int i) + +cdef extern from "../libturboparser/TurboParserInterface.h" namespace "TurboParserInterface": + cdef cppclass TurboTaggerWorker: + TurboTaggerWorker() + void LoadTaggerModel(string file_model) + void Tag(string 
file_test, string file_prediction) + void TagSentence(SequenceInstance *sentence) + + cdef cppclass TurboMorphologicalTaggerWorker: + TurboMorphologicalTaggerWorker() + void LoadMorphologicalTaggerModel(string file_model) + void Tag(string file_test, string file_prediction) + void TagSentence(MorphologicalInstance *sentence) + + cdef cppclass TurboEntityRecognizerWorker: + TurboEntityRecognizerWorker() + void LoadEntityRecognizerModel(string file_model) + void Tag(string file_test, string file_prediction) + void TagSentence(EntityInstance *sentence) + + cdef cppclass TurboParserWorker: + TurboParserWorker() + void LoadParserModel(string file_model) + void Parse(string file_test, string file_prediction) + void ParseSentence(DependencyInstance *sentence) + + cdef cppclass TurboSemanticParserWorker: + TurboSemanticParserWorker() + void LoadSemanticParserModel(string file_model) + void ParseSemanticDependencies(string file_test, string file_prediction) + void ParseSemanticDependenciesFromSentence(SemanticInstance *sentence) + + cdef cppclass TurboCoreferenceResolverWorker: + TurboCoreferenceResolverWorker() + void LoadCoreferenceResolverModel(string file_model) + void ResolveCoreferences(string file_test, string file_prediction) + void ResolveCoreferencesFromDocument(CoreferenceDocument *document) + + cdef cppclass TurboParserInterface: + TurboParserInterface() + TurboTaggerWorker* CreateTagger() + TurboMorphologicalTaggerWorker* CreateMorphologicalTagger() + TurboEntityRecognizerWorker* CreateEntityRecognizer() + TurboParserWorker* CreateParser() + TurboSemanticParserWorker* CreateSemanticParser() + TurboCoreferenceResolverWorker* CreateCoreferenceResolver() + + +# Wrap them into python extension types. 
+ +cdef class PTurboParser: + cdef TurboParserInterface *thisptr + cdef bool allocate + def __cinit__(self, allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new TurboParserInterface() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def create_tagger(self): + tagger = PTurboTaggerWorker(allocate=False) + tagger.thisptr = self.thisptr.CreateTagger() + return tagger + + def create_morphological_tagger(self): + morphological_tagger = PTurboMorphologicalTaggerWorker(allocate=False) + morphological_tagger.thisptr = self.thisptr.CreateMorphologicalTagger() + return morphological_tagger + + def create_entity_recognizer(self): + entity_recognizer = PTurboEntityRecognizerWorker(allocate=False) + entity_recognizer.thisptr = self.thisptr.CreateEntityRecognizer() + return entity_recognizer + + def create_parser(self): + parser = PTurboParserWorker(allocate=False) + parser.thisptr = self.thisptr.CreateParser() + return parser + + def create_semantic_parser(self): + semantic_parser = PTurboSemanticParserWorker(allocate=False) + semantic_parser.thisptr = self.thisptr.CreateSemanticParser() + return semantic_parser + + def create_coreference_resolver(self): + coreference_resolver = PTurboCoreferenceResolverWorker(allocate=False) + coreference_resolver.thisptr = self.thisptr.CreateCoreferenceResolver() + return coreference_resolver + +cdef class PSequenceInstance: + cdef SequenceInstance *thisptr + cdef bool allocate + def __cinit__(self, allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new SequenceInstance() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def initialize(self, vector[string] forms, vector[string] tags): + self.thisptr.Initialize(forms, tags) + + def get_tag(self, i): + return self.thisptr.GetTag(i) + +cdef class PMorphologicalInstance: + cdef MorphologicalInstance *thisptr + cdef bool allocate + def __cinit__(self, allocate=True): + self.allocate = allocate + if allocate: 
+ self.thisptr = new MorphologicalInstance() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def initialize(self, vector[string] forms, vector[string] lemmas, \ + vector[string] pos, vector[string] tags): + self.thisptr.Initialize(forms, lemmas, pos, tags) + + def get_tag(self, i): + return self.thisptr.GetTag(i) + +cdef class PEntityInstance: + cdef EntityInstance *thisptr + cdef bool allocate + def __cinit__(self, allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new EntityInstance() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def initialize(self, vector[string] forms, vector[string] pos, \ + vector[string] tags): + self.thisptr.Initialize(forms, pos, tags) + + def get_tag(self, i): + return self.thisptr.GetTag(i) + +cdef class PDependencyInstance: + cdef DependencyInstance *thisptr + cdef bool allocate + def __cinit__(self, allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new DependencyInstance() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def initialize(self, vector[string] forms, vector[string] lemmas, \ + vector[string] cpos, vector[string] pos, \ + vector[vector[string]] feats, vector[string] deprels, \ + vector[int] heads): + self.thisptr.Initialize(forms, lemmas, cpos, pos, feats, deprels, heads) + + def get_head(self, i): + return self.thisptr.GetHead(i) + + def get_dependency_relation(self, i): + return self.thisptr.GetDependencyRelation(i) + +cdef class PSemanticInstance: + cdef SemanticInstance *thisptr + cdef bool allocate + def __cinit__(self, allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new SemanticInstance() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def initialize(self, string name, vector[string] forms, \ + vector[string] lemmas, \ + vector[string] cpos, vector[string] pos, \ + vector[vector[string]] feats, vector[string] deprels, \ + vector[int] heads, vector[string] 
predicate_names, \ + vector[int] predicate_indices, \ + vector[vector[string]] argument_roles, \ + vector[vector[int]] argument_indices): + self.thisptr.Initialize(name, forms, lemmas, cpos, pos, feats, \ + deprels, heads, predicate_names, \ + predicate_indices, argument_roles, \ + argument_indices) + + def get_num_predicates(self): + return self.thisptr.GetNumPredicates() + + def get_predicate_name(self, k): + return self.thisptr.GetPredicateName(k) + + def get_predicate_index(self, k): + return self.thisptr.GetPredicateIndex(k) + + def get_num_arguments_predicate(self, k): + return self.thisptr.GetNumArgumentsPredicate(k) + + def get_argument_role(self, k, l): + return self.thisptr.GetArgumentRole(k, l) + + def get_argument_index(self, k, l): + return self.thisptr.GetArgumentIndex(k, l) + +cdef class PNamedSpan: + cdef NamedSpan *thisptr + cdef bool allocate + def __cinit__(self, start=-1, end=-1, name='', allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new NamedSpan(start, end, name) + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def start(self): + return self.thisptr.start() + + def end(self): + return self.thisptr.end() + + def name(self): + return self.thisptr.name() + +cdef class PEntitySpan(PNamedSpan): + def __cinit__(self, start=-1, end=-1, name='', allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new EntitySpan(start, end, name) + +cdef class PCoreferenceSentence: + cdef CoreferenceSentence *thisptr + cdef bool allocate + def __cinit__(self, allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new CoreferenceSentence() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def initialize(self, string name, vector[string] forms, \ + vector[string] lemmas, \ + vector[string] cpos, vector[string] pos, \ + vector[vector[string]] feats, vector[string] deprels, \ + vector[int] heads, vector[string] predicate_names, \ + vector[int] predicate_indices, 
\ + vector[vector[string]] argument_roles, \ + vector[vector[int]] argument_indices, \ + vector[string] speakers, \ + p_entity_spans, \ + p_constituent_spans, \ + p_coreference_spans): + cdef vector[EntitySpan*] entity_spans + for p_span in p_entity_spans: + entity_spans.push_back((p_span).thisptr) + cdef vector[NamedSpan*] constituent_spans + for p_span in p_constituent_spans: + constituent_spans.push_back((p_span).thisptr) + cdef vector[NamedSpan*] coreference_spans + for p_span in p_coreference_spans: + coreference_spans.push_back((p_span).thisptr) + self.thisptr.Initialize(name, forms, lemmas, cpos, pos, feats, \ + deprels, heads, predicate_names, \ + predicate_indices, argument_roles, \ + argument_indices, speakers, entity_spans, \ + constituent_spans, coreference_spans) + + def get_coreference_spans(self): + cdef vector[NamedSpan*] coreference_spans = \ + self.thisptr.GetCoreferenceSpans() + p_coreference_spans = [] + for span in coreference_spans: + p_span = PNamedSpan(allocate=False) + p_span.thisptr = span + p_coreference_spans.append(p_span) + return p_coreference_spans + +cdef class PCoreferenceDocument: + cdef CoreferenceDocument *thisptr + cdef bool allocate + def __cinit__(self, allocate=True): + self.allocate = allocate + if allocate: + self.thisptr = new CoreferenceDocument() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def initialize(self, string name, int part_number, \ + p_sentences): + cdef vector[CoreferenceSentence*] sentences + for p_sentence in p_sentences: + sentences.push_back((p_sentence).thisptr) + self.thisptr.Initialize(name, part_number, sentences) + + def get_num_sentences(self): + self.thisptr.GetNumSentences() + + def get_sentence(self, int i): + cdef CoreferenceSentence *sentence = self.thisptr.GetSentence(i) + p_sentence = PCoreferenceSentence(allocate=False) + p_sentence.thisptr = sentence + return p_sentence + +cdef class PTurboTaggerWorker: + cdef TurboTaggerWorker *thisptr + cdef bool allocate + def 
__cinit__(self, allocate=False): + self.allocate = allocate + if allocate: + self.thisptr = new TurboTaggerWorker() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def load_tagger_model(self, file_model): + self.thisptr.LoadTaggerModel(file_model) + + def tag(self, file_test, file_prediction): + self.thisptr.Tag(file_test, file_prediction) + + def tag_sentence(self, sequence_instance): + self.thisptr.TagSentence((sequence_instance).thisptr) + +cdef class PTurboMorphologicalTaggerWorker: + cdef TurboMorphologicalTaggerWorker *thisptr + cdef bool allocate + def __cinit__(self, allocate=False): + self.allocate = allocate + if allocate: + self.thisptr = new TurboMorphologicalTaggerWorker() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def load_morphological_tagger_model(self, file_model): + self.thisptr.LoadMorphologicalTaggerModel(file_model) + + def tag(self, file_test, file_prediction): + self.thisptr.Tag(file_test, file_prediction) + + def tag_sentence(self, sequence_instance): + self.thisptr.TagSentence( \ + (sequence_instance).thisptr) + +cdef class PTurboEntityRecognizerWorker: + cdef TurboEntityRecognizerWorker *thisptr + cdef bool allocate + def __cinit__(self, allocate=False): + self.allocate = allocate + if allocate: + self.thisptr = new TurboEntityRecognizerWorker() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def load_entity_recognizer_model(self, file_model): + self.thisptr.LoadEntityRecognizerModel(file_model) + + def tag(self, file_test, file_prediction): + self.thisptr.Tag(file_test, file_prediction) + + def tag_sentence(self, entity_instance): + self.thisptr.TagSentence((entity_instance).thisptr) + +cdef class PTurboParserWorker: + cdef TurboParserWorker *thisptr + cdef bool allocate + def __cinit__(self, allocate=False): + self.allocate = allocate + if allocate: + self.thisptr = new TurboParserWorker() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def 
load_parser_model(self, file_model): + self.thisptr.LoadParserModel(file_model) + + def parse(self, file_test, file_prediction): + self.thisptr.Parse(file_test, file_prediction) + + def parse_sentence(self, dependency_instance): + self.thisptr.ParseSentence( \ + (dependency_instance).thisptr) + +cdef class PTurboSemanticParserWorker: + cdef TurboSemanticParserWorker *thisptr + cdef bool allocate + def __cinit__(self, allocate=False): + self.allocate = allocate + if allocate: + self.thisptr = new TurboSemanticParserWorker() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def load_semantic_parser_model(self, file_model): + self.thisptr.LoadSemanticParserModel(file_model) + + def parse_semantic_dependencies(self, file_test, file_prediction): + self.thisptr.ParseSemanticDependencies(file_test, file_prediction) + + def parse_semantic_dependencies_from_sentence(self, semantic_instance): + self.thisptr.ParseSemanticDependenciesFromSentence( \ + (semantic_instance).thisptr) + +cdef class PTurboCoreferenceResolverWorker: + cdef TurboCoreferenceResolverWorker *thisptr + cdef bool allocate + def __cinit__(self, allocate=False): + self.allocate = allocate + if allocate: + self.thisptr = new TurboCoreferenceResolverWorker() + + def __dealloc__(self): + if self.allocate: + del self.thisptr + + def load_coreference_resolver_model(self, file_model): + self.thisptr.LoadCoreferenceResolverModel(file_model) + + def resolve_coreferences(self, file_test, file_prediction): + self.thisptr.ResolveCoreferences(file_test, file_prediction) + + def resolve_coreferences_from_document(self, coreference_document): + self.thisptr.ResolveCoreferencesFromDocument( \ + (coreference_document).thisptr) \ No newline at end of file diff --git a/scripts_morph/morph_script_test_parameters.py b/scripts_morph/morph_script_test_parameters.py index 177349a..a6e09dc 100644 --- a/scripts_morph/morph_script_test_parameters.py +++ b/scripts_morph/morph_script_test_parameters.py @@ -3,13 
+3,14 @@ import os import subprocess import time +import itertools dir = os.path.abspath( os.path.dirname(__file__) ) -NumberOfRuns=1 -NumberWarmUps=0 +number_of_runs=1 +number_of_warmups=0 -RUNNING_TEST=1 +running_test=1 if len(sys.argv) == 2: Programs = [sys.argv[1]] @@ -28,7 +29,31 @@ sys.exit() -Languages = ['basque', +timestr = time.strftime("%Y%m%d-%H%M%S") +output_log_filename_prefix = "*T**__LANGUAGE__*turbo_morphtagger_run" +output_log_filename_sufix = timestr +output_log_folder = [os.path.join(dir, '..','data_local','morph_log')] + +train_files_Folder = [os.path.join(dir, '..','data_local','morph_data')] +dev_files_Folder = [os.path.join(dir, '..','data_local','morph_data')] +test_files_Folder = [os.path.join(dir, '..','data_local','morph_data')] +model_files_Folder = [os.path.join(dir, '..','data_local','morph_models')] +prediction_files_Folder = [os.path.join(dir, '..','data_local','morph_out')] + +train_files_template = '*__LANGUAGE__*-ud-train.conllu' +dev_files_template = '*__LANGUAGE__*-ud-dev.conllu' +test_files_template = '*__LANGUAGE__*-ud-test.conllu' +model_files_template = '*T**__LANGUAGE__*_morphtagger.model_mo*__MARKOV_ORDER__*_feat*__FEATURES__*_trc*__REGCONST__*_p*__PREFIX__*s*__SUFFIX__*' +prediction_files_template = '*T**__LANGUAGE__*_morphtagger.model_mo*__MARKOV_ORDER__*_feat*__FEATURES__*_trc*__REGCONST__*_p*__PREFIX__*s*__SUFFIX__*.pred' + +train_files = [] +dev_files = [] +test_files = [] +model_files = [] +prediction_files = [] + + +languages = ['basque', 'bulgarian', 'croatian', 'czech', @@ -38,301 +63,297 @@ 'greek', 'hungarian', 'italian', -'swedish' -] - -TrainFiles_template = [os.path.join(dir, '..','data_local','morph_data','*__LANGUAGE__*-ud-train.conllu')] #Replace *__LANGUAGE__* by Languages[i] -DevFiles_template = [os.path.join(dir, '..','data_local','morph_data','*__LANGUAGE__*-ud-dev.conllu')] #Replace *__LANGUAGE__* by Languages[i] -ModelFiles_template = [os.path.join(dir, 
'..','data_local','morph_models','*T**__LANGUAGE__*_morphtagger.model_mo*__MARKOV_ORDER__*_feat*__FEATURES__*_trc*__REGCONST__*_p*__PREFIX__*s*__SUFFIX__*')] #Replace '*__LANGUAGE__*','__MARKOV_ORDER__*' '*__FEATURES__*', '*__REGCONST__*', '*__PREFIX__*', '*__SUFFIX__*' -PredictionFiles_template = [os.path.join(dir, '..','data_local','morph_out','*T**__LANGUAGE__*-train-dev.morphtagger.model_mo*__MARKOV_ORDER__*_feat*__FEATURES__*_trc*__REGCONST__*_p*__PREFIX__*s*__SUFFIX__*.predicted')] -TrainFiles = [] -DevFiles = [] -ModelFiles = [] -PredictionFiles = [] - -MorphFeatureSelection = ['0'] #['0', '1', '2'] #--morph_tagger_large_feature_set=0 -TrainAlgorithm = ['svm_mira'] #--train_algorithm=svm_mira -TrainRegularizationConstant = ['0.01'] #['1.0', '0.1', '0.01'] #--train_regularization_constant=0.01 -TrainEpochs = ['20'] #--train_epochs=20 -SequenceModelType = ['0'] #--sequence_model_type=0 -FormCutoff = ['0'] #--form_cutoff=0 -PrefixLength = ['0'] #['0','2','3'] #--prefix_length=3 -SuffixLength = ['3'] #['0','2','3'] #--suffix_length=3 +'swedish'] +morph_features_picker = ['0'] #['0', '1', '2'] #--morph_tagger_large_feature_set=0 +train_algorithms = ['svm_mira'] #--train_algorithm=svm_mira +train_regularization_constants = ['0.01'] #['1.0', '0.1', '0.01'] #--train_regularization_constant=0.01 +train_epochs_picker = ['20'] #--train_epochs=20 +sequence_model_types = ['0'] #--sequence_model_type=0 +form_cutoffs = ['0'] #--form_cutoff=0 +prefix_lengths = ['0'] #['0','2','3'] #--prefix_length=3 +suffix_lengths = ['3'] #['0','2','3'] #--suffix_length=3 #--logtostderr -OutputDesiredPrefix = "*T*turbo_morphological_tagger_run" -OutputFolderBenchmarks = [os.path.join(dir, '..','data_local','morph_log')] - -print OutputFolderBenchmarks[0] -if not os.path.exists(OutputFolderBenchmarks[0]): - os.makedirs(OutputFolderBenchmarks[0]) +if running_test == 1: + model_files_template = model_files_template.replace('*T*', "TEST_") + prediction_files_template = 
prediction_files_template.replace('*T*', "TEST_") + output_log_filename_prefix = output_log_filename_prefix.replace('*T*', "TEST_") +else: + model_files_template = model_files_template.replace('*T*', "") + prediction_files_template = prediction_files_template.replace('*T*', "") + output_log_filename_prefix = output_log_filename_prefix.replace('*T*', "") -if RUNNING_TEST == 1: - TrainFiles_template[0] = TrainFiles_template[0].replace('*T*', "TEST_") - DevFiles_template[0] = DevFiles_template[0].replace('*T*', "TEST_") - ModelFiles_template[0] = ModelFiles_template[0].replace('*T*', "TEST_") - PredictionFiles_template[0] = PredictionFiles_template[0].replace('*T*', "TEST_") - OutputDesiredPrefix = OutputDesiredPrefix.replace('*T*', "TEST_") +if len(languages) == 1: + output_log_filename_prefix = output_log_filename_prefix.replace('*__LANGUAGE__*', languages[0]) else: - TrainFiles_template[0] = TrainFiles_template[0].replace('*T*', "") - DevFiles_template[0] = DevFiles_template[0].replace('*T*', "") - ModelFiles_template[0] = ModelFiles_template[0].replace('*T*', "") - PredictionFiles_template[0] = PredictionFiles_template[0].replace('*T*', "") - OutputDesiredPrefix = OutputDesiredPrefix.replace('*T*', "") + output_log_filename_prefix = output_log_filename_prefix.replace('*__LANGUAGE__*', "") + +if not os.path.exists(output_log_folder[0]): + os.makedirs(output_log_folder[0]) +for language in languages: + temp_folder = model_files_Folder[0].replace('*__LANGUAGE__*', language) + if not os.path.exists(temp_folder): + os.makedirs(temp_folder) + temp_folder = prediction_files_Folder[0].replace('*__LANGUAGE__*', language) + if not os.path.exists(temp_folder): + os.makedirs(temp_folder) + +csv = open( os.path.join( output_log_folder[0],output_log_filename_prefix+output_log_filename_sufix+".csv") ,"wb") +log = open( os.path.join( output_log_folder[0],output_log_filename_prefix+output_log_filename_sufix+".log") ,"wb") +err = open( os.path.join( 
output_log_folder[0],output_log_filename_prefix+output_log_filename_sufix+".err") ,"wb") +pylog = open( os.path.join( output_log_folder[0],output_log_filename_prefix+output_log_filename_sufix+".pylog") ,"wb") -timestr = time.strftime("%Y%m%d-%H%M%S") -csv = open( os.path.join( OutputFolderBenchmarks[0],OutputDesiredPrefix+timestr+".csv") ,"wb") -log = open( os.path.join( OutputFolderBenchmarks[0],OutputDesiredPrefix+timestr+".log") ,"wb") -err = open( os.path.join( OutputFolderBenchmarks[0],OutputDesiredPrefix+timestr+".err") ,"wb") -pylog = open( os.path.join( OutputFolderBenchmarks[0],OutputDesiredPrefix+timestr+".pylog") ,"wb") +for i in range(len(train_files_Folder)) : + train_files.append(os.path.join( train_files_Folder[i], train_files_template)) +for i in range(len(dev_files_Folder)) : + dev_files.append(os.path.join( dev_files_Folder[i], dev_files_template)) +for i in range(len(test_files_Folder)) : + test_files.append(os.path.join( test_files_Folder[i], test_files_template)) +for i in range(len(model_files_Folder)) : + model_files.append(os.path.join( model_files_Folder[i], model_files_template)) +for i in range(len(prediction_files_Folder)) : + prediction_files.append(os.path.join( prediction_files_Folder[i], prediction_files_template)) string_to_write="" -string_to_write=string_to_write+"Program, Language, Features, Markov Order, Train Algorithm, Regularization Constant, Train Epochs, Form cutoff, Prefix Length, Suffix Length" -#string_to_write=string_to_write+", "+"Training time" #Commented for Windows execution; turn on in Linux environment -if NumberOfRuns == 1: - #string_to_write=string_to_write+", "+"Run(Test) time" #Commented for Windows execution; turn on in Linux environment - string_to_write=string_to_write+", "+"CorrectPredict" - string_to_write=string_to_write+", "+"Accuracy" - string_to_write=string_to_write+", "+"Speed (token/sec)" +string_to_write=string_to_write+"Program; Language; Features; Markov Order; Train Algorithm; Regularization 
Constant; Train Epochs; Form cutoff; Prefix Length; Suffix Length" +#string_to_write=string_to_write+"; "+"Training time" #Commented for Windows execution; turn on in Linux environment +if number_of_runs == 1: + #string_to_write=string_to_write+"; "+"Run(Test) time" #Commented for Windows execution; turn on in Linux environment + string_to_write=string_to_write+"; "+"CorrectPredict" + string_to_write=string_to_write+"; "+"Accuracy" + string_to_write=string_to_write+"; "+"Speed (token/sec)" else: - for i in range(NumberOfRuns): - #string_to_write=string_to_write+", "+"Run(Test) time["+str(i)+"]" #Commented for Windows execution; turn on in Linux environment - string_to_write=string_to_write+", "+"CorrectPredict["+str(i)+"]" - string_to_write=string_to_write+", "+"Accuracy["+str(i)+"]" - string_to_write=string_to_write+", "+"Speed["+str(i)+"] (token/sec)" + for i in range(number_of_runs): + #string_to_write=string_to_write+"; "+"Run(Test) time["+str(i)+"]" #Commented for Windows execution; turn on in Linux environment + string_to_write=string_to_write+"; "+"CorrectPredict["+str(i)+"]" + string_to_write=string_to_write+"; "+"Accuracy["+str(i)+"]" + string_to_write=string_to_write+"; "+"Speed["+str(i)+"] (token/sec)" string_to_write=string_to_write+"\n" csv.write(string_to_write) - -for program in Programs: - for language in Languages: - for features in MorphFeatureSelection: - for sequence_model_type in SequenceModelType: - for train_algorithm in TrainAlgorithm: - for train_regularization_constant in TrainRegularizationConstant: - for train_epochs in TrainEpochs: - for form_cutoff in FormCutoff: - for prefix_length in PrefixLength: - for suffix_length in SuffixLength: - - TrainFile = TrainFiles_template[0] - TrainFile = TrainFile.replace('*__LANGUAGE__*', language) - DevFile = DevFiles_template[0] - DevFile = DevFile.replace('*__LANGUAGE__*', language) - - ModelFile = ModelFiles_template[0] - ModelFile = ModelFile.replace('*__LANGUAGE__*', language) - ModelFile = 
ModelFile.replace('*__MARKOV_ORDER__*', sequence_model_type) - ModelFile = ModelFile.replace('*__FEATURES__*', features) - ModelFile = ModelFile.replace('*__REGCONST__*', train_regularization_constant) - ModelFile = ModelFile.replace('*__PREFIX__*', prefix_length) - ModelFile = ModelFile.replace('*__SUFFIX__*', suffix_length) - - PredictionFile = PredictionFiles_template[0] - PredictionFile = PredictionFile.replace('*__LANGUAGE__*', language) - PredictionFile = PredictionFile.replace('*__MARKOV_ORDER__*', sequence_model_type) - PredictionFile = PredictionFile.replace('*__FEATURES__*', features) - PredictionFile = PredictionFile.replace('*__REGCONST__*', train_regularization_constant) - PredictionFile = PredictionFile.replace('*__PREFIX__*', prefix_length) - PredictionFile = PredictionFile.replace('*__SUFFIX__*', suffix_length) - - #TRAIN - command = [] - #command.append("time -p") #Commented for Windows execution; turn on in Linux environment - command.append(program) - command.append("--train") - command.append("--file_train="+TrainFile) - command.append("--file_model="+ModelFile) - command.append("--train_algorithm="+train_algorithm) - command.append("--train_regularization_constant="+train_regularization_constant) - command.append("--train_epochs="+train_epochs) - command.append("--sequence_model_type="+sequence_model_type) - command.append("--form_cutoff="+form_cutoff) - command.append("--prefix_length="+prefix_length) - command.append("--suffix_length="+suffix_length) - command.append("--morph_tagger_large_feature_set="+features) - command.append("--logtostderr") +for program, language, features, sequence_model_type, train_algorithm, train_regularization_constant, train_epochs, form_cutoff, prefix_length, suffix_length in itertools.product(Programs, languages, morph_features_picker, sequence_model_types, train_algorithms, train_regularization_constants, train_epochs_picker, form_cutoffs, prefix_lengths, suffix_lengths): + train_file = train_files[0] + 
train_file = train_file.replace('*__LANGUAGE__*', language) + + dev_file = dev_files[0] + dev_file = dev_file.replace('*__LANGUAGE__*', language) + + test_file = test_files[0] + test_file = test_file.replace('*__LANGUAGE__*', language) + + model_file = model_files[0] + model_file = model_file.replace('*__LANGUAGE__*', language) + model_file = model_file.replace('*__MARKOV_ORDER__*', sequence_model_type) + model_file = model_file.replace('*__FEATURES__*', features) + model_file = model_file.replace('*__REGCONST__*', train_regularization_constant) + model_file = model_file.replace('*__PREFIX__*', prefix_length) + model_file = model_file.replace('*__SUFFIX__*', suffix_length) + + prediction_file = prediction_files[0] + prediction_file = prediction_file.replace('*__LANGUAGE__*', language) + prediction_file = prediction_file.replace('*__MARKOV_ORDER__*', sequence_model_type) + prediction_file = prediction_file.replace('*__FEATURES__*', features) + prediction_file = prediction_file.replace('*__REGCONST__*', train_regularization_constant) + prediction_file = prediction_file.replace('*__PREFIX__*', prefix_length) + prediction_file = prediction_file.replace('*__SUFFIX__*', suffix_length) + + #TRAIN + command = [] + #command.append("time -p") #Commented for Windows execution; turn on in Linux environment + command.append(program) + command.append("--train") + command.append("--file_train="+train_file) + command.append("--file_model="+model_file) + command.append("--train_algorithm="+train_algorithm) + command.append("--train_regularization_constant="+train_regularization_constant) + command.append("--train_epochs="+train_epochs) + command.append("--sequence_model_type="+sequence_model_type) + command.append("--form_cutoff="+form_cutoff) + command.append("--prefix_length="+prefix_length) + command.append("--suffix_length="+suffix_length) + command.append("--morph_tagger_large_feature_set="+features) + command.append("--logtostderr") + + print "Executing: " + 
sys.stdout.flush() + + print ' '.join(command) + sys.stdout.flush() + pylog.write(' '.join(command) + "\n") + log.write( ' '.join(command) + "\n") + err.write( ' '.join(command) + "\n") + #run program + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdout__output,stderr_output) = process.communicate() + print "Finished executing: " + ' '.join(command) + sys.stdout.flush() + + # print "- - - * * * - - -" + stdout__output = stdout__output.splitlines() + for line in stdout__output: + log.write(line + "\n") + # print line.rstrip("\n") + + # print "- - - * * * - - -" + stderr_output = stderr_output.splitlines() + for line in stderr_output: + err.write(line + "\n") + # print line.rstrip("\n") + where=line.find("Training took ") + if where != -1: + print line[where:] + sys.stdout.flush() + training_time = float(line[where+len("Training took "):len(line)-len(" sec.")]) + print "Training time = "+str(training_time)+" seconds\n" + sys.stdout.flush() + pylog.write("Training time = "+str(training_time)+" seconds\n") + log.write( "Training time = "+str(training_time)+" seconds\n") + err.write( "Training time = "+str(training_time)+" seconds\n") - print "Executing: " - sys.stdout.flush() + if line[0:4]=="real": + train_time = float(line[5:]) + print "time of execution = "+str(train_time)+" seconds\n" + sys.stdout.flush() + pylog.write("time of execution = "+str(train_time)+" seconds\n") + log.write( "time of execution = "+str(train_time)+" seconds\n") + err.write( "time of execution = "+str(train_time)+" seconds\n") + # print line + + #TEST + command = [] + #command.append("time -p") #Commented for Windows execution; turn on in Linux environment + command.append(program) + command.append("--test") + command.append("--evaluate") + command.append("--file_model="+model_file) + command.append("--file_test="+dev_file) + command.append("--file_prediction="+prediction_file) + command.append("--logtostderr") - print ' '.join(command) - 
sys.stdout.flush() - pylog.write(' '.join(command) + "\n") - log.write( ' '.join(command) + "\n") - err.write( ' '.join(command) + "\n") - #run program - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdout__output,stderr_output) = process.communicate() - print "Finished executing: " + ' '.join(command) - sys.stdout.flush() - - # print "- - - * * * - - -" - stdout__output = stdout__output.splitlines() - for line in stdout__output: - log.write(line + "\n") - # print line.rstrip("\n") - - # print "- - - * * * - - -" - stderr_output = stderr_output.splitlines() - for line in stderr_output: - err.write(line + "\n") - # print line.rstrip("\n") - where=line.find("Training took ") - if where != -1: - print line[where:] - sys.stdout.flush() - training_time = float(line[where+len("Training took "):len(line)-len(" sec.")]) - print "Training time = "+str(training_time)+" seconds\n" - sys.stdout.flush() - pylog.write("Training time = "+str(training_time)+" seconds\n") - log.write( "Training time = "+str(training_time)+" seconds\n") - err.write( "Training time = "+str(training_time)+" seconds\n") + print "Executing: " + sys.stdout.flush() - if line[0:4]=="real": - train_time = float(line[5:]) - print "time of execution = "+str(train_time)+" seconds\n" - sys.stdout.flush() - pylog.write("time of execution = "+str(train_time)+" seconds\n") - log.write( "time of execution = "+str(train_time)+" seconds\n") - err.write( "time of execution = "+str(train_time)+" seconds\n") - # print line - - #TEST - command = [] - #command.append("time -p") #Commented for Windows execution; turn on in Linux environment - command.append(program) - command.append("--test") - command.append("--evaluate") - command.append("--file_model="+ModelFile) - command.append("--file_test="+DevFile) - command.append("--file_prediction="+PredictionFile) - command.append("--logtostderr") + #warm-up X iterations + for iteration in range(0,number_of_warmups): + print "Warm-up #" 
+ str(iteration+1) +": " + ' '.join(command) + sys.stdout.flush() + pylog.write("Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") + log.write( "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") + err.write( "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") + #run program + process=subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdout__output,stderr_output) = process.communicate() + print "Finished executing: " + ' '.join(command) + sys.stdout.flush() - print "Executing: " - sys.stdout.flush() - - #warm-up X iterations - for iteration in range(0,NumberWarmUps): - print "Warm-up #" + str(iteration+1) +": " + ' '.join(command) - sys.stdout.flush() - pylog.write("Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") - log.write( "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") - err.write( "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") - #run program - process=subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdout__output,stderr_output) = process.communicate() - print "Finished executing: " + ' '.join(command) - sys.stdout.flush() - - test_time=[] - correct_predictions=[] - accuracy=[] - tagspeed=[] - for iteration in range(NumberOfRuns): - print "\n" - pylog.write("\n") - log.write( "\n") - err.write( "\n") - if NumberOfRuns > 1: - print "\n**** ITER "+str(iteration)+" ****\n" - sys.stdout.flush() - pylog.write("\n**** ITER "+str(iteration)+" ****\n") - log.write( "\n**** ITER "+str(iteration)+" ****\n") - err.write( "\n**** ITER "+str(iteration)+" ****\n") - print ' '.join(command) - sys.stdout.flush() - pylog.write(' '.join(command) + "\n") - log.write( ' '.join(command) + "\n") - err.write( ' '.join(command) + "\n") - #run program - process=subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdout__output,stderr_output) = process.communicate() - print "Finished executing: " + ' '.join(command) - 
sys.stdout.flush() - - # print "- - - * * * - - -" - stdout__output = stdout__output.splitlines() - for line in stdout__output: - log.write(line + "\n") - # print line.rstrip("\n") - # print "- - - * * * - - -" - stderr_output = stderr_output.splitlines() - for line in stderr_output: - err.write(line + "\n") - # print line.rstrip("\n") - where=line.find("Correct predictions:") - if where != -1: - print line[where:] - sys.stdout.flush() - correct_predictions.append( line[where+len("Correct predictions: "):] ) - print "Correct predictions: "+correct_predictions[iteration]+"\n" - sys.stdout.flush() - pylog.write("Correct predictions: "+correct_predictions[iteration]+"\n") - log.write( "Correct predictions: "+correct_predictions[iteration]+"\n") - err.write( "Correct predictions: "+correct_predictions[iteration]+"\n") - - where=line.find("Tagging accuracy: ") - if where != -1: - print line[where:] - sys.stdout.flush() - accuracy.append( float(line[where+len("Tagging accuracy: "):]) ) - print "Tagging accuracy = "+str(accuracy[iteration])+"\n" - sys.stdout.flush() - pylog.write("Tagging accuracy = "+str(accuracy[iteration])+"\n") - log.write( "Tagging accuracy = "+str(accuracy[iteration])+"\n") - err.write( "Tagging accuracy = "+str(accuracy[iteration])+"\n") - - where=line.find("Tagging speed: ") - if where != -1: - print line[where:] - sys.stdout.flush() - tagspeed.append( float(line[where+len("Tagging speed: "):len(line) - len(" tokens per second.")]) ) - print "Tagging speed = "+str(tagspeed[iteration])+"\n" - sys.stdout.flush() - pylog.write("Tagging speed = "+str(tagspeed[iteration])+"\n") - log.write( "Tagging speed = "+str(tagspeed[iteration])+"\n") - err.write( "Tagging speed = "+str(tagspeed[iteration])+"\n") - - if line[0:4]=="real": - test_time.append( float(line[5:]) ) - print "time of execution = "+str(test_time[iteration])+" seconds\n" - sys.stdout.flush() - pylog.write("time of execution = "+str(test_time[iteration])+" seconds\n") - log.write( "time of 
execution = "+str(test_time[iteration])+" seconds\n") - err.write( "time of execution = "+str(test_time[iteration])+" seconds\n") - # print line - - string_to_write="" - string_to_write=string_to_write+program - string_to_write=string_to_write+","+language - string_to_write=string_to_write+","+features - string_to_write=string_to_write+","+sequence_model_type - string_to_write=string_to_write+","+train_algorithm - string_to_write=string_to_write+","+train_regularization_constant - string_to_write=string_to_write+","+train_epochs - string_to_write=string_to_write+","+form_cutoff - string_to_write=string_to_write+","+prefix_length - string_to_write=string_to_write+","+suffix_length - - #string_to_write=string_to_write+","+str(train_time) #Commented for Windows execution; turn on in Linux environment + test_time=[] + correct_predictions=[] + accuracy=[] + tagspeed=[] + for iteration in range(number_of_runs): + print "\n" + pylog.write("\n") + log.write( "\n") + err.write( "\n") + if number_of_runs > 1: + print "\n**** ITER "+str(iteration)+" ****\n" + sys.stdout.flush() + pylog.write("\n**** ITER "+str(iteration)+" ****\n") + log.write( "\n**** ITER "+str(iteration)+" ****\n") + err.write( "\n**** ITER "+str(iteration)+" ****\n") + print ' '.join(command) + sys.stdout.flush() + pylog.write(' '.join(command) + "\n") + log.write( ' '.join(command) + "\n") + err.write( ' '.join(command) + "\n") + #run program + process=subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdout__output,stderr_output) = process.communicate() + print "Finished executing: " + ' '.join(command) + sys.stdout.flush() + + # print "- - - * * * - - -" + stdout__output = stdout__output.splitlines() + for line in stdout__output: + log.write(line + "\n") + # print line.rstrip("\n") + # print "- - - * * * - - -" + stderr_output = stderr_output.splitlines() + for line in stderr_output: + err.write(line + "\n") + # print line.rstrip("\n") + where=line.find("Correct 
predictions:") + if where != -1: + print line[where:] + sys.stdout.flush() + correct_predictions.append( line[where+len("Correct predictions: "):] ) + print "Correct predictions: "+correct_predictions[iteration]+"\n" + sys.stdout.flush() + pylog.write("Correct predictions: "+correct_predictions[iteration]+"\n") + log.write( "Correct predictions: "+correct_predictions[iteration]+"\n") + err.write( "Correct predictions: "+correct_predictions[iteration]+"\n") + + where=line.find("Tagging accuracy: ") + if where != -1: + print line[where:] + sys.stdout.flush() + accuracy.append( float(line[where+len("Tagging accuracy: "):]) ) + print "Tagging accuracy = "+str(accuracy[iteration])+"\n" + sys.stdout.flush() + pylog.write("Tagging accuracy = "+str(accuracy[iteration])+"\n") + log.write( "Tagging accuracy = "+str(accuracy[iteration])+"\n") + err.write( "Tagging accuracy = "+str(accuracy[iteration])+"\n") + + where=line.find("Tagging speed: ") + if where != -1: + print line[where:] + sys.stdout.flush() + tagspeed.append( float(line[where+len("Tagging speed: "):len(line) - len(" tokens per second.")]) ) + print "Tagging speed = "+str(tagspeed[iteration])+"\n" + sys.stdout.flush() + pylog.write("Tagging speed = "+str(tagspeed[iteration])+"\n") + log.write( "Tagging speed = "+str(tagspeed[iteration])+"\n") + err.write( "Tagging speed = "+str(tagspeed[iteration])+"\n") + + if line[0:4]=="real": + test_time.append( float(line[5:]) ) + print "time of execution = "+str(test_time[iteration])+" seconds\n" + sys.stdout.flush() + pylog.write("time of execution = "+str(test_time[iteration])+" seconds\n") + log.write( "time of execution = "+str(test_time[iteration])+" seconds\n") + err.write( "time of execution = "+str(test_time[iteration])+" seconds\n") + # print line + + string_to_write="" + string_to_write=string_to_write+program + string_to_write=string_to_write+";"+language + string_to_write=string_to_write+";"+features + string_to_write=string_to_write+";"+sequence_model_type + 
string_to_write=string_to_write+";"+train_algorithm + string_to_write=string_to_write+";"+train_regularization_constant + string_to_write=string_to_write+";"+train_epochs + string_to_write=string_to_write+";"+form_cutoff + string_to_write=string_to_write+";"+prefix_length + string_to_write=string_to_write+";"+suffix_length + + #string_to_write=string_to_write+";"+str(train_time) #Commented for Windows execution; turn on in Linux environment - for i in range(NumberOfRuns): - #string_to_write=string_to_write+","+str(test_time[i]) #Commented for Windows execution; turn on in Linux environment - string_to_write=string_to_write+","+str(correct_predictions[i]) - string_to_write=string_to_write+","+str(accuracy[i]) - string_to_write=string_to_write+","+str(tagspeed[i]) - string_to_write=string_to_write+"\n" - - csv.write(string_to_write) + for i in range(number_of_runs): + #string_to_write=string_to_write+";"+str(test_time[i]) #Commented for Windows execution; turn on in Linux environment + string_to_write=string_to_write+";"+str(correct_predictions[i]) + string_to_write=string_to_write+";"+str(accuracy[i]) + string_to_write=string_to_write+";"+str(tagspeed[i]) + string_to_write=string_to_write+"\n" + + csv.write(string_to_write) - csv.flush() - log.flush() - pylog.flush() - err.flush() + csv.flush() + log.flush() + pylog.flush() + err.flush() #CLOSING csv.close() diff --git a/scripts_ner/entity_script_test_parameters.py b/scripts_ner/entity_script_test_parameters.py index 7766015..c7f3633 100644 --- a/scripts_ner/entity_script_test_parameters.py +++ b/scripts_ner/entity_script_test_parameters.py @@ -3,13 +3,14 @@ import os import subprocess import time +import itertools dir = os.path.abspath( os.path.dirname(__file__) ) -NumberOfRuns=1 -NumberWarmUps=0 +number_of_runs=1 +number_of_warmups=0 -RUNNING_TEST=1 +running_test=1 if len(sys.argv) == 2: Programs = [sys.argv[1]] @@ -28,99 +29,100 @@ sys.exit() -timestr = time.strftime("%Y%m%d-%H%M%S") -OutputLogFilenamePrefix = 
"*T**__LANGUAGE__*_turbo_entityrecogn_run" -OutputLogFilenameSufix = timestr -OutputLog_Folder = [os.path.join(dir, '..','ner','logs')] +timestr = time.strftime("%Y%m%d-%H%M%S") +output_log_filename_prefix = "*T**__LANGUAGE__*_turbo_entityrecogn_run" +output_log_filename_sufix = timestr +output_log_folder = [os.path.join(dir, '..','ner','logs')] -TrainFiles_Folder = [os.path.join(dir, '..','ner','data','*__LANGUAGE__*')] -DevFiles_Folder = [os.path.join(dir, '..','ner','data','*__LANGUAGE__*')] -TestFiles_Folder = [os.path.join(dir, '..','ner','data','*__LANGUAGE__*')] -GazetteersFiles_Folder = [os.path.join(dir, '..','ner','data','*__LANGUAGE__*')] -ModelFiles_Folder = [os.path.join(dir, '..','ner','models','*__LANGUAGE__*')] -PredictionFiles_Folder = [os.path.join(dir, '..','ner','results','*__LANGUAGE__*')] - -TrainFiles_template = '*__LANGUAGE__*_train.conll.ner' -DevFiles_template = '*__LANGUAGE__*_dev.conll.ner' -TestFiles_template = '*__LANGUAGE__*_test.conll.ner' -GazetteersFiles_template = '*__LANGUAGE__*_all_gazetteers.txt' -ModelFiles_template = '*T**__LANGUAGE__*_entityrecogn.model_mo*__MARKOV_ORDER__*_trc*__REGCONST__*_p*__PREFIX__*s*__SUFFIX__*' -PredictionFiles_template = '*T**__LANGUAGE__*_entityrecogn.model_mo*__MARKOV_ORDER__*_trc*__REGCONST__*_p*__PREFIX__*s*__SUFFIX__*.pred' +train_files_Folder = [os.path.join(dir, '..','ner','data','*__LANGUAGE__*')] +dev_files_Folder = [os.path.join(dir, '..','ner','data','*__LANGUAGE__*')] +test_files_Folder = [os.path.join(dir, '..','ner','data','*__LANGUAGE__*')] +gazetteers_files_Folder = [os.path.join(dir, '..','ner','data','*__LANGUAGE__*')] +model_files_Folder = [os.path.join(dir, '..','ner','models','*__LANGUAGE__*')] +prediction_files_Folder = [os.path.join(dir, '..','ner','results','*__LANGUAGE__*')] + +train_files_template = '*__LANGUAGE__*_train.conll.ner' +dev_files_template = '*__LANGUAGE__*_dev.conll.ner' +test_files_template = '*__LANGUAGE__*_test.conll.ner' +gazetteers_files_template = 
'*__LANGUAGE__*_all_gazetteers.txt' +model_files_template = '*T**__LANGUAGE__*_entityrecogn.model_mo*__MARKOV_ORDER__*_feat*__FEATURES__*_trc*__REGCONST__*_fco*__form_cutoffs__*_p*__PREFIX__*s*__SUFFIX__*' +prediction_files_template = '*T**__LANGUAGE__*_entityrecogn.model_mo*__MARKOV_ORDER__*_feat*__FEATURES__*_trc*__REGCONST__*_fco*__form_cutoffs__*_p*__PREFIX__*s*__SUFFIX__*.pred' -TrainFiles = [] -DevFiles = [] -TestFiles = [] -GazetteersFiles = [] -ModelFiles = [] -PredictionFiles = [] +train_files = [] +dev_files = [] +test_files = [] +gazetteers_files = [] +model_files = [] +prediction_files = [] -Languages = [# 'basque', 'bulgarian','croatian', 'czech', 'danish', 'finnish', 'greek', 'hungarian', 'italian', 'swedish', +languages = [# 'basque', 'bulgarian','croatian', 'czech', 'danish', 'finnish', 'greek', 'hungarian', 'italian', 'swedish', 'english'] -TrainAlgorithm = ['svm_mira'] #--train_algorithm=svm_mira -TrainRegularizationConstant = ['10','1','0.1','0.01'] #['1.0', '0.1', '0.01'] #--train_regularization_constant=0.01 -TrainEpochs = ['5', '50'] #--train_epochs=20 -SequenceModelType = ['0','1','2'] #--sequence_model_type=0 -FormCutoff = ['0'] #--form_cutoff=0 -PrefixLength = ['0','3'] #['0','2','3'] #--prefix_length=3 -SuffixLength = ['0','3'] #['0','2','3'] #--suffix_length=3 -EntityTaggingScheme = ['bio'] +train_algorithms = ['svm_mira'] #--train_algorithm=svm_mira +train_regularization_constants = ['0.01'] #['1.0', '0.1', '0.01'] #--train_regularization_constant=0.01 +train_epochs_picker = ['50'] #--train_epochs=20 +sequence_model_types = ['0','1','2'] #--sequence_model_type=0 +form_cutoffs = ['0'] #--form_cutoff=0 +prefix_lengths = ['3'] #['0','2','3'] #--prefix_length=3 +suffix_lengths = ['3'] #['0','2','3'] #--suffix_length=3 +entity_tagging_schemes = ['bio'] +entity_features_picker = ['0', '1', '2', '3', '4', '5', '6', '7'] #['0', '1', '2'] #--entity_recognizer_large_feature_set=0 #--logtostderr -if RUNNING_TEST == 1: - ModelFiles_template = 
ModelFiles_template.replace('*T*', "TEST_") - PredictionFiles_template = PredictionFiles_template.replace('*T*', "TEST_") - OutputLogFilenamePrefix = OutputLogFilenamePrefix.replace('*T*', "TEST_") +if running_test == 1: + model_files_template = model_files_template.replace('*T*', "TEST_") + prediction_files_template = prediction_files_template.replace('*T*', "TEST_") + output_log_filename_prefix = output_log_filename_prefix.replace('*T*', "TEST_") else: - ModelFiles_template = ModelFiles_template.replace('*T*', "") - PredictionFiles_template = PredictionFiles_template.replace('*T*', "") - OutputLogFilenamePrefix = OutputLogFilenamePrefix.replace('*T*', "") + model_files_template = model_files_template.replace('*T*', "") + prediction_files_template = prediction_files_template.replace('*T*', "") + output_log_filename_prefix = output_log_filename_prefix.replace('*T*', "") -if len(Languages) == 1: - OutputLogFilenamePrefix = OutputLogFilenamePrefix.replace('*__LANGUAGE__*', Languages[0]) +if len(languages) == 1: + output_log_filename_prefix = output_log_filename_prefix.replace('*__LANGUAGE__*', languages[0]) else: - OutputLogFilenamePrefix = OutputLogFilenamePrefix.replace('*__LANGUAGE__*', "") + output_log_filename_prefix = output_log_filename_prefix.replace('*__LANGUAGE__*', "") -if not os.path.exists(OutputLog_Folder[0]): - os.makedirs(OutputLog_Folder[0]) +if not os.path.exists(output_log_folder[0]): + os.makedirs(output_log_folder[0]) -for language in Languages: - temp_folder = ModelFiles_Folder[0].replace('*__LANGUAGE__*', language) +for language in languages: + temp_folder = model_files_Folder[0].replace('*__LANGUAGE__*', language) if not os.path.exists(temp_folder): os.makedirs(temp_folder) - temp_folder = PredictionFiles_Folder[0].replace('*__LANGUAGE__*', language) + temp_folder = prediction_files_Folder[0].replace('*__LANGUAGE__*', language) if not os.path.exists(temp_folder): os.makedirs(temp_folder) -csv = open( os.path.join( 
OutputLog_Folder[0],OutputLogFilenamePrefix+OutputLogFilenameSufix+".csv") ,"wb") -log = open( os.path.join( OutputLog_Folder[0],OutputLogFilenamePrefix+OutputLogFilenameSufix+".log") ,"wb") -err = open( os.path.join( OutputLog_Folder[0],OutputLogFilenamePrefix+OutputLogFilenameSufix+".err") ,"wb") -pylog = open( os.path.join( OutputLog_Folder[0],OutputLogFilenamePrefix+OutputLogFilenameSufix+".pylog") ,"wb") +csv = open( os.path.join( output_log_folder[0],output_log_filename_prefix+output_log_filename_sufix+".csv") ,"wb") +log = open( os.path.join( output_log_folder[0],output_log_filename_prefix+output_log_filename_sufix+".log") ,"wb") +err = open( os.path.join( output_log_folder[0],output_log_filename_prefix+output_log_filename_sufix+".err") ,"wb") +pylog = open( os.path.join( output_log_folder[0],output_log_filename_prefix+output_log_filename_sufix+".pylog") ,"wb") -for i in range(len(TrainFiles_Folder)) : - TrainFiles.append(os.path.join( TrainFiles_Folder[i], TrainFiles_template)) -for i in range(len(DevFiles_Folder)) : - DevFiles.append(os.path.join( DevFiles_Folder[i], DevFiles_template)) -for i in range(len(TestFiles_Folder)) : - TestFiles.append(os.path.join( TestFiles_Folder[i], TestFiles_template)) -for i in range(len(GazetteersFiles_Folder)) : - GazetteersFiles.append(os.path.join( GazetteersFiles_Folder[i], GazetteersFiles_template)) -for i in range(len(ModelFiles_Folder)) : - ModelFiles.append(os.path.join( ModelFiles_Folder[i], ModelFiles_template)) -for i in range(len(PredictionFiles_Folder)) : - PredictionFiles.append(os.path.join( PredictionFiles_Folder[i], PredictionFiles_template)) +for i in range(len(train_files_Folder)) : + train_files.append(os.path.join( train_files_Folder[i], train_files_template)) +for i in range(len(dev_files_Folder)) : + dev_files.append(os.path.join( dev_files_Folder[i], dev_files_template)) +for i in range(len(test_files_Folder)) : + test_files.append(os.path.join( test_files_Folder[i], test_files_template)) +for i in 
range(len(gazetteers_files_Folder)) : + gazetteers_files.append(os.path.join( gazetteers_files_Folder[i], gazetteers_files_template)) +for i in range(len(model_files_Folder)) : + model_files.append(os.path.join( model_files_Folder[i], model_files_template)) +for i in range(len(prediction_files_Folder)) : + prediction_files.append(os.path.join( prediction_files_Folder[i], prediction_files_template)) string_to_write="" -string_to_write=string_to_write+"Program; Language; Markov Order; Train Algorithm; Regularization Constant; Train Epochs; Form cutoff; Prefix Length; Suffix Length" +string_to_write=string_to_write+"Program; Language; Features; Markov Order; Train Algorithm; Regularization Constant; Train Epochs; Form cutoff; Prefix Length; Suffix Length" #string_to_write=string_to_write+"; "+"Training time" #Commented for Windows execution; turn on in Linux environment -if NumberOfRuns == 1: +if number_of_runs == 1: #string_to_write=string_to_write+"; "+"Run(Test) time" #Commented for Windows execution; turn on in Linux environment string_to_write=string_to_write+"; "+"CorrectPredict" string_to_write=string_to_write+"; "+"Accuracy" string_to_write=string_to_write+"; "+"Speed (token/sec)" else: - for i in range(NumberOfRuns): + for i in range(number_of_runs): #string_to_write=string_to_write+"; "+"Run(Test) time["+str(i)+"]" #Commented for Windows execution; turn on in Linux environment string_to_write=string_to_write+"; "+"CorrectPredict["+str(i)+"]" string_to_write=string_to_write+"; "+"Accuracy["+str(i)+"]" @@ -128,239 +130,235 @@ string_to_write=string_to_write+"\n" csv.write(string_to_write) - -for program in Programs: - for language in Languages: - for sequence_model_type in SequenceModelType: - for train_algorithm in TrainAlgorithm: - for train_regularization_constant in TrainRegularizationConstant: - for train_epochs in TrainEpochs: - for form_cutoff in FormCutoff: - for prefix_length in PrefixLength: - for suffix_length in SuffixLength: - for tagging_scheme 
in EntityTaggingScheme: - - TrainFile = TrainFiles[0] - TrainFile = TrainFile.replace('*__LANGUAGE__*', language) - DevFile = DevFiles[0] - DevFile = DevFile.replace('*__LANGUAGE__*', language) - - TestFile = TestFiles[0] - TestFile = TestFile.replace('*__LANGUAGE__*', language) - - GazetteersFile = GazetteersFiles[0] - GazetteersFile = GazetteersFile.replace('*__LANGUAGE__*', language) - - ModelFile = ModelFiles[0] - ModelFile = ModelFile.replace('*__LANGUAGE__*', language) - ModelFile = ModelFile.replace('*__MARKOV_ORDER__*', sequence_model_type) - ModelFile = ModelFile.replace('*__REGCONST__*', train_regularization_constant) - ModelFile = ModelFile.replace('*__PREFIX__*', prefix_length) - ModelFile = ModelFile.replace('*__SUFFIX__*', suffix_length) - - PredictionFile = PredictionFiles[0] - PredictionFile = PredictionFile.replace('*__LANGUAGE__*', language) - PredictionFile = PredictionFile.replace('*__MARKOV_ORDER__*', sequence_model_type) - PredictionFile = PredictionFile.replace('*__REGCONST__*', train_regularization_constant) - PredictionFile = PredictionFile.replace('*__PREFIX__*', prefix_length) - PredictionFile = PredictionFile.replace('*__SUFFIX__*', suffix_length) - - #TRAIN - command = [] - #command.append("time -p") #Commented for Windows execution; turn on in Linux environment - command.append(program) - command.append("--train") - command.append("--file_train="+TrainFile) - command.append("--file_model="+ModelFile) - command.append("--entity_file_gazetteer="+GazetteersFile) - command.append("--train_algorithm="+train_algorithm) - command.append("--train_regularization_constant="+train_regularization_constant) - command.append("--sequence_model_type="+sequence_model_type) - command.append("--entity_tagging_scheme="+tagging_scheme) - command.append("--train_epochs="+train_epochs) - command.append("--form_cutoff="+form_cutoff) - command.append("--prefix_length="+prefix_length) - command.append("--suffix_length="+suffix_length) - 
command.append("--logtostderr") +for program, language, sequence_model_type, features, train_algorithm, train_regularization_constant, train_epochs, form_cutoff, prefix_length, suffix_length, tagging_scheme in itertools.product(Programs, languages, sequence_model_types, entity_features_picker, train_algorithms, train_regularization_constants, train_epochs_picker, form_cutoffs, prefix_lengths, suffix_lengths, entity_tagging_schemes): + train_file = train_files[0] + train_file = train_file.replace('*__LANGUAGE__*', language) + + dev_file = dev_files[0] + dev_file = dev_file.replace('*__LANGUAGE__*', language) + + test_file = test_files[0] + test_file = test_file.replace('*__LANGUAGE__*', language) + + gazetteers_file = gazetteers_files[0] + gazetteers_file = gazetteers_file.replace('*__LANGUAGE__*', language) + + model_file = model_files[0] + model_file = model_file.replace('*__LANGUAGE__*', language) + model_file = model_file.replace('*__MARKOV_ORDER__*', sequence_model_type) + model_file = model_file.replace('*__FEATURES__*', features) + model_file = model_file.replace('*__REGCONST__*', train_regularization_constant) + model_file = model_file.replace('*__form_cutoffs__*', form_cutoff) + model_file = model_file.replace('*__PREFIX__*', prefix_length) + model_file = model_file.replace('*__SUFFIX__*', suffix_length) + + prediction_file = prediction_files[0] + prediction_file = prediction_file.replace('*__LANGUAGE__*', language) + prediction_file = prediction_file.replace('*__MARKOV_ORDER__*', sequence_model_type) + prediction_file = prediction_file.replace('*__FEATURES__*', features) + prediction_file = prediction_file.replace('*__REGCONST__*', train_regularization_constant) + prediction_file = prediction_file.replace('*__form_cutoffs__*', form_cutoff) + prediction_file = prediction_file.replace('*__PREFIX__*', prefix_length) + prediction_file = prediction_file.replace('*__SUFFIX__*', suffix_length) + + #TRAIN + command = [] + #command.append("time -p") #Commented for 
Windows execution; turn on in Linux environment + command.append(program) + command.append("--train") + command.append("--file_train="+train_file) + command.append("--file_model="+model_file) + command.append("--entity_file_gazetteer="+gazetteers_file) + command.append("--train_algorithm="+train_algorithm) + command.append("--train_regularization_constant="+train_regularization_constant) + command.append("--sequence_model_type="+sequence_model_type) + command.append("--entity_tagging_scheme="+tagging_scheme) + command.append("--train_epochs="+train_epochs) + command.append("--form_cutoff="+form_cutoff) + command.append("--prefix_length="+prefix_length) + command.append("--suffix_length="+suffix_length) + command.append("--entity_recognizer_large_feature_set="+features) + command.append("--logtostderr") + + print "Executing: " + sys.stdout.flush() - print "Executing: " - sys.stdout.flush() + print ' '.join(command) + sys.stdout.flush() + pylog.write(' '.join(command) + "\n") + log.write( ' '.join(command) + "\n") + err.write( ' '.join(command) + "\n") + #run program + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdout__output,stderr_output) = process.communicate() + print "Finished executing: " + ' '.join(command) + sys.stdout.flush() + + # print "- - - * * * - - -" + stdout__output = stdout__output.splitlines() + for line in stdout__output: + log.write(line + "\n") + # print line.rstrip("\n") + + # print "- - - * * * - - -" + stderr_output = stderr_output.splitlines() + for line in stderr_output: + err.write(line + "\n") + # print line.rstrip("\n") + where=line.find("Training took ") + if where != -1: + print line[where:] + sys.stdout.flush() + training_time = float(line[where+len("Training took "):len(line)-len(" sec.")]) + print "Training time = "+str(training_time)+" seconds\n" + sys.stdout.flush() + pylog.write("Training time = "+str(training_time)+" seconds\n") + log.write( "Training time = "+str(training_time)+" 
seconds\n") + err.write( "Training time = "+str(training_time)+" seconds\n") - print ' '.join(command) - sys.stdout.flush() - pylog.write(' '.join(command) + "\n") - log.write( ' '.join(command) + "\n") - err.write( ' '.join(command) + "\n") - #run program - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdout__output,stderr_output) = process.communicate() - print "Finished executing: " + ' '.join(command) - sys.stdout.flush() - - # print "- - - * * * - - -" - stdout__output = stdout__output.splitlines() - for line in stdout__output: - log.write(line + "\n") - # print line.rstrip("\n") - - # print "- - - * * * - - -" - stderr_output = stderr_output.splitlines() - for line in stderr_output: - err.write(line + "\n") - # print line.rstrip("\n") - where=line.find("Training took ") - if where != -1: - print line[where:] - sys.stdout.flush() - training_time = float(line[where+len("Training took "):len(line)-len(" sec.")]) - print "Training time = "+str(training_time)+" seconds\n" - sys.stdout.flush() - pylog.write("Training time = "+str(training_time)+" seconds\n") - log.write( "Training time = "+str(training_time)+" seconds\n") - err.write( "Training time = "+str(training_time)+" seconds\n") + if line[0:4]=="real": + train_time = float(line[5:]) + print "time of execution = "+str(train_time)+" seconds\n" + sys.stdout.flush() + pylog.write("time of execution = "+str(train_time)+" seconds\n") + log.write( "time of execution = "+str(train_time)+" seconds\n") + err.write( "time of execution = "+str(train_time)+" seconds\n") + # print line + + #TEST + command = [] + #command.append("time -p") #Commented for Windows execution; turn on in Linux environment + command.append(program) + command.append("--test") + command.append("--evaluate") + command.append("--file_model="+model_file) + command.append("--file_test="+test_file) + command.append("--file_prediction="+prediction_file) + 
command.append("--entity_tagging_scheme="+tagging_scheme) + command.append("--logtostderr") - if line[0:4]=="real": - train_time = float(line[5:]) - print "time of execution = "+str(train_time)+" seconds\n" - sys.stdout.flush() - pylog.write("time of execution = "+str(train_time)+" seconds\n") - log.write( "time of execution = "+str(train_time)+" seconds\n") - err.write( "time of execution = "+str(train_time)+" seconds\n") - # print line - - #TEST - command = [] - #command.append("time -p") #Commented for Windows execution; turn on in Linux environment - command.append(program) - command.append("--test") - command.append("--evaluate") - command.append("--file_model="+ModelFile) - command.append("--file_test="+TestFile) - command.append("--file_prediction="+PredictionFile) - command.append("--entity_tagging_scheme="+tagging_scheme) - command.append("--logtostderr") + print "Executing: " + sys.stdout.flush() - print "Executing: " - sys.stdout.flush() - - #warm-up X iterations - for iteration in range(0,NumberWarmUps): - print "Warm-up #" + str(iteration+1) +": " + ' '.join(command) - sys.stdout.flush() - pylog.write("Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") - log.write( "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") - err.write( "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") - #run program - process=subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdout__output,stderr_output) = process.communicate() - print "Finished executing: " + ' '.join(command) - sys.stdout.flush() - - test_time=[] - correct_predictions=[] - accuracy=[] - tagspeed=[] - for iteration in range(NumberOfRuns): - print "\n" - pylog.write("\n") - log.write( "\n") - err.write( "\n") - if NumberOfRuns > 1: - print "\n**** ITER "+str(iteration)+" ****\n" - sys.stdout.flush() - pylog.write("\n**** ITER "+str(iteration)+" ****\n") - log.write( "\n**** ITER "+str(iteration)+" ****\n") - err.write( "\n**** ITER 
"+str(iteration)+" ****\n") - print ' '.join(command) - sys.stdout.flush() - pylog.write(' '.join(command) + "\n") - log.write( ' '.join(command) + "\n") - err.write( ' '.join(command) + "\n") - #run program - process=subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdout__output,stderr_output) = process.communicate() - print "Finished executing: " + ' '.join(command) - sys.stdout.flush() - - # print "- - - * * * - - -" - stdout__output = stdout__output.splitlines() - for line in stdout__output: - log.write(line + "\n") - # print line.rstrip("\n") - # print "- - - * * * - - -" - stderr_output = stderr_output.splitlines() - for line in stderr_output: - err.write(line + "\n") - # print line.rstrip("\n") - where=line.find("Correct predictions:") - if where != -1: - print line[where:] - sys.stdout.flush() - correct_predictions.append( line[where+len("Correct predictions: "):] ) - print "Correct predictions: "+correct_predictions[iteration]+"\n" - sys.stdout.flush() - pylog.write("Correct predictions: "+correct_predictions[iteration]+"\n") - log.write( "Correct predictions: "+correct_predictions[iteration]+"\n") - err.write( "Correct predictions: "+correct_predictions[iteration]+"\n") - - where=line.find("Tagging accuracy: ") - if where != -1: - print line[where:] - sys.stdout.flush() - accuracy.append( float(line[where+len("Tagging accuracy: "):]) ) - print "Tagging accuracy = "+str(accuracy[iteration])+"\n" - sys.stdout.flush() - pylog.write("Tagging accuracy = "+str(accuracy[iteration])+"\n") - log.write( "Tagging accuracy = "+str(accuracy[iteration])+"\n") - err.write( "Tagging accuracy = "+str(accuracy[iteration])+"\n") - - where=line.find("Tagging speed: ") - if where != -1: - print line[where:] - sys.stdout.flush() - tagspeed.append( float(line[where+len("Tagging speed: "):len(line) - len(" tokens per second.")]) ) - print "Tagging speed = "+str(tagspeed[iteration])+"\n" - sys.stdout.flush() - pylog.write("Tagging speed = 
"+str(tagspeed[iteration])+"\n") - log.write( "Tagging speed = "+str(tagspeed[iteration])+"\n") - err.write( "Tagging speed = "+str(tagspeed[iteration])+"\n") - - if line[0:4]=="real": - test_time.append( float(line[5:]) ) - print "time of execution = "+str(test_time[iteration])+" seconds\n" - sys.stdout.flush() - pylog.write("time of execution = "+str(test_time[iteration])+" seconds\n") - log.write( "time of execution = "+str(test_time[iteration])+" seconds\n") - err.write( "time of execution = "+str(test_time[iteration])+" seconds\n") - # print line - - string_to_write="" - string_to_write=string_to_write+program - string_to_write=string_to_write+";"+language - string_to_write=string_to_write+";"+sequence_model_type - string_to_write=string_to_write+";"+train_algorithm - string_to_write=string_to_write+";"+train_regularization_constant - string_to_write=string_to_write+";"+train_epochs - string_to_write=string_to_write+";"+form_cutoff - string_to_write=string_to_write+";"+prefix_length - string_to_write=string_to_write+";"+suffix_length - - #string_to_write=string_to_write+";"+str(train_time) #Commented for Windows execution; turn on in Linux environment + #warm-up X iterations + for iteration in range(0,number_of_warmups): + print "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + sys.stdout.flush() + pylog.write("Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") + log.write( "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") + err.write( "Warm-up #" + str(iteration+1) +": " + ' '.join(command) + "\n") + #run program + process=subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdout__output,stderr_output) = process.communicate() + print "Finished executing: " + ' '.join(command) + sys.stdout.flush() - for i in range(NumberOfRuns): - #string_to_write=string_to_write+";"+str(test_time[i]) #Commented for Windows execution; turn on in Linux environment - 
string_to_write=string_to_write+";"+str(correct_predictions[i]) - string_to_write=string_to_write+";"+str(accuracy[i]) - string_to_write=string_to_write+";"+str(tagspeed[i]) - string_to_write=string_to_write+"\n" - - csv.write(string_to_write) + test_time=[] + correct_predictions=[] + accuracy=[] + tagspeed=[] + for iteration in range(number_of_runs): + print "\n" + pylog.write("\n") + log.write( "\n") + err.write( "\n") + if number_of_runs > 1: + print "\n**** ITER "+str(iteration)+" ****\n" + sys.stdout.flush() + pylog.write("\n**** ITER "+str(iteration)+" ****\n") + log.write( "\n**** ITER "+str(iteration)+" ****\n") + err.write( "\n**** ITER "+str(iteration)+" ****\n") + print ' '.join(command) + sys.stdout.flush() + pylog.write(' '.join(command) + "\n") + log.write( ' '.join(command) + "\n") + err.write( ' '.join(command) + "\n") + #run program + process=subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdout__output,stderr_output) = process.communicate() + print "Finished executing: " + ' '.join(command) + sys.stdout.flush() + + # print "- - - * * * - - -" + stdout__output = stdout__output.splitlines() + for line in stdout__output: + log.write(line + "\n") + # print line.rstrip("\n") + # print "- - - * * * - - -" + stderr_output = stderr_output.splitlines() + for line in stderr_output: + err.write(line + "\n") + # print line.rstrip("\n") + where=line.find("Correct predictions:") + if where != -1: + print line[where:] + sys.stdout.flush() + correct_predictions.append( line[where+len("Correct predictions: "):] ) + print "Correct predictions: "+correct_predictions[iteration]+"\n" + sys.stdout.flush() + pylog.write("Correct predictions: "+correct_predictions[iteration]+"\n") + log.write( "Correct predictions: "+correct_predictions[iteration]+"\n") + err.write( "Correct predictions: "+correct_predictions[iteration]+"\n") + + where=line.find("Tagging accuracy: ") + if where != -1: + print line[where:] + sys.stdout.flush() + 
accuracy.append( float(line[where+len("Tagging accuracy: "):]) ) + print "Tagging accuracy = "+str(accuracy[iteration])+"\n" + sys.stdout.flush() + pylog.write("Tagging accuracy = "+str(accuracy[iteration])+"\n") + log.write( "Tagging accuracy = "+str(accuracy[iteration])+"\n") + err.write( "Tagging accuracy = "+str(accuracy[iteration])+"\n") + + where=line.find("Tagging speed: ") + if where != -1: + print line[where:] + sys.stdout.flush() + tagspeed.append( float(line[where+len("Tagging speed: "):len(line) - len(" tokens per second.")]) ) + print "Tagging speed = "+str(tagspeed[iteration])+"\n" + sys.stdout.flush() + pylog.write("Tagging speed = "+str(tagspeed[iteration])+"\n") + log.write( "Tagging speed = "+str(tagspeed[iteration])+"\n") + err.write( "Tagging speed = "+str(tagspeed[iteration])+"\n") + + if line[0:4]=="real": + test_time.append( float(line[5:]) ) + print "time of execution = "+str(test_time[iteration])+" seconds\n" + sys.stdout.flush() + pylog.write("time of execution = "+str(test_time[iteration])+" seconds\n") + log.write( "time of execution = "+str(test_time[iteration])+" seconds\n") + err.write( "time of execution = "+str(test_time[iteration])+" seconds\n") + # print line + + string_to_write="" + string_to_write=string_to_write+program + string_to_write=string_to_write+";"+language + string_to_write=string_to_write+";"+features + string_to_write=string_to_write+";"+sequence_model_type + string_to_write=string_to_write+";"+train_algorithm + string_to_write=string_to_write+";"+train_regularization_constant + string_to_write=string_to_write+";"+train_epochs + string_to_write=string_to_write+";"+form_cutoff + string_to_write=string_to_write+";"+prefix_length + string_to_write=string_to_write+";"+suffix_length + + #string_to_write=string_to_write+";"+str(train_time) #Commented for Windows execution; turn on in Linux environment + + for i in range(number_of_runs): + #string_to_write=string_to_write+";"+str(test_time[i]) #Commented for Windows 
execution; turn on in Linux environment + string_to_write=string_to_write+";"+str(correct_predictions[i]) + string_to_write=string_to_write+";"+str(accuracy[i]) + string_to_write=string_to_write+";"+str(tagspeed[i]) + string_to_write=string_to_write+"\n" + + csv.write(string_to_write) - csv.flush() - log.flush() - pylog.flush() - err.flush() + csv.flush() + log.flush() + pylog.flush() + err.flush() #CLOSING csv.close() diff --git a/src/classifier/Alphabet.h b/src/classifier/Alphabet.h index 5592213..32d727a 100644 --- a/src/classifier/Alphabet.h +++ b/src/classifier/Alphabet.h @@ -20,11 +20,7 @@ #define ALPHABET_H_ #include "Utils.h" -#ifdef _WIN32 #include -#else -#include -#endif #include #ifdef _WIN32 #include @@ -35,7 +31,7 @@ using namespace std; // This class implements a dictionary of strings, stored as an hash table // for fast lookup. It allows looking up a string for its ID, and also obtain // the string with a given ID. -class Alphabet : public std::tr1::unordered_map { +class Alphabet : public std::unordered_map { public: Alphabet(); virtual ~Alphabet(); @@ -48,7 +44,7 @@ class Alphabet : public std::tr1::unordered_map { void clear() { num_entries_ = 0; names_.clear(); - std::tr1::unordered_map ::clear(); + std::unordered_map ::clear(); } // Return the dictionary size. diff --git a/src/classifier/Features.h b/src/classifier/Features.h index 811c732..363b67f 100644 --- a/src/classifier/Features.h +++ b/src/classifier/Features.h @@ -27,10 +27,6 @@ #include #include -#ifndef USE_WEIGHT_CACHING -#define USE_WEIGHT_CACHING 0 //1 -#endif - // A vector of binary features. Each feature is represented as a 64-bit key. 
typedef vector BinaryFeatures; diff --git a/src/classifier/Options.cpp b/src/classifier/Options.cpp index a6d3cbe..19ed83a 100644 --- a/src/classifier/Options.cpp +++ b/src/classifier/Options.cpp @@ -53,6 +53,8 @@ DEFINE_double(train_regularization_constant, 1e12, "Regularization parameter C."); DEFINE_int32(parameters_max_num_buckets, 50000000, "Maximum number of buckets in the hash table that stores the parameters."); +DEFINE_int32(save_model_period, 1000000, + "Number of iteration after which a temporaty model is saved."); void Options::Initialize() { file_train_ = FLAGS_file_train; @@ -73,4 +75,5 @@ void Options::Initialize() { train_learning_rate_schedule_ = FLAGS_train_learning_rate_schedule; only_supported_features_ = FLAGS_only_supported_features; use_averaging_ = FLAGS_use_averaging; + save_model_period_ = FLAGS_save_model_period; } diff --git a/src/classifier/Options.h b/src/classifier/Options.h index 2540bfe..835764f 100644 --- a/src/classifier/Options.h +++ b/src/classifier/Options.h @@ -1,126 +1,138 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef OPTIONS_H_ -#define OPTIONS_H_ - -#include -#include -#include - -using namespace std; - -DECLARE_bool(train); -DECLARE_bool(test); -DECLARE_bool(evaluate); - -DECLARE_string(train_algorithm); -DECLARE_bool(only_supported_features); -DECLARE_bool(use_averaging); -DECLARE_int32(train_epochs); -DECLARE_double(train_regularization_constant); -DECLARE_double(train_initial_learning_rate); -DECLARE_string(train_learning_rate_schedule); - -DECLARE_int32(parameters_max_num_buckets); - -//1 to use new developments regarding performance optimizations -#ifndef USE_N_OPTIMIZATIONS -#define USE_N_OPTIMIZATIONS 0 //1 -#endif - -// General training/test options. -class Options { -public: - Options() {}; - virtual ~Options() {}; - - // Serialization functions. - // Load current option flags to the model file. - // Note: this will override the user-specified flags. - virtual void Save(FILE* fs) {}; - // Save current option flags to the model file. - virtual void Load(FILE* fs) {}; - - // Initialization: set options based on the flags. - virtual void Initialize(); - - // Get option values. - const std::string &GetTrainingFilePath() { return file_train_; }; - const std::string &GetTestFilePath() { return file_test_; }; - const std::string &GetModelFilePath() { return file_model_; }; - const std::string &GetOutputFilePath() { return file_prediction_; }; - int GetNumEpochs() { return train_epochs_; }; - double GetRegularizationConstant() { return train_regularization_constant_; } - const std::string &GetTrainingAlgorithm() { return train_algorithm_; } - double GetInitialLearningRate() { return train_initial_learning_rate_; } - const std::string &GetLearningRateSchedule() { - return train_learning_rate_schedule_; - } - bool use_averaging() { return use_averaging_; } - bool only_supported_features() { return only_supported_features_; } - bool train() { return train_; } - bool test() { return test_; } - bool evaluate() { return evaluate_; } - - // Set option values. 
- void SetTrainingFilePath(const std::string &file_train) { - file_train_ = file_train; - } - void SetTestFilePath(const std::string &file_test) { - file_test_ = file_test; - } - void SetModelFilePath(const std::string &file_model) { - file_model_ = file_model; - } - void SetOutputFilePath(const std::string &file_prediction) { - file_prediction_ = file_prediction; - } - -protected: - std::string file_train_; - std::string file_test_; - std::string file_model_; - std::string file_prediction_; - bool train_; - bool test_; - bool evaluate_; - int train_epochs_; - - // The regularization constant (C). The training optimization problem is: - // min 1/(2C)*||w||^2 + sum_t(loss(w; x_t,y_t)). - double train_regularization_constant_; - - // The algorithm used to train the model. Alternatives are: - // -- perceptron - // -- mira - // -- svm_mira - // -- crf_mira - // -- svm_sgd - // -- crf_sgd - std::string train_algorithm_; - - // Learning rate and its decay schedule (for SGD only). - double train_initial_learning_rate_; - std::string train_learning_rate_schedule_; - - bool only_supported_features_; // Use only supported features. - bool use_averaging_; // Include a final averaging step during training. -}; - -#endif /*OPTIONS_H_*/ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef OPTIONS_H_ +#define OPTIONS_H_ + +#include +#include +#include + +using namespace std; + +DECLARE_bool(train); +DECLARE_bool(test); +DECLARE_bool(evaluate); + +DECLARE_string(train_algorithm); +DECLARE_bool(only_supported_features); +DECLARE_bool(use_averaging); +DECLARE_int32(train_epochs); +DECLARE_double(train_regularization_constant); +DECLARE_double(train_initial_learning_rate); +DECLARE_string(train_learning_rate_schedule); + +DECLARE_int32(parameters_max_num_buckets); + +DECLARE_int32(save_model_period); + +//1 to use new developments regarding performance optimizations +#ifndef USE_N_OPTIMIZATIONS +#define USE_N_OPTIMIZATIONS 0 //1 +#endif + +class Pipe; + +// General training/test options. +class Options { +public: + Options() {}; + virtual ~Options() {}; + + // Serialization functions. + // Save current option flags to the model file. + virtual void Save(FILE* fs) {}; + // Load current option flags from the model file. + // Note: this will override the user-specified flags. + virtual void Load(FILE* fs) {}; + + // Initialization: set options based on the flags. + virtual void Initialize(); + + // Get option values. 
+ const std::string &GetTrainingFilePath() { return file_train_; }; + const std::string &GetTestFilePath() { return file_test_; }; + const std::string &GetModelFilePath() { return file_model_; }; + const std::string &GetOutputFilePath() { return file_prediction_; }; + int GetNumEpochs() { return train_epochs_; }; + double GetRegularizationConstant() { return train_regularization_constant_; } + const std::string &GetTrainingAlgorithm() { return train_algorithm_; } + double GetInitialLearningRate() { return train_initial_learning_rate_; } + const std::string &GetLearningRateSchedule() { + return train_learning_rate_schedule_; + } + bool use_averaging() { return use_averaging_; } + bool only_supported_features() { return only_supported_features_; } + bool train() { return train_; } + bool test() { return test_; } + bool evaluate() { return evaluate_; } + int save_model_period() { return save_model_period_; } + + // Set option values. + void SetTrainingFilePath(const std::string &file_train) { + file_train_ = file_train; + } + void SetTestFilePath(const std::string &file_test) { + file_test_ = file_test; + } + void SetModelFilePath(const std::string &file_model) { + file_model_ = file_model; + } + void SetOutputFilePath(const std::string &file_prediction) { + file_prediction_ = file_prediction; + } + + // Set a pointer to the pipe that owns this options handler. + void SetPipe(Pipe *pipe) { + pipe_ = pipe; + } + +protected: + Pipe *pipe_ = nullptr; // The pipe that owns this options handler. + std::string file_train_; + std::string file_test_; + std::string file_model_; + std::string file_prediction_; + bool train_; + bool test_; + bool evaluate_; + int train_epochs_; + + // The regularization constant (C). The training optimization problem is: + // min 1/(2C)*||w||^2 + sum_t(loss(w; x_t,y_t)). + double train_regularization_constant_; + + // The algorithm used to train the model. 
Alternatives are: + // -- perceptron + // -- mira + // -- svm_mira + // -- crf_mira + // -- svm_sgd + // -- crf_sgd + std::string train_algorithm_; + + // Learning rate and its decay schedule (for SGD only). + double train_initial_learning_rate_; + std::string train_learning_rate_schedule_; + + bool only_supported_features_; // Use only supported features. + bool use_averaging_; // Include a final averaging step during training. + int save_model_period_; // Number of iterations (epochs) after which a temporary model is saved. +}; + +#endif /*OPTIONS_H_*/ diff --git a/src/classifier/Parameters.h b/src/classifier/Parameters.h index d526e35..0702a2a 100644 --- a/src/classifier/Parameters.h +++ b/src/classifier/Parameters.h @@ -24,77 +24,6 @@ #include "SparseLabeledParameterVector.h" #include "Utils.h" -#if USE_WEIGHT_CACHING == 1 -// Structure to define a feature-label pair. -struct FeatureLabelPair { - uint64_t feature; - int label; -}; - -// Structure to define a hash function and a comparison function for FeatureLabelPair. -struct FeatureLabelPairMapper { - template - inline void HashCombine(TSeed value, TSeed *seed) const { - *seed ^= value + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); - } - // Hash function. - inline size_t operator()(const FeatureLabelPair& p) const { - size_t hash = std::hash()(p.feature); - size_t hash_2 = std::hash()(p.label); - - HashCombine(hash_2, &hash); - return hash; - } - // Comparison function. - inline bool operator()(const FeatureLabelPair &p, const FeatureLabelPair &q) const { - return p.feature == q.feature && p.label == q.label; - } -}; - -// Defines a hash-table of FeatureLabelPair keys with values, of double type. -typedef std::unordered_map FeatureLabelPairHashMap; - -// Hash-table for caching FeatureLabelPair keys with corresponding values. 
-class FeatureLabelCache { -public: - FeatureLabelCache() { - hits_ = 0; - misses_ = 0; - }; - virtual ~FeatureLabelCache() {}; - - int hits() const { return hits_; }; - int misses() const { return misses_; }; - int GetSize() const { return cache_.size(); }; - - void IncrementHits() { hits_ += 1; }; - void IncrementMisses() { misses_ += 1; }; - - // Insert a new pair {key, value} in the hash-table. - void Insert(FeatureLabelPair key, double value) { - cache_.insert({ key, value }); - }; - - // Searches for a given key in the hash-table. - // If found, value is returned in argument 'value'. - // return: true if found, false otherwise. - bool Find(FeatureLabelPair key, double * value) { - FeatureLabelPairHashMap::const_iterator caching_iterator; - caching_iterator = cache_.find(key); - if (caching_iterator != cache_.end()) { - *value = caching_iterator->second; - return true; - }; - return false; - }; - -protected: - FeatureLabelPairHashMap cache_; - uint64_t hits_; - uint64_t misses_; -}; -#endif - // This class implements a feature vector, which is convenient to sum over // binary features, weight them, etc. It just uses the classes // SparseParameterVector and SparseLabeledParameterVector, which allow fast @@ -148,6 +77,29 @@ class Parameters { if (use_average_) averaged_labeled_weights_.Initialize(); } + // Overwrite + void Overwrite(Parameters *output_parameters){ + output_parameters->use_average_=use_average_; // could be removed + + weights_.Overwrite(&output_parameters->weights_); + averaged_weights_.Overwrite(&output_parameters->averaged_weights_); + + labeled_weights_.Overwrite(&output_parameters->labeled_weights_); + averaged_labeled_weights_.Overwrite(&output_parameters->averaged_labeled_weights_); + } + + // Copy Parameters. 
+ void Copy(Parameters *output_parameters){ + output_parameters->Initialize(use_average_); + output_parameters->use_average_=use_average_; // could be removed + + weights_.Copy(&output_parameters->weights_); + averaged_weights_.Copy(&output_parameters->averaged_weights_); + + labeled_weights_.Copy(&output_parameters->labeled_weights_); + averaged_labeled_weights_.Copy(&output_parameters->averaged_labeled_weights_); + } + // Lock/unlock the parameter vector. A locked vector means that no features // can be added. void StopGrowth() { @@ -221,53 +173,6 @@ class Parameters { } } -#if USE_WEIGHT_CACHING == 1 - // Compute the scores corresponding to a set of features, conjoined with - // output labels. The vector scores, provided as output, contains the score - // for each label, with the added functionality - // of using a cache for already computed scores. - void ComputeLabelScoresWithCache(const BinaryFeatures &features, - const vector &labels, - vector *scores) { - FeatureLabelPair caching_key; - double caching_value; - - vector reduced_labels; - vector adjust_new_index_reduced_labels; - - scores->clear(); - scores->resize(labels.size(), 0.0); - vector label_scores(labels.size(), 0.0); - for (int j = 0; j < features.size(); ++j) { - if (!ExistsLabeled(features[j])) continue; - reduced_labels.clear(); - adjust_new_index_reduced_labels.clear(); - - for (int k = 0; k < labels.size(); ++k) { - caching_key = { features[j], labels[k] }; - if (!caching_weights_.Find(caching_key, &caching_value)) { - // Add such label to reduced labels. 
- reduced_labels.push_back(labels[k]); - adjust_new_index_reduced_labels.push_back(k); - caching_weights_.IncrementMisses(); - } else { - (*scores)[k] += caching_value; - caching_weights_.IncrementHits(); - } - } - if (reduced_labels.size() == 0) continue; - if (!Get(features[j], reduced_labels, &label_scores)) continue; - for (int k = 0; k < reduced_labels.size(); ++k) { - (*scores)[adjust_new_index_reduced_labels[k]] += label_scores[k]; - - caching_key = { features[j], reduced_labels[k] }; - caching_value = label_scores[k]; - caching_weights_.Insert(caching_key, caching_value); - } - } - } -#endif - // Scale the parameter vector by scale_factor. void Scale(double scale_factor) { weights_.Scale(scale_factor); @@ -332,12 +237,6 @@ class Parameters { } } -#if USE_WEIGHT_CACHING == 1 - int GetCachingWeightsHits() const { return caching_weights_.hits(); }; - int GetCachingWeightsMisses() const { return caching_weights_.misses(); }; - int GetCachingWeightsSize() const { return caching_weights_.GetSize(); }; -#endif - protected: // Average the parameters as in averaged perceptron. bool use_average_; @@ -349,13 +248,6 @@ class Parameters { // Weights and averaged weights for the "labeled" features. SparseLabeledParameterVector labeled_weights_; SparseLabeledParameterVector averaged_labeled_weights_; - -public: -#if USE_WEIGHT_CACHING == 1 - // Caches the weights for feature-label pairs : - // FeatureLabelPair = struct {feature; label} . 
- FeatureLabelCache caching_weights_; -#endif }; #endif /*PARAMETERS_H_*/ diff --git a/src/classifier/Pipe.cpp b/src/classifier/Pipe.cpp index 2a71b7b..847471a 100644 --- a/src/classifier/Pipe.cpp +++ b/src/classifier/Pipe.cpp @@ -24,6 +24,7 @@ Pipe::Pipe(Options* options) { options_ = options; + options->SetPipe(this); dictionary_ = NULL; reader_ = NULL; writer_ = NULL; @@ -150,14 +151,25 @@ void Pipe::Train() { for (int i = 0; i < options_->GetNumEpochs(); ++i) { TrainEpoch(i); + if (options_->save_model_period() == 1 || (i && i%(options_->save_model_period()) == 0)){ + Parameters *parameters_aux = new Parameters; + parameters_->Copy(parameters_aux); + parameters_->Finalize((i+1) * (int)instances_.size()); + SaveModelByName(options_->GetModelFilePath() + ".temp." + std::to_string(i)); + //parameters_aux->Copy(parameters_); + //parameters_aux->Overwrite(parameters_); + Parameters *aux = parameters_; + parameters_ = parameters_aux; + delete aux; + } } - parameters_->Finalize(options_->GetNumEpochs() * instances_.size()); + parameters_->Finalize(options_->GetNumEpochs() * (int)instances_.size()); } void Pipe::CreateInstances() { - timeval start, end; - gettimeofday(&start, NULL); + chronowrap::Chronometer chrono; + chrono.GetTime(); LOG(INFO) << "Creating instances..."; @@ -172,8 +184,8 @@ void Pipe::CreateInstances() { LOG(INFO) << "Number of instances: " << instances_.size(); - gettimeofday(&end, NULL); - LOG(INFO) << "Time: " << diff_ms(end, start); + chrono.StopTime(); + LOG(INFO) << "Time: " << chrono.GetElapsedTime() << " sec."; } void Pipe::MakeSupportedParameters() { @@ -215,13 +227,13 @@ void Pipe::TrainEpoch(int epoch) { double total_cost = 0.0; double total_loss = 0.0; double eta; - int num_instances = instances_.size(); + int num_instances = (int)instances_.size(); double lambda = 1.0 / (options_->GetRegularizationConstant() * (static_cast(num_instances))); - timeval start, end; - gettimeofday(&start, NULL); - int time_decoding = 0; - int time_scores = 
0; + chronowrap::Chronometer chrono; + chrono.GetTime(); + double time_decoding = 0; + double time_scores = 0; int num_mistakes = 0; if (epoch == 0) { @@ -246,11 +258,11 @@ void Pipe::TrainEpoch(int epoch) { RemoveUnsupportedFeatures(instance, parts, features); } - timeval start_scores, end_scores; - gettimeofday(&start_scores, NULL); + chronowrap::Chronometer chrono_scores; + chrono_scores.GetTime(); ComputeScores(instance, parts, features, &scores); - gettimeofday(&end_scores, NULL); - time_scores += diff_ms(end_scores, start_scores); + chrono_scores.StopTime(); + time_scores += chrono_scores.GetElapsedTime(); // This is a no-op by default. But it's convenient to have it here to build // latent-variable structured classifiers (e.g. for coreference resolution). @@ -259,11 +271,11 @@ void Pipe::TrainEpoch(int epoch) { if (options_->GetTrainingAlgorithm() == "perceptron" || options_->GetTrainingAlgorithm() == "mira") { - timeval start_decoding, end_decoding; - gettimeofday(&start_decoding, NULL); + chronowrap::Chronometer chrono_decoding; + chrono_decoding.GetTime(); decoder_->Decode(instance, parts, scores, &predicted_outputs); - gettimeofday(&end_decoding, NULL); - time_decoding += diff_ms(end_decoding, start_decoding); + chrono_decoding.StopTime(); + time_decoding += chrono_decoding.GetElapsedTime(); if (options_->GetTrainingAlgorithm() == "perceptron") { for (int r = 0; r < parts->size(); ++r) { @@ -285,8 +297,8 @@ void Pipe::TrainEpoch(int epoch) { options_->GetTrainingAlgorithm() == "crf_sgd" || options_->GetTrainingAlgorithm() == "crf_margin_sgd") { double loss; - timeval start_decoding, end_decoding; - gettimeofday(&start_decoding, NULL); + chronowrap::Chronometer chrono_decoding; + chrono_decoding.GetTime(); if (options_->GetTrainingAlgorithm() == "svm_mira" || options_->GetTrainingAlgorithm() == "svm_sgd") { // Do cost-augmented inference. 
@@ -310,8 +322,8 @@ void Pipe::TrainEpoch(int epoch) { &predicted_outputs, &entropy, &loss); CHECK_GE(entropy, 0.0); } - gettimeofday(&end_decoding, NULL); - time_decoding += diff_ms(end_decoding, start_decoding); + chrono_decoding.StopTime(); + time_decoding += chrono_decoding.GetElapsedTime(); loss -= inner_loss; if (loss < 0.0) { @@ -379,10 +391,10 @@ void Pipe::TrainEpoch(int epoch) { delete parts; delete features; - gettimeofday(&end, NULL); - LOG(INFO) << "Time: " << diff_ms(end, start); - LOG(INFO) << "Time to score: " << time_scores; - LOG(INFO) << "Time to decode: " << time_decoding; + chrono.StopTime(); + LOG(INFO) << "Time: " << chrono.GetElapsedTime() << " sec."; + LOG(INFO) << "Time to score: " << time_scores << " sec."; + LOG(INFO) << "Time to decode: " << time_decoding << " sec."; LOG(INFO) << "Number of Features: " << parameters_->Size(); if (options_->GetTrainingAlgorithm() == "perceptron" || options_->GetTrainingAlgorithm() == "mira") { @@ -403,8 +415,8 @@ void Pipe::Run() { vector gold_outputs; vector predicted_outputs; - timeval start, end; - gettimeofday(&start, NULL); + chronowrap::Chronometer chrono; + chrono.GetTime(); if (options_->evaluate()) BeginEvaluation(); @@ -430,6 +442,7 @@ void Pipe::Run() { } writer_->Write(output_instance); + writer_->WriteFormatted(this, formatted_instance); if (formatted_instance != instance) delete formatted_instance; delete output_instance; @@ -445,15 +458,9 @@ void Pipe::Run() { writer_->Close(); reader_->Close(); - gettimeofday(&end, NULL); + chrono.StopTime(); LOG(INFO) << "Number of instances: " << num_instances; - LOG(INFO) << "Time: " << diff_ms(end, start); - -#if USE_WEIGHT_CACHING == 1 - LOG(INFO) << "Cache size: " << parameters_->GetCachingWeightsSize() << "\t" - << "Cache hits: " << parameters_->GetCachingWeightsHits() << "\t" - << "Cache misses: " << parameters_->GetCachingWeightsMisses() << endl; -#endif + LOG(INFO) << "Time: " << chrono.GetElapsedTime() << " sec."; if (options_->evaluate()) 
EndEvaluation(); } @@ -469,6 +476,15 @@ void Pipe::ClassifyInstance(Instance *instance) { // Create parts for this instance. MakeParts(formatted_instance, parts, &gold_outputs); + + if (parts->empty()) { + if (formatted_instance != instance) + delete formatted_instance; + delete parts; + delete features; + return; + } + // Create features for the parts of this instance. MakeFeatures(formatted_instance, parts, features); // Compute scores based on the features and parts of this instance. @@ -486,8 +502,9 @@ void Pipe::ClassifyInstance(Instance *instance) { predicted_outputs); } - if (formatted_instance != instance) delete formatted_instance; - + if (formatted_instance != instance) + delete formatted_instance; delete parts; delete features; + return; } diff --git a/src/classifier/Pipe.h b/src/classifier/Pipe.h index 53a3bb2..3549a5a 100644 --- a/src/classifier/Pipe.h +++ b/src/classifier/Pipe.h @@ -1,268 +1,276 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef PIPE_H_ -#define PIPE_H_ - -#include "Dictionary.h" -#include "Features.h" -#include "Part.h" -#include "Reader.h" -#include "Writer.h" -#include "Options.h" -#include "Decoder.h" -#include "Parameters.h" -#include "AlgUtils.h" - -// Abstract class for the structured classifier mainframe. 
-// It requires parts, features, a dictionary, a reader and writer, and -// instances, all of which are abstract classes. -// Task-specific classifiers should derive from this class and implement the -// pure virtual methods. -class Pipe { -public: - // Constructor/destructor. - Pipe() {}; - Pipe(Options* options); - virtual ~Pipe(); - - // Save/load the model to/from a file. - void SaveModelFile() { SaveModelByName(options_->GetModelFilePath()); } - void LoadModelFile() { LoadModelByName(options_->GetModelFilePath()); } - - // Initialize. Override this method for task-specific initialization. - virtual void Initialize(); - - // Get options. - Options* GetOptions() { return options_; }; - - // Get/Set parameters. - Parameters *GetParameters() { return parameters_; } - void SetParameters(Parameters *parameters) { parameters_ = parameters; } - - // Train the classifier. - void Train(); - - // Run a previously trained classifier on new data. - void Run(); - - // Run a previously trained classifier on a single instance. - void ClassifyInstance(Instance *instance); - -protected: - // Create basic objects. - virtual void CreateDictionary() = 0; - virtual void CreateReader() = 0; - virtual void CreateWriter() = 0; - virtual void CreateDecoder() = 0; - virtual Parts *CreateParts() = 0; - virtual Features *CreateFeatures() = 0; - - // Save/load model. - void SaveModelByName(const std::string &model_name); - void LoadModelByName(const std::string &model_name); - virtual void SaveModel(FILE* fs); - virtual void LoadModel(FILE* fs); - - // Create/add/delete instances. - void DeleteInstances() { - for (int i = 0; i < instances_.size(); ++i) { - delete instances_[i]; - } - instances_.clear(); - } - - void AddInstance(Instance *instance) { - Instance *formatted_instance = GetFormattedInstance(instance); - instances_.push_back(formatted_instance); - if (instance != formatted_instance) delete instance; - } - - // Obtain a "formatted" instance. 
Override this function for task-specific - // formatted instances, which may be different from instance since they - // may have extra information, data in numeric format for faster - // processing, etc. - virtual Instance *GetFormattedInstance(Instance *instance) { - return instance; - } - - // Create a vector of instances by reading the training data. - void CreateInstances(); - - // Construct the vector of parts for a particular instance. - // Eventually, obtain the binary vector of gold outputs (one entry per part) - // if this information is available. - // Note: this function is task-specific and needs to be implemented by the - // deriving class. - virtual void MakeParts(Instance *instance, Parts *parts, - vector *gold_outputs) = 0; - - // Construct the vector of features for a particular instance and given the - // parts. The vector will be of the same size as the vector of parts. - void MakeFeatures(Instance *instance, Parts *parts, Features *features) { - vector selected_parts(parts->size(), true); - MakeSelectedFeatures(instance, parts, selected_parts, features); - } - - // Construct the vector of features for a particular instance and given a - // selected set of parts (parts which are selected as marked as true). The - // vector will be of the same size as the vector of parts. - // Note: this function is task-specific and needs to be implemented by the - // deriving class. - virtual void MakeSelectedFeatures(Instance *instance, Parts *parts, - const vector &selected_parts, - Features *features) = 0; - - // Given an instance, parts, and features, compute the scores. This will - // look at the current parameters. Each part will receive a score, so the - // vector of scores will be of the same size as the vector of parts. - // NOTE: Override this method for task-specific score computation (e.g. - // to handle labeled features, etc.). - // TODO: handle labeled features here instead of having to override. 
- virtual void ComputeScores(Instance *instance, Parts *parts, - Features *features, - vector *scores); - - // Perform a gradient step with stepsize eta. The iteration number is - // provided as input since it may be necessary to keep track of the averaged - // weights. The gold output and the predicted output are also provided. - // The meaning of "predicted_output" depends on the training algorithm. - // In perceptron, it is the most likely output predicted by the model. - // In cost-augmented MIRA and structured SVMs, it is the cost-augmented - // prediction. - // In CRFs, it is the vector of posterior marginals for the parts. - // TODO: use "FeatureVector *difference" as input (see function - // MakeFeatureDifference(...) instead of computing on the fly). - virtual void MakeGradientStep(Parts *parts, Features *features, double eta, - int iteration, - const vector &gold_output, - const vector &predicted_output); - - // Compute the difference between the predicted feature vector and the gold - // one. - // The meaning of "predicted_output" depends on the training algorithm. - // In perceptron, it is the most likely output predicted by the model. - // In cost-augmented MIRA and structured SVMs, it is the cost-augmented - // prediction. - // In CRFs, it is the vector of posterior marginals for the parts. - virtual void MakeFeatureDifference(Parts *parts, - Features *features, - const vector &gold_output, - const vector &predicted_output, - FeatureVector *difference); - - // Given an instance, a vector of parts, and features for those parts, - // remove all the features which are not supported, i.e., that were not - // previously created in the parameter vector. This is used for training - // with supported features (flag --only_supported_features). 
- void RemoveUnsupportedFeatures(Instance *instance, Parts *parts, - Features *features) { - vector selected_parts(parts->size(), true); - RemoveUnsupportedFeatures(instance, parts, selected_parts, features); - } - - // Given an instance, a vector of selected parts, and features for those - // parts, remove all the features which are not supported. See description - // above. - virtual void RemoveUnsupportedFeatures(Instance *instance, Parts *parts, - const vector &selected_parts, - Features *features); - - // Given a vector of parts, and features for those parts, "touch" all the - // parameters corresponding to those features. This will be a no-op for the - // parameters that exist already, and will create a parameter with a zero - // weight otherwise. This is used in a preprocessing stage for training - // with supported features (flag --only_supported_features). - virtual void TouchParameters(Parts *parts, Features *features, - const vector &selected_parts); - - // This is a no-op by default. But it's convenient to have it here to build - // latent-variable structured classifiers (e.g. for coreference resolution). - virtual void TransformGold(Instance *instance, - Parts *parts, - const std::vector &scores, - std::vector *gold_output, - double *loss_inner) { - *loss_inner = 0.0; - } - - // Given a vector of parts of a desired output, builds the output information - // in the instance that corresponds to that output. - // Note: this function is task-specific and needs to be implemented by the - // deriving class. - virtual void LabelInstance(Parts *parts, const vector &output, - Instance *instance) = 0; - - // Preprocess an instance before training begins. Override this function for - // task-specific instance preprocessing. - virtual void PreprocessInstance(Instance* instance) {}; - - // Preprocess the data before training begins. Override this function for - // task-specific instance preprocessing. 
- virtual void PreprocessData() {}; - - // Build and lock a parameter vector with only supported parameters, by - // looking at the gold outputs in the training data. This is a preprocessing - // stage for training with supported features (flag - // --only_supported_features). - void MakeSupportedParameters(); - - // Run one epoch of training. - void TrainEpoch(int epoch); - - // Start all the evaluation counters for evaluating the classifier, - // evaluate each instance, and plot evaluation information at the end. - // This is done at test time when the flag --evaluate is activated. - // The version implemented here plots accuracy based on Hamming distance - // for the predicted and gold parts. Override this function for - // task-specific evaluation. - virtual void BeginEvaluation() { num_mistakes_ = 0; num_total_parts_ = 0; } - virtual void EvaluateInstance(Instance *instance, - Instance *output_instance, - Parts *parts, - const vector &gold_outputs, - const vector &predicted_outputs) { - for (int r = 0; r < parts->size(); ++r) { - if (!NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { - ++num_mistakes_; - } - ++num_total_parts_; - } - } - virtual void EndEvaluation() { - LOG(INFO) << "Accuracy (parts): " << - static_cast(num_total_parts_ - num_mistakes_) / - static_cast(num_total_parts_); - } - -protected: - Options *options_; // Classifier options. - Dictionary *dictionary_; // Dictionary for the classifier. - Reader *reader_; // Reader for reading instances from a file. - Writer *writer_; // Writer for writing instance to a file. - Decoder *decoder_; // Decoder for this classification task. - Parameters *parameters_; // Parameter vector. - vector instances_; // Set of instances. - - // Number of mistakes and number of total parts at test time (used for - // evaluation purposes). - int num_mistakes_; - int num_total_parts_; -}; - -#endif /* PIPE_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. 
+// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef PIPE_H_ +#define PIPE_H_ + +#include "Dictionary.h" +#include "Features.h" +#include "Part.h" +#include "Reader.h" +#include "Writer.h" +#include "Options.h" +#include "Decoder.h" +#include "Parameters.h" +#include "AlgUtils.h" + +// Abstract class for the structured classifier mainframe. +// It requires parts, features, a dictionary, a reader and writer, and +// instances, all of which are abstract classes. +// Task-specific classifiers should derive from this class and implement the +// pure virtual methods. +class Pipe { +public: + // Constructor/destructor. + Pipe() {}; + Pipe(Options* options); + virtual ~Pipe(); + + // Save/load the model to/from a file. + void SaveModelFile() { SaveModelByName(options_->GetModelFilePath()); } + void LoadModelFile() { LoadModelByName(options_->GetModelFilePath()); } + + // Initialize. Override this method for task-specific initialization. + virtual void Initialize(); + + // Get options. + Options* GetOptions() { return options_; }; + + // Get/Set parameters. + Parameters *GetParameters() { return parameters_; } + void SetParameters(Parameters *parameters) { parameters_ = parameters; } + + // Train the classifier. + void Train(); + + // Run a previously trained classifier on new data. 
+ void Run(); + + // Run a previously trained classifier on a single instance. + void ClassifyInstance(Instance *instance); + + // Get model version. + uint64_t GetModelVersion() { + LOG(INFO) << "model version: " << model_version_; + return model_version_; + } +protected: + // Create basic objects. + virtual void CreateDictionary() = 0; + virtual void CreateReader() = 0; + virtual void CreateWriter() = 0; + virtual void CreateDecoder() = 0; + virtual Parts *CreateParts() = 0; + virtual Features *CreateFeatures() = 0; + + // Save/load model. + void SaveModelByName(const std::string &model_name); + void LoadModelByName(const std::string &model_name); + virtual void SaveModel(FILE* fs); + virtual void LoadModel(FILE* fs); + + // Create/add/delete instances. + void DeleteInstances() { + for (int i = 0; i < instances_.size(); ++i) { + delete instances_[i]; + } + instances_.clear(); + } + + void AddInstance(Instance *instance) { + Instance *formatted_instance = GetFormattedInstance(instance); + instances_.push_back(formatted_instance); + if (instance != formatted_instance) delete instance; + } + + // Obtain a "formatted" instance. Override this function for task-specific + // formatted instances, which may be different from instance since they + // may have extra information, data in numeric format for faster + // processing, etc. + virtual Instance *GetFormattedInstance(Instance *instance) { + return instance; + } + + // Create a vector of instances by reading the training data. + void CreateInstances(); + + // Construct the vector of parts for a particular instance. + // Eventually, obtain the binary vector of gold outputs (one entry per part) + // if this information is available. + // Note: this function is task-specific and needs to be implemented by the + // deriving class. + virtual void MakeParts(Instance *instance, Parts *parts, + vector *gold_outputs) = 0; + + // Construct the vector of features for a particular instance and given the + // parts. 
The vector will be of the same size as the vector of parts. + void MakeFeatures(Instance *instance, Parts *parts, Features *features) { + vector selected_parts(parts->size(), true); + MakeSelectedFeatures(instance, parts, selected_parts, features); + } + + // Construct the vector of features for a particular instance and given a + // selected set of parts (parts which are selected as marked as true). The + // vector will be of the same size as the vector of parts. + // Note: this function is task-specific and needs to be implemented by the + // deriving class. + virtual void MakeSelectedFeatures(Instance *instance, Parts *parts, + const vector &selected_parts, + Features *features) = 0; + + // Given an instance, parts, and features, compute the scores. This will + // look at the current parameters. Each part will receive a score, so the + // vector of scores will be of the same size as the vector of parts. + // NOTE: Override this method for task-specific score computation (e.g. + // to handle labeled features, etc.). + // TODO: handle labeled features here instead of having to override. + virtual void ComputeScores(Instance *instance, Parts *parts, + Features *features, + vector *scores); + + // Perform a gradient step with stepsize eta. The iteration number is + // provided as input since it may be necessary to keep track of the averaged + // weights. The gold output and the predicted output are also provided. + // The meaning of "predicted_output" depends on the training algorithm. + // In perceptron, it is the most likely output predicted by the model. + // In cost-augmented MIRA and structured SVMs, it is the cost-augmented + // prediction. + // In CRFs, it is the vector of posterior marginals for the parts. + // TODO: use "FeatureVector *difference" as input (see function + // MakeFeatureDifference(...) instead of computing on the fly). 
+ virtual void MakeGradientStep(Parts *parts, Features *features, double eta, + int iteration, + const vector &gold_output, + const vector &predicted_output); + + // Compute the difference between the predicted feature vector and the gold + // one. + // The meaning of "predicted_output" depends on the training algorithm. + // In perceptron, it is the most likely output predicted by the model. + // In cost-augmented MIRA and structured SVMs, it is the cost-augmented + // prediction. + // In CRFs, it is the vector of posterior marginals for the parts. + virtual void MakeFeatureDifference(Parts *parts, + Features *features, + const vector &gold_output, + const vector &predicted_output, + FeatureVector *difference); + + // Given an instance, a vector of parts, and features for those parts, + // remove all the features which are not supported, i.e., that were not + // previously created in the parameter vector. This is used for training + // with supported features (flag --only_supported_features). + void RemoveUnsupportedFeatures(Instance *instance, Parts *parts, + Features *features) { + vector selected_parts(parts->size(), true); + RemoveUnsupportedFeatures(instance, parts, selected_parts, features); + } + + // Given an instance, a vector of selected parts, and features for those + // parts, remove all the features which are not supported. See description + // above. + virtual void RemoveUnsupportedFeatures(Instance *instance, Parts *parts, + const vector &selected_parts, + Features *features); + + // Given a vector of parts, and features for those parts, "touch" all the + // parameters corresponding to those features. This will be a no-op for the + // parameters that exist already, and will create a parameter with a zero + // weight otherwise. This is used in a preprocessing stage for training + // with supported features (flag --only_supported_features). 
+ virtual void TouchParameters(Parts *parts, Features *features, + const vector &selected_parts); + + // This is a no-op by default. But it's convenient to have it here to build + // latent-variable structured classifiers (e.g. for coreference resolution). + virtual void TransformGold(Instance *instance, + Parts *parts, + const std::vector &scores, + std::vector *gold_output, + double *loss_inner) { + *loss_inner = 0.0; + } + + // Given a vector of parts of a desired output, builds the output information + // in the instance that corresponds to that output. + // Note: this function is task-specific and needs to be implemented by the + // deriving class. + virtual void LabelInstance(Parts *parts, const vector &output, + Instance *instance) = 0; + + // Preprocess an instance before training begins. Override this function for + // task-specific instance preprocessing. + virtual void PreprocessInstance(Instance* instance) {}; + + // Preprocess the data before training begins. Override this function for + // task-specific instance preprocessing. + virtual void PreprocessData() {}; + + // Build and lock a parameter vector with only supported parameters, by + // looking at the gold outputs in the training data. This is a preprocessing + // stage for training with supported features (flag + // --only_supported_features). + void MakeSupportedParameters(); + + // Run one epoch of training. + void TrainEpoch(int epoch); + + // Start all the evaluation counters for evaluating the classifier, + // evaluate each instance, and plot evaluation information at the end. + // This is done at test time when the flag --evaluate is activated. + // The version implemented here plots accuracy based on Hamming distance + // for the predicted and gold parts. Override this function for + // task-specific evaluation. 
+ virtual void BeginEvaluation() { num_mistakes_ = 0; num_total_parts_ = 0; } + virtual void EvaluateInstance(Instance *instance, + Instance *output_instance, + Parts *parts, + const vector &gold_outputs, + const vector &predicted_outputs) { + for (int r = 0; r < parts->size(); ++r) { + if (!NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { + ++num_mistakes_; + } + ++num_total_parts_; + } + } + virtual void EndEvaluation() { + LOG(INFO) << "Accuracy (parts): " << + static_cast(num_total_parts_ - num_mistakes_) / + static_cast(num_total_parts_); + } + +protected: + Options *options_; // Classifier options. + Dictionary *dictionary_; // Dictionary for the classifier. + Reader *reader_; // Reader for reading instances from a file. + Writer *writer_; // Writer for writing instance to a file. + Decoder *decoder_; // Decoder for this classification task. + Parameters *parameters_; // Parameter vector. + vector instances_; // Set of instances. + + // Number of mistakes and number of total parts at test time (used for + // evaluation purposes). 
+ int num_mistakes_; + int num_total_parts_; + //Model check + uint64_t model_check_; + uint64_t model_version_; +}; + +#endif /* PIPE_H_ */ diff --git a/src/classifier/SparseLabeledParameterVector.h b/src/classifier/SparseLabeledParameterVector.h index c1fc15f..dc8bcc1 100644 --- a/src/classifier/SparseLabeledParameterVector.h +++ b/src/classifier/SparseLabeledParameterVector.h @@ -76,8 +76,10 @@ class SparseLabelWeights : public LabelWeights { SparseLabelWeights() {}; virtual ~SparseLabelWeights() {}; + SparseLabelWeights(const SparseLabelWeights & other) = default; + bool IsSparse() const { return true; } - int Size() const { return label_weights_.size(); } + int Size() const { return (int) label_weights_.size(); } double GetWeight(int label) const { for (int k = 0; k < label_weights_.size(); ++k) { @@ -169,7 +171,7 @@ class DenseLabelWeights : public LabelWeights { virtual ~DenseLabelWeights() {}; bool IsSparse() const { return false; } - int Size() const { return weights_.size(); } + int Size() const { return (int) weights_.size(); } double GetWeight(int label) const { if (label >= weights_.size()) return 0.0; @@ -268,6 +270,42 @@ class SparseLabeledParameterVector { values_.clear(); } + // Overwrite + void Overwrite(SparseLabeledParameterVector *output_parameters) { + output_parameters->scale_factor_ = scale_factor_; + output_parameters->squared_norm_ = squared_norm_; + output_parameters->growth_stopped_ = growth_stopped_; + + for (const auto & element : values_) { + if (element.second->IsSparse()){ + *(static_cast(output_parameters->values_[element.first])) = *(static_cast(element.second)); + }else{ + *(static_cast(output_parameters->values_[element.first])) = *(static_cast(element.second)); + } + + } + } + + // Copy + void Copy(SparseLabeledParameterVector *output_parameters){ + output_parameters->scale_factor_=scale_factor_; + output_parameters->squared_norm_=squared_norm_; + output_parameters->growth_stopped_=growth_stopped_; + + for (const auto & element 
: values_){ + if (element.second->IsSparse()){ + SparseLabelWeights *label_weights = new SparseLabelWeights; + *label_weights = *(static_cast(element.second)); + output_parameters->values_.insert(pair(element.first, label_weights)); + }else{ + DenseLabelWeights *label_weights = new DenseLabelWeights; + *label_weights = *(static_cast(element.second)); + output_parameters->values_.insert(pair(element.first, label_weights)); + } + } + } + + // Save/load the parameters to/from a file. void Save(FILE *fs) const { bool success; @@ -352,7 +390,7 @@ class SparseLabeledParameterVector { // Get the number of instantiated features. // This is the number of parameters up to different labels. - int Size() const { return values_.size(); } + int Size() const { return (int) values_.size(); } // True if this feature key is already instantiated. bool Exists(uint64_t key) const { diff --git a/src/classifier/SparseParameterVector.h b/src/classifier/SparseParameterVector.h index 415ccfc..2ece34c 100644 --- a/src/classifier/SparseParameterVector.h +++ b/src/classifier/SparseParameterVector.h @@ -96,6 +96,28 @@ class SparseParameterVector { void AllowGrowth() { growth_stopped_ = false; } bool growth_stopped() const { return growth_stopped_; } + // Overwrite + void Overwrite(SparseParameterVector *output_parameters) { + output_parameters->scale_factor_ = scale_factor_; + output_parameters->squared_norm_ = squared_norm_; + output_parameters->growth_stopped_ = growth_stopped_; + + for (const auto & element : values_) { + output_parameters->values_[element.first] = element.second; + } + } + + // Copy + void Copy(SparseParameterVector *output_parameters){ + output_parameters->scale_factor_=scale_factor_; + output_parameters->squared_norm_=squared_norm_; + output_parameters->growth_stopped_=growth_stopped_; + + for (const auto & element : values_){ + output_parameters->values_.insert(pair(element.first, element.second)); + } + } + // Save/load the parameters to/from a file. 
void Save(FILE *fs) const { bool success; @@ -138,7 +160,7 @@ class SparseParameterVector { } // Get the number of instantiated features. - int Size() const { return values_.size(); } + int Size() const { return (int) values_.size(); } // True if this feature key is already instantiated. bool Exists(uint64_t key) const { diff --git a/src/classifier/Writer.cpp b/src/classifier/Writer.cpp index 3f4302e..cda92d9 100644 --- a/src/classifier/Writer.cpp +++ b/src/classifier/Writer.cpp @@ -28,10 +28,17 @@ using namespace std; void Writer::Open(const string &filepath) { os_.open(filepath.c_str(), ifstream::out); CHECK(os_.good()) << "Could not open " << filepath << "."; + + os_formatted_.open(std::string(filepath+".formatted").c_str(), ifstream::out); + CHECK(os_formatted_.good()) << "Could not open " << std::string(filepath + ".formatted") << "."; } void Writer::Close() { os_.flush(); os_.clear(); os_.close(); + + os_formatted_.flush(); + os_formatted_.clear(); + os_formatted_.close(); } diff --git a/src/classifier/Writer.h b/src/classifier/Writer.h index b0e61ff..6d432f4 100644 --- a/src/classifier/Writer.h +++ b/src/classifier/Writer.h @@ -23,6 +23,8 @@ #include using namespace std; +class Pipe; + // Abstract class for the writer. Task-specific parts should derive // from this class and implement the pure virtual methods. // The writer writes instances to a file. 
@@ -35,9 +37,11 @@ class Writer { virtual void Open(const string &filepath); virtual void Close(); virtual void Write(Instance *instance) = 0; + virtual void WriteFormatted(Pipe * pipe, Instance *instance) = 0; protected: ofstream os_; + ofstream os_formatted_; }; #endif /* SHWRITER_H_ */ diff --git a/src/constituency_labeler/ConstituencyDictionary.cpp b/src/constituency_labeler/ConstituencyDictionary.cpp index 84dc9b9..d9ada91 100644 --- a/src/constituency_labeler/ConstituencyDictionary.cpp +++ b/src/constituency_labeler/ConstituencyDictionary.cpp @@ -1,181 +1,181 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "ConstituencyDictionary.h" -#include "ConstituencyLabelerPipe.h" -#include "ConstituencyInstance.h" -#include - -// Special symbols. -const string kConstituencyTokenUnknown = "_UNKNOWN_"; // Unknown word/lemma. -const string kConstituencyTokenStart = "_START_"; // Start symbol. -const string kConstituencyTokenStop = "_STOP_"; // Stop symbol. - -// Maximum alphabet sizes. 
-const unsigned int kConstituencyMaxLemmaAlphabetSize = 0xffff; -const unsigned int kConstituencyMaxMorphAlphabetSize = 0xfff; //0xffff; - -DEFINE_int32(constituency_lemma_cutoff, 0, - "Ignore word lemmas whose frequency is less than this."); -DEFINE_int32(constituency_morph_cutoff, 0, - "Ignore morphological features whose frequency is less than this."); - -void ConstituencyDictionary::CreateConstituentDictionary( - ConstituencyReader *reader) { - // Create tag dictionary. - CreateTagDictionary(reader); - - // Create constituent dictionary. - LOG(INFO) << "Creating constituent and rule dictionary..."; - std::vector label_freqs(constituent_alphabet_.size(), -1); - - int lemma_cutoff = FLAGS_constituency_lemma_cutoff; - int morph_cutoff = FLAGS_constituency_morph_cutoff; - std::vector lemma_freqs; - std::vector morph_freqs; - Alphabet lemma_alphabet; - Alphabet morph_alphabet; - - string special_symbols[NUM_SPECIAL_TOKENS]; - special_symbols[TOKEN_UNKNOWN] = kConstituencyTokenUnknown; - special_symbols[TOKEN_START] = kConstituencyTokenStart; - special_symbols[TOKEN_STOP] = kConstituencyTokenStop; - - for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { - lemma_alphabet.Insert(special_symbols[i]); - morph_alphabet.Insert(special_symbols[i]); - - // Counts of special symbols are set to -1: - lemma_freqs.push_back(-1); - morph_freqs.push_back(-1); - } - - // Go through the corpus and build the label dictionary, - // counting the frequencies. - reader->Open(pipe_->GetOptions()->GetTrainingFilePath()); - ConstituencyInstance *instance = - static_cast(reader->GetNext()); - while (instance != NULL) { - // Word-level elements. - int instance_length = instance->size(); - for (int i = 0; i < instance_length; ++i) { - int id; - // Add lemma to alphabet. - id = lemma_alphabet.Insert(instance->GetLemma(i)); - if (id >= lemma_freqs.size()) { - CHECK_EQ(id, lemma_freqs.size()); - lemma_freqs.push_back(0); - } - ++lemma_freqs[id]; - - // Add FEATS to alphabet. 
- for (int j = 0; j < instance->GetNumMorphFeatures(i); ++j) { - id = morph_alphabet.Insert(instance->GetMorphFeature(i, j)); - if (id >= morph_freqs.size()) { - CHECK_EQ(id, morph_freqs.size()); - morph_freqs.push_back(0); - } - ++morph_freqs[id]; - } - } - - // Tree-level elements. - const ParseTree &tree = instance->GetParseTree(); - const std::vector &non_terminals = tree.non_terminals(); - int num_nodes = non_terminals.size(); - for (int i = 0; i < num_nodes; ++i) { - ParseTreeNode *node = non_terminals[i]; - const std::string &label = node->label(); - - // Add constituent to alphabet. - int id = constituent_alphabet_.Insert(label); - if (id >= label_freqs.size()) { - CHECK_EQ(id, label_freqs.size()); - label_freqs.push_back(0); - } - ++label_freqs[id]; - - // Add rule to alphabet. - if (!node->IsPreTerminal()) { - std::string rule = label + ":"; - for (int j = 0; j < node->GetNumChildren(); ++j) { - rule += " " + node->GetChild(j)->label(); - } - int rule_id = rule_alphabet_.Insert(rule); - } - } - delete instance; - instance = static_cast(reader->GetNext()); - } - reader->Close(); - constituent_alphabet_.StopGrowth(); - - // Now adjust the cutoffs if necessary. 
- while (true) { - lemma_alphabet_.clear(); - for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { - lemma_alphabet_.Insert(special_symbols[i]); - } - for (Alphabet::iterator iter = lemma_alphabet.begin(); - iter != lemma_alphabet.end(); - ++iter) { - if (lemma_freqs[iter->second] > lemma_cutoff) { - lemma_alphabet_.Insert(iter->first); - } - } - if (lemma_alphabet_.size() < kConstituencyMaxLemmaAlphabetSize) break; - ++lemma_cutoff; - LOG(INFO) << "Incrementing lemma cutoff to " << lemma_cutoff << "..."; - } - - while (true) { - morph_alphabet_.clear(); - for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { - morph_alphabet_.Insert(special_symbols[i]); - } - for (Alphabet::iterator iter = morph_alphabet.begin(); - iter != morph_alphabet.end(); - ++iter) { - if (morph_freqs[iter->second] > morph_cutoff) { - morph_alphabet_.Insert(iter->first); - } - } - if (morph_alphabet_.size() < kConstituencyMaxMorphAlphabetSize) break; - ++morph_cutoff; - LOG(INFO) << "Incrementing FEATS cutoff to " << morph_cutoff << "..."; - } - - lemma_alphabet_.StopGrowth(); - morph_alphabet_.StopGrowth(); - - CHECK_LT(lemma_alphabet_.size(), 0xffff); - CHECK_LT(morph_alphabet_.size(), 0xffff); - - LOG(INFO) << "Number of lemmas: " << lemma_alphabet_.size(); - LOG(INFO) << "Number of feats: " << morph_alphabet_.size(); - LOG(INFO) << "Number of constituent tags: " << constituent_alphabet_.size(); - LOG(INFO) << "Number of rules: " << rule_alphabet_.size(); - LOG(INFO) << "Constituent tags and their frequencies:"; - for (Alphabet::iterator it = constituent_alphabet_.begin(); - it != constituent_alphabet_.end(); - ++it) { - std::string label = it->first; - int label_id = it->second; - LOG(INFO) << label << "\t" << label_freqs[label_id]; - } -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. 
+// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "ConstituencyDictionary.h" +#include "ConstituencyLabelerPipe.h" +#include "ConstituencyInstance.h" +#include + +// Special symbols. +const string kConstituencyTokenUnknown = "_UNKNOWN_"; // Unknown word/lemma. +const string kConstituencyTokenStart = "_START_"; // Start symbol. +const string kConstituencyTokenStop = "_STOP_"; // Stop symbol. + +// Maximum alphabet sizes. +const unsigned int kConstituencyMaxLemmaAlphabetSize = 0xffff; +const unsigned int kConstituencyMaxMorphAlphabetSize = 0xfff; //0xffff; + +DEFINE_int32(constituency_lemma_cutoff, 0, + "Ignore word lemmas whose frequency is less than this."); +DEFINE_int32(constituency_morph_cutoff, 0, + "Ignore morphological features whose frequency is less than this."); + +void ConstituencyDictionary::CreateConstituentDictionary( + ConstituencyReader *reader) { + // Create tag dictionary. + CreateTagDictionary(reader); + + // Create constituent dictionary. 
+ LOG(INFO) << "Creating constituent and rule dictionary..."; + std::vector label_freqs(constituent_alphabet_.size(), -1); + + int lemma_cutoff = FLAGS_constituency_lemma_cutoff; + int morph_cutoff = FLAGS_constituency_morph_cutoff; + std::vector lemma_freqs; + std::vector morph_freqs; + Alphabet lemma_alphabet; + Alphabet morph_alphabet; + + string special_symbols[NUM_SPECIAL_TOKENS]; + special_symbols[TOKEN_UNKNOWN] = kConstituencyTokenUnknown; + special_symbols[TOKEN_START] = kConstituencyTokenStart; + special_symbols[TOKEN_STOP] = kConstituencyTokenStop; + + for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { + lemma_alphabet.Insert(special_symbols[i]); + morph_alphabet.Insert(special_symbols[i]); + + // Counts of special symbols are set to -1: + lemma_freqs.push_back(-1); + morph_freqs.push_back(-1); + } + + // Go through the corpus and build the label dictionary, + // counting the frequencies. + reader->Open(pipe_->GetOptions()->GetTrainingFilePath()); + ConstituencyInstance *instance = + static_cast(reader->GetNext()); + while (instance != NULL) { + // Word-level elements. + int instance_length = instance->size(); + for (int i = 0; i < instance_length; ++i) { + int id; + // Add lemma to alphabet. + id = lemma_alphabet.Insert(instance->GetLemma(i)); + if (id >= lemma_freqs.size()) { + CHECK_EQ(id, lemma_freqs.size()); + lemma_freqs.push_back(0); + } + ++lemma_freqs[id]; + + // Add FEATS to alphabet. + for (int j = 0; j < instance->GetNumMorphFeatures(i); ++j) { + id = morph_alphabet.Insert(instance->GetMorphFeature(i, j)); + if (id >= morph_freqs.size()) { + CHECK_EQ(id, morph_freqs.size()); + morph_freqs.push_back(0); + } + ++morph_freqs[id]; + } + } + + // Tree-level elements. 
+ const ParseTree &tree = instance->GetParseTree(); + const std::vector &non_terminals = tree.non_terminals(); + int num_nodes = non_terminals.size(); + for (int i = 0; i < num_nodes; ++i) { + ParseTreeNode *node = non_terminals[i]; + const std::string &label = node->label(); + + // Add constituent to alphabet. + int id = constituent_alphabet_.Insert(label); + if (id >= label_freqs.size()) { + CHECK_EQ(id, label_freqs.size()); + label_freqs.push_back(0); + } + ++label_freqs[id]; + + // Add rule to alphabet. + if (!node->IsPreTerminal()) { + std::string rule = label + ":"; + for (int j = 0; j < node->GetNumChildren(); ++j) { + rule += " " + node->GetChild(j)->label(); + } + int rule_id = rule_alphabet_.Insert(rule); + } + } + delete instance; + instance = static_cast(reader->GetNext()); + } + reader->Close(); + constituent_alphabet_.StopGrowth(); + + // Now adjust the cutoffs if necessary. + while (true) { + lemma_alphabet_.clear(); + for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { + lemma_alphabet_.Insert(special_symbols[i]); + } + for (Alphabet::iterator iter = lemma_alphabet.begin(); + iter != lemma_alphabet.end(); + ++iter) { + if (lemma_freqs[iter->second] > lemma_cutoff) { + lemma_alphabet_.Insert(iter->first); + } + } + if (lemma_alphabet_.size() < kConstituencyMaxLemmaAlphabetSize) break; + ++lemma_cutoff; + LOG(INFO) << "Incrementing lemma cutoff to " << lemma_cutoff << "..."; + } + + while (true) { + morph_alphabet_.clear(); + for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { + morph_alphabet_.Insert(special_symbols[i]); + } + for (Alphabet::iterator iter = morph_alphabet.begin(); + iter != morph_alphabet.end(); + ++iter) { + if (morph_freqs[iter->second] > morph_cutoff) { + morph_alphabet_.Insert(iter->first); + } + } + if (morph_alphabet_.size() < kConstituencyMaxMorphAlphabetSize) break; + ++morph_cutoff; + LOG(INFO) << "Incrementing FEATS cutoff to " << morph_cutoff << "..."; + } + + lemma_alphabet_.StopGrowth(); + morph_alphabet_.StopGrowth(); + + 
CHECK_LT(lemma_alphabet_.size(), 0xffff); + CHECK_LT(morph_alphabet_.size(), 0xffff); + + LOG(INFO) << "Number of lemmas: " << lemma_alphabet_.size(); + LOG(INFO) << "Number of feats: " << morph_alphabet_.size(); + LOG(INFO) << "Number of constituent tags: " << constituent_alphabet_.size(); + LOG(INFO) << "Number of rules: " << rule_alphabet_.size(); + LOG(INFO) << "Constituent tags and their frequencies:"; + for (Alphabet::iterator it = constituent_alphabet_.begin(); + it != constituent_alphabet_.end(); + ++it) { + std::string label = it->first; + int label_id = it->second; + LOG(INFO) << label << "\t" << label_freqs[label_id]; + } +} diff --git a/src/constituency_labeler/ConstituencyLabelerPipe.h b/src/constituency_labeler/ConstituencyLabelerPipe.h index c15fa91..7d5ed0c 100644 --- a/src/constituency_labeler/ConstituencyLabelerPipe.h +++ b/src/constituency_labeler/ConstituencyLabelerPipe.h @@ -1,249 +1,248 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef CONSTITUENCYLABELERPIPE_H_ -#define CONSTITUENCYLABELERPIPE_H_ - -#include "Pipe.h" -#include "ConstituencyLabelerOptions.h" -#include "ConstituencyLabelerReader.h" -#include "ConstituencyLabelerDictionary.h" -#include "TokenDictionary.h" -#include "ConstituencyLabelerInstanceNumeric.h" -#include "ConstituencyLabelerWriter.h" -#include "ConstituencyLabelerPart.h" -#include "ConstituencyLabelerFeatures.h" -#include "ConstituencyLabelerDecoder.h" - -class ConstituencyLabelerPipe : public Pipe { -public: - ConstituencyLabelerPipe(Options* options) : Pipe(options) { - token_dictionary_ = NULL; - } - virtual ~ConstituencyLabelerPipe() { delete token_dictionary_; } - - ConstituencyLabelerReader *GetConstituencyReader() { - return static_cast(reader_); - }; - ConstituencyLabelerDictionary *GetConstituencyLabelerDictionary() { - return static_cast(dictionary_); - }; - ConstituencyLabelerOptions *GetConstituencyLabelerOptions() { - return static_cast(options_); - }; - -protected: - void CreateDictionary() { - dictionary_ = new ConstituencyLabelerDictionary(this); - GetConstituencyLabelerDictionary()->SetTokenDictionary(token_dictionary_); - } - void CreateReader() { reader_ = new ConstituencyLabelerReader; } - void CreateWriter() { writer_ = new ConstituencyLabelerWriter; } - void CreateDecoder() { decoder_ = new ConstituencyLabelerDecoder(this); }; - Parts *CreateParts() { return new ConstituencyLabelerParts; }; - Features *CreateFeatures() { return new ConstituencyLabelerFeatures(this); }; - - void CreateTokenDictionary() { - token_dictionary_ = new TokenDictionary(this); - }; - - void PreprocessData(); - - Instance *GetFormattedInstance(Instance *instance) { - ConstituencyLabelerInstanceNumeric *instance_numeric = - new ConstituencyLabelerInstanceNumeric; - instance_numeric->Initialize( - *GetConstituencyLabelerDictionary(), - static_cast(instance)); - return instance_numeric; - } - -protected: - void SaveModel(FILE* fs); - void LoadModel(FILE* fs); - - // 
Return the allowed labels for the i-th node. An empty vector means that all - // tags are allowed. - void GetAllowedLabels(Instance *instance, int i, - std::vector *allowed_labels) { - // Make constituent-label dictionary pruning. - allowed_labels->clear(); - bool prune_labels = GetConstituencyLabelerOptions()->prune_labels(); - if (!prune_labels) return; - - ConstituencyLabelerInstanceNumeric *sentence = - static_cast(instance); - ConstituencyLabelerDictionary *labeler_dictionary = - GetConstituencyLabelerDictionary(); - - int constituent_id = sentence->GetConstituentId(i); - *allowed_labels = - labeler_dictionary->GetConstituentLabels(constituent_id); - } - - void MakeParts(Instance *instance, Parts *parts, - std::vector *gold_outputs); - - void MakeNodeParts(Instance *instance, - Parts *parts, - std::vector *gold_outputs); - - void MakeSelectedFeatures(Instance *instance, Parts *parts, - const std::vector &selected_parts, - Features *features); - - void ComputeScores(Instance *instance, Parts *parts, Features *features, - std::vector *scores); - - void MakeFeatureDifference(Parts *parts, - Features *features, - const std::vector &gold_output, - const std::vector &predicted_output, - FeatureVector *difference); - - void MakeGradientStep(Parts *parts, - Features *features, - double eta, - int iteration, - const std::vector &gold_output, - const std::vector &predicted_output); - - void LabelInstance(Parts *parts, const std::vector &output, - Instance *instance); - - void BeginEvaluation() { - num_tokens_ = 0; - num_constituents_ = 0; - num_matched_labels_ = 0; - num_predicted_labels_ = 0; - num_gold_labels_ = 0; - num_pruned_gold_labels_ = 0; - num_possible_labels_ = 0; - gettimeofday(&start_clock_, NULL); - } - void EvaluateInstance(Instance *instance, - Instance *output_instance, - Parts *parts, - const std::vector &gold_outputs, - const std::vector &predicted_outputs) { - ConstituencyLabelerInstance *labeler_instance = - static_cast(instance); - 
ConstituencyLabelerParts *labeler_parts = - static_cast(parts); - ConstituencyLabelerOptions *labeler_options = - static_cast(options_); - ConstituencyLabelerDictionary *labeler_dictionary = - static_cast(dictionary_); - const std::string &null_label = labeler_options->null_label(); - - int num_possible_labels = 0; - int num_gold_labels = 0; - int num_actual_gold_labels = 0; - for (int i = 0; i < labeler_instance->GetNumConstituents(); ++i) { - if (labeler_instance->GetConstituentLabel(i) != null_label) { - ++num_actual_gold_labels; - } - } - - for (int i = 0; i < labeler_instance->GetNumConstituents(); ++i) { - const vector& nodes = labeler_parts->FindNodeParts(i); - for (int k = 0; k < nodes.size(); ++k) { - int r = nodes[k]; - int label = - static_cast((*parts)[r])->label(); - - // Ignore if this is the null label. - if (label == labeler_dictionary->null_label()) continue; - - ++num_possible_labels; - if (gold_outputs[r] >= 0.5) { - CHECK_EQ(gold_outputs[r], 1.0); - if (NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { - ++num_matched_labels_; - } - ++num_gold_labels; - } - if (predicted_outputs[r] >= 0.5) { - CHECK_EQ(predicted_outputs[r], 1.0); - ++num_predicted_labels_; - } - } - ++num_constituents_; - } - num_tokens_ += labeler_instance->size(); - num_gold_labels_ += num_actual_gold_labels; - int missed_labels = num_actual_gold_labels - num_gold_labels; - - num_pruned_gold_labels_ += missed_labels; - num_possible_labels_ += num_possible_labels; - } - void EndEvaluation() { - double precision = - static_cast(num_matched_labels_) / - static_cast(num_predicted_labels_); - double recall = - static_cast(num_matched_labels_) / - static_cast(num_gold_labels_); - double F1 = 2.0 * precision * recall / (precision + recall); - double pruning_recall = - static_cast(num_gold_labels_ - - num_pruned_gold_labels_) / - static_cast(num_gold_labels_); - double pruning_efficiency = - static_cast(num_possible_labels_) / - static_cast(num_constituents_); - - 
LOG(INFO) << "Precision: " << precision - << " (" << num_matched_labels_ << "/" - << num_predicted_labels_ << ")"; - LOG(INFO) << "Recall: " << recall - << " (" << num_matched_labels_ << "/" - << num_gold_labels_ << ")"; - LOG(INFO) << "F1: " << F1; - LOG(INFO) << "Pruning recall: " << pruning_recall - << " (" - << num_gold_labels_ - num_pruned_gold_labels_ - << "/" - << num_gold_labels_ << ")"; - LOG(INFO) << "Pruning efficiency: " << pruning_efficiency - << " possible labels per node" - << " (" << num_possible_labels_ << "/" - << num_constituents_ << ")"; - - timeval end_clock; - gettimeofday(&end_clock, NULL); - double num_seconds = - static_cast(diff_ms(end_clock, start_clock_)) / 1000.0; - double tokens_per_second = static_cast(num_tokens_) / num_seconds; - LOG(INFO) << "Speed: " - << tokens_per_second << " tokens per second."; - } - -protected: - TokenDictionary *token_dictionary_; - int num_tokens_; - int num_constituents_; - int num_matched_labels_; - int num_predicted_labels_; - int num_gold_labels_; - int num_pruned_gold_labels_; +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef CONSTITUENCYLABELERPIPE_H_ +#define CONSTITUENCYLABELERPIPE_H_ + +#include "Pipe.h" +#include "TimeUtils.h" +#include "ConstituencyLabelerOptions.h" +#include "ConstituencyLabelerReader.h" +#include "ConstituencyLabelerDictionary.h" +#include "TokenDictionary.h" +#include "ConstituencyLabelerInstanceNumeric.h" +#include "ConstituencyLabelerWriter.h" +#include "ConstituencyLabelerPart.h" +#include "ConstituencyLabelerFeatures.h" +#include "ConstituencyLabelerDecoder.h" + +class ConstituencyLabelerPipe : public Pipe { +public: + ConstituencyLabelerPipe(Options* options) : Pipe(options) { + token_dictionary_ = NULL; + } + virtual ~ConstituencyLabelerPipe() { delete token_dictionary_; } + + ConstituencyLabelerReader *GetConstituencyReader() { + return static_cast(reader_); + }; + ConstituencyLabelerDictionary *GetConstituencyLabelerDictionary() { + return static_cast(dictionary_); + }; + ConstituencyLabelerOptions *GetConstituencyLabelerOptions() { + return static_cast(options_); + }; + +protected: + void CreateDictionary() { + dictionary_ = new ConstituencyLabelerDictionary(this); + GetConstituencyLabelerDictionary()->SetTokenDictionary(token_dictionary_); + } + void CreateReader() { reader_ = new ConstituencyLabelerReader; } + void CreateWriter() { writer_ = new ConstituencyLabelerWriter; } + void CreateDecoder() { decoder_ = new ConstituencyLabelerDecoder(this); }; + Parts *CreateParts() { return new ConstituencyLabelerParts; }; + Features *CreateFeatures() { return new ConstituencyLabelerFeatures(this); }; + + void CreateTokenDictionary() { + token_dictionary_ = new TokenDictionary(this); + }; + + void PreprocessData(); + + Instance *GetFormattedInstance(Instance *instance) { + ConstituencyLabelerInstanceNumeric *instance_numeric = + new ConstituencyLabelerInstanceNumeric; + instance_numeric->Initialize( + *GetConstituencyLabelerDictionary(), + static_cast(instance)); + return instance_numeric; + } + +protected: + void SaveModel(FILE* fs); + void 
LoadModel(FILE* fs); + + // Return the allowed labels for the i-th node. An empty vector means that all + // tags are allowed. + void GetAllowedLabels(Instance *instance, int i, + std::vector *allowed_labels) { + // Make constituent-label dictionary pruning. + allowed_labels->clear(); + bool prune_labels = GetConstituencyLabelerOptions()->prune_labels(); + if (!prune_labels) return; + + ConstituencyLabelerInstanceNumeric *sentence = + static_cast(instance); + ConstituencyLabelerDictionary *labeler_dictionary = + GetConstituencyLabelerDictionary(); + + int constituent_id = sentence->GetConstituentId(i); + *allowed_labels = + labeler_dictionary->GetConstituentLabels(constituent_id); + } + + void MakeParts(Instance *instance, Parts *parts, + std::vector *gold_outputs); + + void MakeNodeParts(Instance *instance, + Parts *parts, + std::vector *gold_outputs); + + void MakeSelectedFeatures(Instance *instance, Parts *parts, + const std::vector &selected_parts, + Features *features); + + void ComputeScores(Instance *instance, Parts *parts, Features *features, + std::vector *scores); + + void MakeFeatureDifference(Parts *parts, + Features *features, + const std::vector &gold_output, + const std::vector &predicted_output, + FeatureVector *difference); + + void MakeGradientStep(Parts *parts, + Features *features, + double eta, + int iteration, + const std::vector &gold_output, + const std::vector &predicted_output); + + void LabelInstance(Parts *parts, const std::vector &output, + Instance *instance); + + void BeginEvaluation() { + num_tokens_ = 0; + num_constituents_ = 0; + num_matched_labels_ = 0; + num_predicted_labels_ = 0; + num_gold_labels_ = 0; + num_pruned_gold_labels_ = 0; + num_possible_labels_ = 0; + chrono.GetTime(); + } + void EvaluateInstance(Instance *instance, + Instance *output_instance, + Parts *parts, + const std::vector &gold_outputs, + const std::vector &predicted_outputs) { + ConstituencyLabelerInstance *labeler_instance = + static_cast(instance); + 
ConstituencyLabelerParts *labeler_parts = + static_cast(parts); + ConstituencyLabelerOptions *labeler_options = + static_cast(options_); + ConstituencyLabelerDictionary *labeler_dictionary = + static_cast(dictionary_); + const std::string &null_label = labeler_options->null_label(); + + int num_possible_labels = 0; + int num_gold_labels = 0; + int num_actual_gold_labels = 0; + for (int i = 0; i < labeler_instance->GetNumConstituents(); ++i) { + if (labeler_instance->GetConstituentLabel(i) != null_label) { + ++num_actual_gold_labels; + } + } + + for (int i = 0; i < labeler_instance->GetNumConstituents(); ++i) { + const vector& nodes = labeler_parts->FindNodeParts(i); + for (int k = 0; k < nodes.size(); ++k) { + int r = nodes[k]; + int label = + static_cast((*parts)[r])->label(); + + // Ignore if this is the null label. + if (label == labeler_dictionary->null_label()) continue; + + ++num_possible_labels; + if (gold_outputs[r] >= 0.5) { + CHECK_EQ(gold_outputs[r], 1.0); + if (NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { + ++num_matched_labels_; + } + ++num_gold_labels; + } + if (predicted_outputs[r] >= 0.5) { + CHECK_EQ(predicted_outputs[r], 1.0); + ++num_predicted_labels_; + } + } + ++num_constituents_; + } + num_tokens_ += labeler_instance->size(); + num_gold_labels_ += num_actual_gold_labels; + int missed_labels = num_actual_gold_labels - num_gold_labels; + + num_pruned_gold_labels_ += missed_labels; + num_possible_labels_ += num_possible_labels; + } + void EndEvaluation() { + double precision = + static_cast(num_matched_labels_) / + static_cast(num_predicted_labels_); + double recall = + static_cast(num_matched_labels_) / + static_cast(num_gold_labels_); + double F1 = 2.0 * precision * recall / (precision + recall); + double pruning_recall = + static_cast(num_gold_labels_ - + num_pruned_gold_labels_) / + static_cast(num_gold_labels_); + double pruning_efficiency = + static_cast(num_possible_labels_) / + static_cast(num_constituents_); + + 
LOG(INFO) << "Precision: " << precision + << " (" << num_matched_labels_ << "/" + << num_predicted_labels_ << ")"; + LOG(INFO) << "Recall: " << recall + << " (" << num_matched_labels_ << "/" + << num_gold_labels_ << ")"; + LOG(INFO) << "F1: " << F1; + LOG(INFO) << "Pruning recall: " << pruning_recall + << " (" + << num_gold_labels_ - num_pruned_gold_labels_ + << "/" + << num_gold_labels_ << ")"; + LOG(INFO) << "Pruning efficiency: " << pruning_efficiency + << " possible labels per node" + << " (" << num_possible_labels_ << "/" + << num_constituents_ << ")"; + + chrono.StopTime(); + double num_seconds = chrono.GetElapsedTime(); + double tokens_per_second = static_cast(num_tokens_) / num_seconds; + LOG(INFO) << "Speed: " + << tokens_per_second << " tokens per second."; + } + +protected: + TokenDictionary *token_dictionary_; + int num_tokens_; + int num_constituents_; + int num_matched_labels_; + int num_predicted_labels_; + int num_gold_labels_; + int num_pruned_gold_labels_; int num_possible_labels_; - timeval start_clock_; -}; - -#endif /* CONSTITUENCYLABELERPIPE_H_ */ + chronowrap::Chronometer chrono; +}; + +#endif /* CONSTITUENCYLABELERPIPE_H_ */ diff --git a/src/constituency_labeler/ConstituencyLabelerWriter.cpp b/src/constituency_labeler/ConstituencyLabelerWriter.cpp index 6cc3750..e8afb3e 100644 --- a/src/constituency_labeler/ConstituencyLabelerWriter.cpp +++ b/src/constituency_labeler/ConstituencyLabelerWriter.cpp @@ -38,3 +38,5 @@ void ConstituencyLabelerWriter::Write(Instance *instance) { parse_tree->SaveToString(&info); os_ << info << endl; } + +void ConstituencyLabelerWriter::WriteFormatted(Pipe * pipe, Instance *instance) {} diff --git a/src/constituency_labeler/ConstituencyLabelerWriter.h b/src/constituency_labeler/ConstituencyLabelerWriter.h index 940e069..b517f0b 100644 --- a/src/constituency_labeler/ConstituencyLabelerWriter.h +++ b/src/constituency_labeler/ConstituencyLabelerWriter.h @@ -28,6 +28,8 @@ class ConstituencyLabelerWriter : public 
ConstituencyWriter { public: void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); + }; #endif /* CONSTITUENCYLABELERWRITER_H_ */ diff --git a/src/constituency_labeler/ConstituencyWriter.cpp b/src/constituency_labeler/ConstituencyWriter.cpp index b7cd04d..7e937da 100644 --- a/src/constituency_labeler/ConstituencyWriter.cpp +++ b/src/constituency_labeler/ConstituencyWriter.cpp @@ -28,3 +28,5 @@ void ConstituencyWriter::Write(Instance *instance) { constituency_instance->GetParseTree().SaveToString(&info); os_ << info << endl; } + +void ConstituencyWriter::WriteFormatted(Pipe * pipe, Instance *instance) {} \ No newline at end of file diff --git a/src/constituency_labeler/ConstituencyWriter.h b/src/constituency_labeler/ConstituencyWriter.h index ddbc6e9..43065df 100644 --- a/src/constituency_labeler/ConstituencyWriter.h +++ b/src/constituency_labeler/ConstituencyWriter.h @@ -28,6 +28,7 @@ class ConstituencyWriter : public Writer { public: void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); }; #endif /* CONSTITUENCYWRITER_H_ */ diff --git a/src/constituency_labeler/TurboConstituencyLabeler.cpp b/src/constituency_labeler/TurboConstituencyLabeler.cpp index c046070..28fc028 100644 --- a/src/constituency_labeler/TurboConstituencyLabeler.cpp +++ b/src/constituency_labeler/TurboConstituencyLabeler.cpp @@ -1,101 +1,99 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "ConstituencyLabelerPipe.h" - -void TrainConstituencyLabeler(); -void TestConstituencyLabeler(); - -int main(int argc, char** argv) { - // Initialize Google's logging library. - google::InitGoogleLogging(argv[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_train) { - LOG(INFO) << "Training constituency labeler..." << endl; - TrainConstituencyLabeler(); - } else if (FLAGS_test) { - LOG(INFO) << "Running constituency labeler..." << endl; - TestConstituencyLabeler(); - } - - // Destroy allocated memory regarding line flags. - google::ShutDownCommandLineFlags(); - google::ShutdownGoogleLogging(); - return 0; -} - -void TrainConstituencyLabeler() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - ConstituencyLabelerOptions *options = new ConstituencyLabelerOptions; - options->Initialize(); - - ConstituencyLabelerPipe *pipe = new ConstituencyLabelerPipe(options); - pipe->Initialize(); - - LOG(INFO) << "Training the constituency labeler..."; - pipe->Train(); - pipe->SaveModelFile(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Training took " << static_cast(time) / 1000.0 - << " sec." 
<< endl; -} - -void TestConstituencyLabeler() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - ConstituencyLabelerOptions *options = new ConstituencyLabelerOptions; - options->Initialize(); - - ConstituencyLabelerPipe *pipe = new ConstituencyLabelerPipe(options); - pipe->Initialize(); - pipe->LoadModelFile(); - pipe->Run(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Testing took " << static_cast(time) / 1000.0 - << " sec." << endl; -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "ConstituencyLabelerPipe.h" + +void TrainConstituencyLabeler(); +void TestConstituencyLabeler(); + +int main(int argc, char** argv) { + // Initialize Google's logging library. + google::InitGoogleLogging(argv[0]); + + // Parse command line flags. + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_train) { + LOG(INFO) << "Training constituency labeler..." << endl; + TrainConstituencyLabeler(); + } else if (FLAGS_test) { + LOG(INFO) << "Running constituency labeler..." 
<< endl; + TestConstituencyLabeler(); + } + + // Destroy allocated memory regarding line flags. + google::ShutDownCommandLineFlags(); + google::ShutdownGoogleLogging(); + return 0; +} + +void TrainConstituencyLabeler() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + ConstituencyLabelerOptions *options = new ConstituencyLabelerOptions; + options->Initialize(); + + ConstituencyLabelerPipe *pipe = new ConstituencyLabelerPipe(options); + pipe->Initialize(); + + LOG(INFO) << "Training the constituency labeler..."; + pipe->Train(); + pipe->SaveModelFile(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Training took " << time << " sec." << endl; +} + +void TestConstituencyLabeler() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + ConstituencyLabelerOptions *options = new ConstituencyLabelerOptions; + options->Initialize(); + + ConstituencyLabelerPipe *pipe = new ConstituencyLabelerPipe(options); + pipe->Initialize(); + pipe->LoadModelFile(); + pipe->Run(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Testing took " << time << " sec." << endl; +} diff --git a/src/coreference_resolver/CoreferenceDecoder.cpp b/src/coreference_resolver/CoreferenceDecoder.cpp index 7255859..53b1e6d 100644 --- a/src/coreference_resolver/CoreferenceDecoder.cpp +++ b/src/coreference_resolver/CoreferenceDecoder.cpp @@ -1,275 +1,275 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. 
-// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "CoreferenceDecoder.h" -#include "CoreferencePart.h" -#include "CoreferencePipe.h" -#include -#include "logval.h" - -// Define a matrix of doubles using Eigen. -typedef LogVal LogValD; -namespace Eigen { -typedef Eigen::Matrix MatrixXlogd; -} - -void CoreferenceDecoder::ComputeLinearCostFunction( - Instance *instance, - Parts *parts, - const std::vector &gold_output, - std::vector *p, - double *q) { - CoreferenceDocumentNumeric *document = - static_cast(instance); - CoreferenceParts *coreference_parts = static_cast(parts); - int num_parts = coreference_parts->size(); - const std::vector &mentions = document->GetMentions(); - - // The cost fucntion is = p'*z + q. - *q = 0.0; // Always 0.0 in this case. - p->assign(num_parts, 0.0); - - for (int j = 0; j < mentions.size(); ++j) { - int entity_id = mentions[j]->id(); // Gold entity (-1 if not gold mention). - // List all possible antecedents. - const std::vector &arcs = coreference_parts->FindArcParts(j); - for (int k = 0; k < arcs.size(); ++k) { - int r = arcs[k]; - int i = static_cast((*parts)[r])->parent_mention(); - // Gold parent entity. - int parent_entity_id = (i >= 0) ? mentions[i]->id() : -1; - if (!document->IsMentionAnaphoric(j) && i >= 0) { - // Mention j starts a gold cluster, but this part doesn't. - (*p)[r] = pipe_->GetCoreferenceOptions()->false_anaphor_cost(); - } else if (document->IsMentionAnaphoric(j) && i < 0) { - // Mention j does not start a gold cluster, but this part does. 
- (*p)[r] = pipe_->GetCoreferenceOptions()->false_new_cost(); - } else if (document->IsMentionAnaphoric(j) && - entity_id != parent_entity_id) { - // Neither mention j or this part start a gold cluster, but the - // antecedents are different. - CHECK_GE(i, 0); - (*p)[r] = pipe_->GetCoreferenceOptions()->false_wrong_link_cost(); - } else { - (*p)[r] = 0.0; - } - } - } -} - -void CoreferenceDecoder::DecodeCostAugmented( - Instance *instance, Parts *parts, - const std::vector &scores, - const std::vector &gold_output, - std::vector *predicted_output, - double *cost, - double *loss) { - CoreferenceParts *coreference_parts = static_cast(parts); - int num_parts = parts->size(); - - std::vector p; - double q; - ComputeLinearCostFunction(instance, parts, gold_output, &p, &q); - - std::vector scores_cost = scores; - for (int r = 0; r < num_parts; ++r) { - scores_cost[r] += p[r]; - } - - Decode(instance, parts, scores_cost, predicted_output); - - *cost = q; - for (int r = 0; r < num_parts; ++r) { - *cost += p[r] * (*predicted_output)[r]; - } - - *loss = *cost; - for (int r = 0; r < num_parts; ++r) { - *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); - } -} - -void CoreferenceDecoder::DecodeCostAugmentedMarginals( - Instance *instance, - Parts *parts, - const std::vector &scores, - const std::vector &gold_output, - std::vector *predicted_output, - double *entropy, - double *cost, - double *loss) { - CoreferenceDocumentNumeric *document = - static_cast(instance); - CoreferenceParts *coreference_parts = static_cast(parts); - int num_parts = coreference_parts->size(); - - predicted_output->clear(); - predicted_output->resize(num_parts, 0.0); - - std::vector p; - double q; - ComputeLinearCostFunction(instance, parts, gold_output, &p, &q); - - std::vector scores_cost = scores; - for (int r = 0; r < num_parts; ++r) { - scores_cost[r] += p[r]; - } - - double log_partition_function; - DecodeBasicMarginals(instance, parts, scores_cost, predicted_output, - 
&log_partition_function, entropy); - - *cost = q; - for (int r = 0; r < num_parts; ++r) { - *cost += p[r] * (*predicted_output)[r]; - } - - *loss = *cost; - for (int r = 0; r < num_parts; ++r) { - *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); - } - - *loss += *entropy; - if (*loss < 0.0) { - LOG(INFO) << "Loss truncated to zero (" << *loss << ")"; - *loss = 0.0; - } -} - -void CoreferenceDecoder::Decode(Instance *instance, Parts *parts, - const std::vector &scores, - std::vector *predicted_output) { - CoreferenceDocumentNumeric *document = - static_cast(instance); - CoreferenceParts *coreference_parts = static_cast(parts); - - predicted_output->clear(); - predicted_output->resize(parts->size(), 0.0); - - const std::vector &mentions = document->GetMentions(); - for (int j = 0; j < mentions.size(); ++j) { - // List all possible antecedents and pick the one with highest score. - const std::vector &arcs = coreference_parts->FindArcParts(j); - int best_antecedent = -1; - for (int k = 0; k < arcs.size(); ++k) { - int r = arcs[k]; - if (best_antecedent < 0 || scores[r] > scores[best_antecedent]) { - best_antecedent = r; - } - } - CHECK_GE(best_antecedent, 0); - (*predicted_output)[best_antecedent] = 1.0; - } -} - -void CoreferenceDecoder::DecodeMarginals(Instance *instance, Parts *parts, - const std::vector &scores, - const std::vector &gold_output, - std::vector *predicted_output, - double *entropy, - double *loss) { - CoreferenceDocumentNumeric *document = - static_cast(instance); - CoreferenceParts *coreference_parts = static_cast(parts); - - predicted_output->clear(); - predicted_output->resize(parts->size(), 0.0); - - int offset_arcs = 0; - int num_arcs = coreference_parts->size(); - - double log_partition_function; - DecodeBasicMarginals(instance, parts, scores, predicted_output, - &log_partition_function, entropy); - - *loss = 0.0; - for (int r = 0; r < parts->size(); ++r) { - *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); - } - - 
*loss += *entropy; - if (*loss < 0.0) { - LOG(INFO) << "Loss truncated to zero (" << *loss << ")"; - *loss = 0.0; - } -} - -// Compute marginals and evaluate log partition function for a coreference tree -// model. -void CoreferenceDecoder::DecodeBasicMarginals( - Instance *instance, Parts *parts, - const std::vector &scores, - std::vector *predicted_output, - double *log_partition_function, - double *entropy) { - CoreferenceDocumentNumeric *document = - static_cast(instance); - CoreferenceParts *coreference_parts = static_cast(parts); - - predicted_output->clear(); - predicted_output->resize(parts->size(), 0.0); - - *log_partition_function = 0.0; - *entropy = 0.0; - const std::vector &mentions = document->GetMentions(); - for (int j = 0; j < mentions.size(); ++j) { - // List all possible antecedents and pick the one with highest score. - const std::vector &arcs = coreference_parts->FindArcParts(j); - int best_antecedent = -1; - // Find the best label for each candidate arc. - LogValD total_score = LogValD::Zero(); - //LOG(INFO) << "num_arcs = " << arcs.size(); - for (int k = 0; k < arcs.size(); ++k) { - int r = arcs[k]; - total_score += LogValD(scores[r], false); - //LOG(INFO) << "scores[" << r << "] = " << scores[r]; - } - //LOG(INFO) << "total score = " << total_score.logabs(); - *log_partition_function += total_score.logabs(); - double sum = 0.0; - for (int k = 0; k < arcs.size(); ++k) { - int r = arcs[k]; - LogValD marginal = LogValD(scores[r], false) / total_score; - double marginal_value = marginal.as_float(); - (*predicted_output)[r] = marginal_value; -#if 0 - if (marginal_value > 0.0) { - LOG(INFO) << "Marginal[" << j << ", " - << static_cast((*parts)[r])->parent_mention() - << "] = " << marginal_value; - } -#endif - if (scores[r] != -std::numeric_limits::infinity()) { - *entropy -= scores[r] * marginal_value; - } else { - CHECK_EQ(marginal_value, 0.0); - } - sum += marginal_value; - } - if (!NEARLY_EQ_TOL(sum, 1.0, 1e-9)) { - LOG(INFO) << "Antecedent 
marginals don't sum to one: sum = " << sum; - } - } - - *entropy += *log_partition_function; - -#if 0 - LOG(INFO) << "Log-partition function: " << *log_partition_function; - LOG(INFO) << "Entropy: " << *entropy; -#endif -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "CoreferenceDecoder.h" +#include "CoreferencePart.h" +#include "CoreferencePipe.h" +#include +#include "logval.h" + +// Define a matrix of doubles using Eigen. +typedef LogVal LogValD; +namespace Eigen { +typedef Eigen::Matrix MatrixXlogd; +} + +void CoreferenceDecoder::ComputeLinearCostFunction( + Instance *instance, + Parts *parts, + const std::vector &gold_output, + std::vector *p, + double *q) { + CoreferenceDocumentNumeric *document = + static_cast(instance); + CoreferenceParts *coreference_parts = static_cast(parts); + int num_parts = (int)coreference_parts->size(); + const std::vector &mentions = document->GetMentions(); + + // The cost fucntion is = p'*z + q. + *q = 0.0; // Always 0.0 in this case. + p->assign(num_parts, 0.0); + + for (int j = 0; j < mentions.size(); ++j) { + int entity_id = mentions[j]->id(); // Gold entity (-1 if not gold mention). + // List all possible antecedents. 
+ const std::vector &arcs = coreference_parts->FindArcParts(j); + for (int k = 0; k < arcs.size(); ++k) { + int r = arcs[k]; + int i = static_cast((*parts)[r])->parent_mention(); + // Gold parent entity. + int parent_entity_id = (i >= 0) ? mentions[i]->id() : -1; + if (!document->IsMentionAnaphoric(j) && i >= 0) { + // Mention j starts a gold cluster, but this part doesn't. + (*p)[r] = pipe_->GetCoreferenceOptions()->false_anaphor_cost(); + } else if (document->IsMentionAnaphoric(j) && i < 0) { + // Mention j does not start a gold cluster, but this part does. + (*p)[r] = pipe_->GetCoreferenceOptions()->false_new_cost(); + } else if (document->IsMentionAnaphoric(j) && + entity_id != parent_entity_id) { + // Neither mention j or this part start a gold cluster, but the + // antecedents are different. + CHECK_GE(i, 0); + (*p)[r] = pipe_->GetCoreferenceOptions()->false_wrong_link_cost(); + } else { + (*p)[r] = 0.0; + } + } + } +} + +void CoreferenceDecoder::DecodeCostAugmented( + Instance *instance, Parts *parts, + const std::vector &scores, + const std::vector &gold_output, + std::vector *predicted_output, + double *cost, + double *loss) { + CoreferenceParts *coreference_parts = static_cast(parts); + int num_parts = (int)parts->size(); + + std::vector p; + double q; + ComputeLinearCostFunction(instance, parts, gold_output, &p, &q); + + std::vector scores_cost = scores; + for (int r = 0; r < num_parts; ++r) { + scores_cost[r] += p[r]; + } + + Decode(instance, parts, scores_cost, predicted_output); + + *cost = q; + for (int r = 0; r < num_parts; ++r) { + *cost += p[r] * (*predicted_output)[r]; + } + + *loss = *cost; + for (int r = 0; r < num_parts; ++r) { + *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); + } +} + +void CoreferenceDecoder::DecodeCostAugmentedMarginals( + Instance *instance, + Parts *parts, + const std::vector &scores, + const std::vector &gold_output, + std::vector *predicted_output, + double *entropy, + double *cost, + double *loss) { + 
CoreferenceDocumentNumeric *document = + static_cast(instance); + CoreferenceParts *coreference_parts = static_cast(parts); + int num_parts = (int)coreference_parts->size(); + + predicted_output->clear(); + predicted_output->resize(num_parts, 0.0); + + std::vector p; + double q; + ComputeLinearCostFunction(instance, parts, gold_output, &p, &q); + + std::vector scores_cost = scores; + for (int r = 0; r < num_parts; ++r) { + scores_cost[r] += p[r]; + } + + double log_partition_function; + DecodeBasicMarginals(instance, parts, scores_cost, predicted_output, + &log_partition_function, entropy); + + *cost = q; + for (int r = 0; r < num_parts; ++r) { + *cost += p[r] * (*predicted_output)[r]; + } + + *loss = *cost; + for (int r = 0; r < num_parts; ++r) { + *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); + } + + *loss += *entropy; + if (*loss < 0.0) { + LOG(INFO) << "Loss truncated to zero (" << *loss << ")"; + *loss = 0.0; + } +} + +void CoreferenceDecoder::Decode(Instance *instance, Parts *parts, + const std::vector &scores, + std::vector *predicted_output) { + CoreferenceDocumentNumeric *document = + static_cast(instance); + CoreferenceParts *coreference_parts = static_cast(parts); + + predicted_output->clear(); + predicted_output->resize(parts->size(), 0.0); + + const std::vector &mentions = document->GetMentions(); + for (int j = 0; j < mentions.size(); ++j) { + // List all possible antecedents and pick the one with highest score. 
+ const std::vector &arcs = coreference_parts->FindArcParts(j); + int best_antecedent = -1; + for (int k = 0; k < arcs.size(); ++k) { + int r = arcs[k]; + if (best_antecedent < 0 || scores[r] > scores[best_antecedent]) { + best_antecedent = r; + } + } + CHECK_GE(best_antecedent, 0); + (*predicted_output)[best_antecedent] = 1.0; + } +} + +void CoreferenceDecoder::DecodeMarginals(Instance *instance, Parts *parts, + const std::vector &scores, + const std::vector &gold_output, + std::vector *predicted_output, + double *entropy, + double *loss) { + CoreferenceDocumentNumeric *document = + static_cast(instance); + CoreferenceParts *coreference_parts = static_cast(parts); + + predicted_output->clear(); + predicted_output->resize(parts->size(), 0.0); + + int offset_arcs = 0; + int num_arcs = (int)coreference_parts->size(); + + double log_partition_function; + DecodeBasicMarginals(instance, parts, scores, predicted_output, + &log_partition_function, entropy); + + *loss = 0.0; + for (int r = 0; r < parts->size(); ++r) { + *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); + } + + *loss += *entropy; + if (*loss < 0.0) { + LOG(INFO) << "Loss truncated to zero (" << *loss << ")"; + *loss = 0.0; + } +} + +// Compute marginals and evaluate log partition function for a coreference tree +// model. +void CoreferenceDecoder::DecodeBasicMarginals( + Instance *instance, Parts *parts, + const std::vector &scores, + std::vector *predicted_output, + double *log_partition_function, + double *entropy) { + CoreferenceDocumentNumeric *document = + static_cast(instance); + CoreferenceParts *coreference_parts = static_cast(parts); + + predicted_output->clear(); + predicted_output->resize(parts->size(), 0.0); + + *log_partition_function = 0.0; + *entropy = 0.0; + const std::vector &mentions = document->GetMentions(); + for (int j = 0; j < mentions.size(); ++j) { + // List all possible antecedents and pick the one with highest score. 
+ const std::vector &arcs = coreference_parts->FindArcParts(j); + int best_antecedent = -1; + // Find the best label for each candidate arc. + LogValD total_score = LogValD::Zero(); + //LOG(INFO) << "num_arcs = " << arcs.size(); + for (int k = 0; k < arcs.size(); ++k) { + int r = arcs[k]; + total_score += LogValD(scores[r], false); + //LOG(INFO) << "scores[" << r << "] = " << scores[r]; + } + //LOG(INFO) << "total score = " << total_score.logabs(); + *log_partition_function += total_score.logabs(); + double sum = 0.0; + for (int k = 0; k < arcs.size(); ++k) { + int r = arcs[k]; + LogValD marginal = LogValD(scores[r], false) / total_score; + double marginal_value = marginal.as_float(); + (*predicted_output)[r] = marginal_value; +#if 0 + if (marginal_value > 0.0) { + LOG(INFO) << "Marginal[" << j << ", " + << static_cast((*parts)[r])->parent_mention() + << "] = " << marginal_value; + } +#endif + if (scores[r] != -std::numeric_limits::infinity()) { + *entropy -= scores[r] * marginal_value; + } else { + CHECK_EQ(marginal_value, 0.0); + } + sum += marginal_value; + } + if (!NEARLY_EQ_TOL(sum, 1.0, 1e-9)) { + LOG(INFO) << "Antecedent marginals don't sum to one: sum = " << sum; + } + } + + *entropy += *log_partition_function; + +#if 0 + LOG(INFO) << "Log-partition function: " << *log_partition_function; + LOG(INFO) << "Entropy: " << *entropy; +#endif +} diff --git a/src/coreference_resolver/CoreferenceDeterminer.h b/src/coreference_resolver/CoreferenceDeterminer.h index 626eaa0..f7e5ce1 100644 --- a/src/coreference_resolver/CoreferenceDeterminer.h +++ b/src/coreference_resolver/CoreferenceDeterminer.h @@ -1,152 +1,152 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef COREFERENCEDETERMINER_H_ -#define COREFERENCEDETERMINER_H_ - -// TODO(atm): there is a lot of redundancy here with CoreferenceDeterminer. -// We should refactor this some time later. -struct CoreferenceDeterminerNumber { - enum types { - SINGULAR = 0, - PLURAL, - UNDEFINED, - COUNT - }; -}; - -struct CoreferenceDeterminerGender { - enum types { - MALE = 0, - FEMALE, - NEUTRAL, - UNDEFINED, - COUNT - }; -}; - -class CoreferenceDeterminer { -public: - CoreferenceDeterminer() { ClearFlags(); } - CoreferenceDeterminer(const std::string &code_flags) { SetFlags(code_flags); } - virtual ~CoreferenceDeterminer() {} - - void Save(FILE *fs) { - bool success; - success = WriteUINT8(fs, number_flag_); - CHECK(success); - success = WriteUINT8(fs, gender_flag_); - CHECK(success); - } - - void Load(FILE *fs) { - bool success; - success = ReadUINT8(fs, &number_flag_); - CHECK(success); - success = ReadUINT8(fs, &gender_flag_); - CHECK(success); - } - - uint8_t number_flag() { return number_flag_; } - uint8_t gender_flag() { return gender_flag_; } - - void ClearFlags() { - number_flag_ = 0x0; - gender_flag_ = 0x0; - } - - void SetFlags(const std::string &code_flags) { - CHECK_EQ(code_flags.length(), 2); - - ClearFlags(); - char ch = code_flags[0]; // Number flag. 
- if (ch == 's') { - SetNumberSingular(); - } else if (ch == 'p') { - SetNumberPlural(); - } else if (ch == 'x') { - SetNumberUndefined(); - } else { - CHECK(false) << "Invalid number flag: " << ch; - } - - ch = code_flags[1]; // Gender flag. - if (ch == 'm') { - SetGenderMale(); - } else if (ch == 'f') { - SetGenderFemale(); - } else if (ch == 'n') { - SetGenderNeutral(); - } else if (ch == 'x') { - SetGenderUndefined(); - } else { - CHECK(false) << "Invalid gender flag: " << ch; - } - } - -public: - bool IsNumberSingular() { - return number_flag_ & (0x1 << CoreferenceDeterminerNumber::SINGULAR); - } - bool IsNumberPlural() { - return number_flag_ & (0x1 << CoreferenceDeterminerNumber::PLURAL); - } - bool IsNumberUndefined() { - return number_flag_ & (0x1 << CoreferenceDeterminerNumber::UNDEFINED); - } - bool IsGenderMale() { - return gender_flag_ & (0x1 << CoreferenceDeterminerGender::MALE); - } - bool IsGenderFemale() { - return gender_flag_ & (0x1 << CoreferenceDeterminerGender::FEMALE); - } - bool IsGenderNeutral() { - return gender_flag_ & (0x1 << CoreferenceDeterminerGender::NEUTRAL); - } - bool IsGenderUndefined() { - return gender_flag_ & (0x1 << CoreferenceDeterminerGender::UNDEFINED); - } - - void SetNumberSingular() { - number_flag_ |= (0x1 << CoreferenceDeterminerNumber::SINGULAR); - } - void SetNumberPlural() { - number_flag_ |= (0x1 << CoreferenceDeterminerNumber::PLURAL); - } - void SetNumberUndefined() { - number_flag_ |= (0x1 << CoreferenceDeterminerNumber::UNDEFINED); - } - void SetGenderMale() { - gender_flag_ |= (0x1 << CoreferenceDeterminerGender::MALE); - } - void SetGenderFemale() { - gender_flag_ |= (0x1 << CoreferenceDeterminerGender::FEMALE); - } - void SetGenderNeutral() { - gender_flag_ |= (0x1 << CoreferenceDeterminerGender::NEUTRAL); - } - void SetGenderUndefined() { - gender_flag_ |= (0x1 << CoreferenceDeterminerGender::UNDEFINED); - } - -protected: - uint8_t number_flag_; - uint8_t gender_flag_; -}; - -#endif /* 
COREFERENCEDETERMINER_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef COREFERENCEDETERMINER_H_ +#define COREFERENCEDETERMINER_H_ + +// TODO(atm): there is a lot of redundancy here with CoreferenceDeterminer. +// We should refactor this some time later. +struct CoreferenceDeterminerNumber { + enum types { + SINGULAR = 0, + PLURAL, + UNDEFINED, + COUNT + }; +}; + +struct CoreferenceDeterminerGender { + enum types { + MALE = 0, + FEMALE, + NEUTRAL, + UNDEFINED, + COUNT + }; +}; + +class CoreferenceDeterminer { +public: + CoreferenceDeterminer() { ClearFlags(); } + CoreferenceDeterminer(const std::string &code_flags) { SetFlags(code_flags); } + virtual ~CoreferenceDeterminer() {} + + void Save(FILE *fs) { + bool success; + success = WriteUINT8(fs, number_flag_); + CHECK(success); + success = WriteUINT8(fs, gender_flag_); + CHECK(success); + } + + void Load(FILE *fs) { + bool success; + success = ReadUINT8(fs, &number_flag_); + CHECK(success); + success = ReadUINT8(fs, &gender_flag_); + CHECK(success); + } + + uint8_t number_flag() { return number_flag_; } + uint8_t gender_flag() { return gender_flag_; } + + void ClearFlags() { + number_flag_ = 0x0; + gender_flag_ = 0x0; + } + + void SetFlags(const std::string &code_flags) { + 
CHECK_EQ(code_flags.length(), 2); + + ClearFlags(); + char ch = code_flags[0]; // Number flag. + if (ch == 's') { + SetNumberSingular(); + } else if (ch == 'p') { + SetNumberPlural(); + } else if (ch == 'x') { + SetNumberUndefined(); + } else { + CHECK(false) << "Invalid number flag: " << ch; + } + + ch = code_flags[1]; // Gender flag. + if (ch == 'm') { + SetGenderMale(); + } else if (ch == 'f') { + SetGenderFemale(); + } else if (ch == 'n') { + SetGenderNeutral(); + } else if (ch == 'x') { + SetGenderUndefined(); + } else { + CHECK(false) << "Invalid gender flag: " << ch; + } + } + +public: + bool IsNumberSingular() { + return (number_flag_ & (0x1 << CoreferenceDeterminerNumber::SINGULAR)) != 0; + } + bool IsNumberPlural() { + return (number_flag_ & (0x1 << CoreferenceDeterminerNumber::PLURAL)) != 0; + } + bool IsNumberUndefined() { + return (number_flag_ & (0x1 << CoreferenceDeterminerNumber::UNDEFINED)) != 0; + } + bool IsGenderMale() { + return (gender_flag_ & (0x1 << CoreferenceDeterminerGender::MALE)) != 0; + } + bool IsGenderFemale() { + return (gender_flag_ & (0x1 << CoreferenceDeterminerGender::FEMALE)) != 0; + } + bool IsGenderNeutral() { + return (gender_flag_ & (0x1 << CoreferenceDeterminerGender::NEUTRAL)) != 0; + } + bool IsGenderUndefined() { + return (gender_flag_ & (0x1 << CoreferenceDeterminerGender::UNDEFINED) ) != 0; + } + + void SetNumberSingular() { + number_flag_ |= (0x1 << CoreferenceDeterminerNumber::SINGULAR); + } + void SetNumberPlural() { + number_flag_ |= (0x1 << CoreferenceDeterminerNumber::PLURAL); + } + void SetNumberUndefined() { + number_flag_ |= (0x1 << CoreferenceDeterminerNumber::UNDEFINED); + } + void SetGenderMale() { + gender_flag_ |= (0x1 << CoreferenceDeterminerGender::MALE); + } + void SetGenderFemale() { + gender_flag_ |= (0x1 << CoreferenceDeterminerGender::FEMALE); + } + void SetGenderNeutral() { + gender_flag_ |= (0x1 << CoreferenceDeterminerGender::NEUTRAL); + } + void SetGenderUndefined() { + gender_flag_ |= (0x1 << 
CoreferenceDeterminerGender::UNDEFINED); + } + +protected: + uint8_t number_flag_; + uint8_t gender_flag_; +}; + +#endif /* COREFERENCEDETERMINER_H_ */ diff --git a/src/coreference_resolver/CoreferenceDictionary.cpp b/src/coreference_resolver/CoreferenceDictionary.cpp index 9af157f..2fe1a20 100644 --- a/src/coreference_resolver/CoreferenceDictionary.cpp +++ b/src/coreference_resolver/CoreferenceDictionary.cpp @@ -213,7 +213,7 @@ void CoreferenceDictionary::CreateMentionWordDictionaries( // Add head word to alphabet. i = mention->head_index(); std::string form = instance->GetForm(i); - transform(form.begin(), form.end(), form.begin(), ::tolower); + std::transform(form.begin(), form.end(), form.begin(), ::tolower); id = head_word_alphabet_.Insert(form); if (id >= head_word_freqs.size()) { CHECK_EQ(id, head_word_freqs.size()); @@ -224,7 +224,7 @@ void CoreferenceDictionary::CreateMentionWordDictionaries( // Add first word to alphabet. i = mention->start(); std::string form = instance->GetForm(i); - transform(form.begin(), form.end(), form.begin(), ::tolower); + std::transform(form.begin(), form.end(), form.begin(), ::tolower); id = first_word_alphabet_.Insert(form); if (id >= first_word_freqs.size()) { CHECK_EQ(id, first_word_freqs.size()); @@ -235,7 +235,7 @@ void CoreferenceDictionary::CreateMentionWordDictionaries( // Add last word to alphabet. 
i = mention->end(); std::string form = instance->GetForm(i); - transform(form.begin(), form.end(), form.begin(), ::tolower); + std::transform(form.begin(), form.end(), form.begin(), ::tolower); id = last_word_alphabet_.Insert(form); if (id >= last_word_freqs.size()) { CHECK_EQ(id, last_word_freqs.size()); @@ -247,7 +247,7 @@ void CoreferenceDictionary::CreateMentionWordDictionaries( i = mention->start() - 1; if (i >= 0) { std::string form = instance->GetForm(i); - transform(form.begin(), form.end(), form.begin(), ::tolower); + std::transform(form.begin(), form.end(), form.begin(), ::tolower); id = previous_word_alphabet_.Insert(form); if (id >= previous_word_freqs.size()) { CHECK_EQ(id, previous_word_freqs.size()); @@ -260,7 +260,7 @@ void CoreferenceDictionary::CreateMentionWordDictionaries( i = mention->end() + 1; if (i < instance->size()) { std::string form = instance->GetForm(i); - transform(form.begin(), form.end(), form.begin(), ::tolower); + std::transform(form.begin(), form.end(), form.begin(), ::tolower); id = next_word_alphabet_.Insert(form); if (id >= next_word_freqs.size()) { CHECK_EQ(id, next_word_freqs.size()); @@ -418,8 +418,8 @@ void CoreferenceDictionary::CreateWordDictionaries( // Add form to alphabet. 
std::string form = instance->GetForm(i); std::string form_lower(form); - transform(form_lower.begin(), form_lower.end(), form_lower.begin(), - ::tolower); + std::transform(form_lower.begin(), form_lower.end(), form_lower.begin(), + ::tolower); id = word_alphabet_.Insert(form); if (id >= word_freqs.size()) { CHECK_EQ(id, word_freqs.size()); @@ -560,8 +560,8 @@ void CoreferenceDictionary::ReadGenderNumberStatistics() { for (int i = 0; i < words.size(); ++i) { const std::string &word = words[i]; std::string word_lower(word); - transform(word_lower.begin(), word_lower.end(), word_lower.begin(), - ::tolower); + std::transform(word_lower.begin(), word_lower.end(), word_lower.begin(), + ::tolower); int word_id = word_alphabet_.Insert(word); @@ -625,8 +625,8 @@ void CoreferenceDictionary::ReadPronouns() { const std::string &form = fields[0]; const std::string code_flags = fields[1]; std::string form_lower(form); - transform(form_lower.begin(), form_lower.end(), form_lower.begin(), - ::tolower); + std::transform(form_lower.begin(), form_lower.end(), form_lower.begin(), + ::tolower); int id = token_dictionary_->GetFormLowerId(form_lower); CHECK_LT(id, 0xffff); if (id < 0) { @@ -673,8 +673,8 @@ void CoreferenceDictionary::ReadPronouns() { const std::string &word = fields[0]; const std::string code_flags = fields[1]; std::string word_lower(word); - transform(word_lower.begin(), word_lower.end(), word_lower.begin(), - ::tolower); + std::transform(word_lower.begin(), word_lower.end(), word_lower.begin(), + ::tolower); int id = word_lower_alphabet_.Lookup(word_lower); if (id < 0) { LOG(INFO) << "Adding unknown pronoun: " @@ -733,8 +733,8 @@ void CoreferenceDictionary::ReadDeterminers() { const std::string &word = fields[0]; const std::string code_flags = fields[1]; std::string word_lower(word); - transform(word_lower.begin(), word_lower.end(), word_lower.begin(), - ::tolower); + std::transform(word_lower.begin(), word_lower.end(), word_lower.begin(), + ::tolower); int id = 
word_lower_alphabet_.Lookup(word_lower); if (id < 0) { LOG(INFO) << "Adding unknown determiner: " @@ -863,4 +863,4 @@ void CoreferenceDictionary::ReadMentionTags() { } is.close(); } -} +} diff --git a/src/coreference_resolver/CoreferenceDictionary.h b/src/coreference_resolver/CoreferenceDictionary.h index 21d5ca2..10cec09 100644 --- a/src/coreference_resolver/CoreferenceDictionary.h +++ b/src/coreference_resolver/CoreferenceDictionary.h @@ -1,607 +1,607 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef COREFERENCEDICTIONARY_H_ -#define COREFERENCEDICTIONARY_H_ - -#include -#include "Dictionary.h" -#include "TokenDictionary.h" -#include "DependencyDictionary.h" -#include "SemanticDictionary.h" -#include "SerializationUtils.h" -#include "CoreferenceReader.h" -#include "CoreferencePronoun.h" -#include "CoreferenceDeterminer.h" - -class Pipe; - -class GenderNumberStatistics { -public: - GenderNumberStatistics() {} - virtual ~GenderNumberStatistics() { Clear(); } - - void Clear() { phrase_counts_.clear(); } - - void Save(FILE *fs) { - bool success; - success = WriteInteger(fs, phrase_counts_.size()); - CHECK(success); - for (std::map, std::vector >::iterator it = - phrase_counts_.begin(); - it != phrase_counts_.end(); - ++it) { - success = WriteIntegerVector(fs, it->first); - CHECK(success); - success = WriteIntegerVector(fs, it->second); - CHECK(success); - } - } - - void Load(FILE *fs) { - bool success; - int length; - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - std::vector phrase; - success = ReadIntegerVector(fs, &phrase); - CHECK(success); - std::vector counts; - success = ReadIntegerVector(fs, &counts); - CHECK(success); - AddPhrase(phrase, counts); - } - } - - bool AddPhrase(const std::vector &phrase, - const std::vector &counts) { - if (phrase_counts_.find(phrase) == phrase_counts_.end()) { - phrase_counts_[phrase] = counts; - return true; - } else { - return false; - } - } - - int ComputeNumber(const std::vector &phrase, int head_index) const; - int ComputeGender(const std::vector &phrase, int head_index) const; - -protected: - std::map, std::vector > phrase_counts_; -}; - -class CoreferenceDictionary : public Dictionary { -public: - CoreferenceDictionary() {} - CoreferenceDictionary(Pipe* pipe) : pipe_(pipe) {} - virtual ~CoreferenceDictionary() { Clear(); } - - void Clear() { - // Don't clear token_dictionary, since this class does not own it. 
- entity_alphabet_.clear(); - constituent_alphabet_.clear(); - word_alphabet_.clear(); - word_lower_alphabet_.clear(); - unigram_ancestry_alphabet_.clear(); - bigram_ancestry_alphabet_.clear(); - - // TODO(atm): clear all the other stuff!!! - } - - void Save(FILE *fs) { - if (0 > entity_alphabet_.Save(fs)) CHECK(false); - if (0 > constituent_alphabet_.Save(fs)) CHECK(false); - if (0 > word_alphabet_.Save(fs)) CHECK(false); - if (0 > word_lower_alphabet_.Save(fs)) CHECK(false); - if (0 > unigram_ancestry_alphabet_.Save(fs)) CHECK(false); - if (0 > bigram_ancestry_alphabet_.Save(fs)) CHECK(false); - - // Save gender/number statistics. - gender_number_statistics_.Save(fs); - - // Save pronouns. - bool success; - int length = all_pronouns_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (std::map::iterator it = - all_pronouns_.begin(); - it != all_pronouns_.end(); - ++it) { - int id = it->first; - CoreferencePronoun *pronoun = it->second; - success = WriteInteger(fs, id); - CHECK(success); - pronoun->Save(fs); - } - - // Save determiners. - length = all_determiners_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (std::map::iterator it = - all_determiners_.begin(); - it != all_determiners_.end(); - ++it) { - int id = it->first; - CoreferenceDeterminer *determiner = it->second; - success = WriteInteger(fs, id); - CHECK(success); - determiner->Save(fs); - } - - // Save various tags. 
- length = named_entity_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (std::set::iterator it = named_entity_tags_.begin(); - it != named_entity_tags_.end(); - ++it) { - int id = *it; - success = WriteInteger(fs, id); - CHECK(success); - } - - length = person_entity_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (std::set::iterator it = person_entity_tags_.begin(); - it != person_entity_tags_.end(); - ++it) { - int id = *it; - success = WriteInteger(fs, id); - CHECK(success); - } - - length = noun_phrase_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (std::set::iterator it = noun_phrase_tags_.begin(); - it != noun_phrase_tags_.end(); - ++it) { - int id = *it; - success = WriteInteger(fs, id); - CHECK(success); - } - - length = proper_noun_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (std::set::iterator it = proper_noun_tags_.begin(); - it != proper_noun_tags_.end(); - ++it) { - int id = *it; - success = WriteInteger(fs, id); - CHECK(success); - } - - length = noun_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (std::set::iterator it = noun_tags_.begin(); - it != noun_tags_.end(); - ++it) { - int id = *it; - success = WriteInteger(fs, id); - CHECK(success); - } - - length = pronominal_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (std::set::iterator it = pronominal_tags_.begin(); - it != pronominal_tags_.end(); - ++it) { - int id = *it; - success = WriteInteger(fs, id); - CHECK(success); - } - } - - void Load(FILE *fs) { - if (0 > entity_alphabet_.Load(fs)) CHECK(false); - if (0 > constituent_alphabet_.Load(fs)) CHECK(false); - if (0 > word_alphabet_.Load(fs)) CHECK(false); - if (0 > word_lower_alphabet_.Load(fs)) CHECK(false); - if (0 > unigram_ancestry_alphabet_.Load(fs)) CHECK(false); - if (0 > bigram_ancestry_alphabet_.Load(fs)) CHECK(false); - entity_alphabet_.BuildNames(); - 
constituent_alphabet_.BuildNames(); - // TODO(atm): Remove this for memory efficiency. - word_alphabet_.BuildNames(); - word_lower_alphabet_.BuildNames(); - unigram_ancestry_alphabet_.BuildNames(); - bigram_ancestry_alphabet_.BuildNames(); - - // Load gender/number statistics. - gender_number_statistics_.Load(fs); - - // Load pronouns. - bool success; - int length; - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - int id; - CoreferencePronoun *pronoun = new CoreferencePronoun; - success = ReadInteger(fs, &id); - CHECK(success); - pronoun->Load(fs); - all_pronouns_[id] = pronoun; - } - - // Load determiners. - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - int id; - CoreferenceDeterminer *determiner = new CoreferenceDeterminer; - success = ReadInteger(fs, &id); - CHECK(success); - determiner->Load(fs); - all_determiners_[id] = determiner; - } - - // Load various tags. - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - int id; - success = ReadInteger(fs, &id); - CHECK(success); - named_entity_tags_.insert(id); - } - - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - int id; - success = ReadInteger(fs, &id); - CHECK(success); - person_entity_tags_.insert(id); - } - - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - int id; - success = ReadInteger(fs, &id); - CHECK(success); - noun_phrase_tags_.insert(id); - } - - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - int id; - success = ReadInteger(fs, &id); - CHECK(success); - proper_noun_tags_.insert(id); - } - - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - int id; - success = ReadInteger(fs, &id); - CHECK(success); - noun_tags_.insert(id); - } - - success = ReadInteger(fs, &length); - CHECK(success); - for (int 
i = 0; i < length; ++i) { - int id; - success = ReadInteger(fs, &id); - CHECK(success); - pronominal_tags_.insert(id); - } - } - - void AllowGrowth() { - entity_alphabet_.AllowGrowth(); - constituent_alphabet_.AllowGrowth(); - word_alphabet_.AllowGrowth(); - word_lower_alphabet_.AllowGrowth(); - unigram_ancestry_alphabet_.AllowGrowth(); - bigram_ancestry_alphabet_.AllowGrowth(); - token_dictionary_->AllowGrowth(); - dependency_dictionary_->AllowGrowth(); - semantic_dictionary_->AllowGrowth(); - } - void StopGrowth() { - entity_alphabet_.StopGrowth(); - constituent_alphabet_.StopGrowth(); - word_alphabet_.StopGrowth(); - word_lower_alphabet_.StopGrowth(); - unigram_ancestry_alphabet_.StopGrowth(); - bigram_ancestry_alphabet_.StopGrowth(); - token_dictionary_->StopGrowth(); - dependency_dictionary_->StopGrowth(); - semantic_dictionary_->StopGrowth(); - } - - void CreateEntityDictionary(CoreferenceSentenceReader *reader); - - void CreateConstituentDictionary(CoreferenceSentenceReader *reader); - - void CreateWordDictionaries(CoreferenceSentenceReader *reader); - - void CreateAncestryDictionaries(CoreferenceSentenceReader *reader); - - void BuildEntityNames() { - entity_alphabet_.BuildNames(); - } - - void BuildConstituentNames() { - constituent_alphabet_.BuildNames(); - } - - void BuildWordNames() { - word_alphabet_.BuildNames(); - word_lower_alphabet_.BuildNames(); - } - - void BuildAncestryNames() { - unigram_ancestry_alphabet_.BuildNames(); - bigram_ancestry_alphabet_.BuildNames(); - } - - const string &GetEntityName(int tag) const { - return entity_alphabet_.GetName(tag); - } - - const string &GetConstituentName(int tag) const { - return constituent_alphabet_.GetName(tag); - } - - const string &GetWord(int word) const { - return word_alphabet_.GetName(word); - } - - const string &GetWordLower(int word) const { - return word_lower_alphabet_.GetName(word); - } - - const string &GetUnigramAncestry(int ancestry) const { - return 
unigram_ancestry_alphabet_.GetName(ancestry); - } - - const string &GetBigramAncestry(int ancestry) const { - return bigram_ancestry_alphabet_.GetName(ancestry); - } - - Pipe *GetPipe() const { return pipe_; } - - TokenDictionary *GetTokenDictionary() const { return token_dictionary_; } - DependencyDictionary *GetDependencyDictionary() const { - return dependency_dictionary_; - } - SemanticDictionary *GetSemanticDictionary() const { - return semantic_dictionary_; - } - void SetTokenDictionary(TokenDictionary *token_dictionary) { - token_dictionary_ = token_dictionary; - } - void SetDependencyDictionary(DependencyDictionary *dependency_dictionary) { - dependency_dictionary_ = dependency_dictionary; - } - void SetSemanticDictionary(SemanticDictionary *semantic_dictionary) { - semantic_dictionary_ = semantic_dictionary; - } - - const Alphabet &GetConstituentAlphabet() const { - return constituent_alphabet_; - }; - - const Alphabet &GetEntityAlphabet() const { - return entity_alphabet_; - }; - - const Alphabet &GetWordAlphabet() const { - return word_alphabet_; - }; - - const Alphabet &GetWordLowerAlphabet() const { - return word_lower_alphabet_; - }; - - const Alphabet &GetUnigramAncestryAlphabet() const { - return unigram_ancestry_alphabet_; - }; - - const Alphabet &GetBigramAncestryAlphabet() const { - return bigram_ancestry_alphabet_; - }; - - const GenderNumberStatistics &GetGenderNumberStatistics() const { - return gender_number_statistics_; - }; - - void ReadGenderNumberStatistics(); - void ReadMentionTags(); - void ReadPronouns(); - void ReadDeterminers(); - - bool IsNamedEntity(int entity_tag) const { - return named_entity_tags_.find(entity_tag) != named_entity_tags_.end(); - } - - bool IsPersonEntity(int entity_tag) const { - return person_entity_tags_.find(entity_tag) != person_entity_tags_.end(); - } - - bool IsNounPhrase(int constituent_tag) const { - return noun_phrase_tags_.find(constituent_tag) != noun_phrase_tags_.end(); - } - - bool IsProperNoun(int 
pos_tag) const { - return proper_noun_tags_.find(pos_tag) != proper_noun_tags_.end(); - } - - bool IsNoun(int pos_tag) const { - return noun_tags_.find(pos_tag) != noun_tags_.end(); - } - - bool IsPronounTag(int pos_tag) const { - return pronominal_tags_.find(pos_tag) != pronominal_tags_.end(); - } - - bool IsPronoun(int word_lower) const { - std::map::const_iterator it = - all_pronouns_.find(word_lower); - return it != all_pronouns_.end(); - } - - CoreferencePronoun *GetPronoun(int word_lower) const { - std::map::const_iterator it = - all_pronouns_.find(word_lower); - if (it == all_pronouns_.end()) return NULL; - return it->second; - } - - bool IsMalePronoun(int word_lower) const { - CoreferencePronoun *pronoun = GetPronoun(word_lower); - if (!pronoun) return false; - return pronoun->IsGenderMale(); - } - - bool IsFemalePronoun(int word_lower) const { - CoreferencePronoun *pronoun = GetPronoun(word_lower); - if (!pronoun) return false; - return pronoun->IsGenderFemale(); - } - - bool IsNeutralPronoun(int word_lower) const { - CoreferencePronoun *pronoun = GetPronoun(word_lower); - if (!pronoun) return false; - return pronoun->IsGenderNeutral(); - } - - bool IsSingularPronoun(int word_lower) const { - CoreferencePronoun *pronoun = GetPronoun(word_lower); - if (!pronoun) return false; - return pronoun->IsNumberSingular(); - } - - bool IsPluralPronoun(int word_lower) const { - CoreferencePronoun *pronoun = GetPronoun(word_lower); - if (!pronoun) return false; - return pronoun->IsNumberPlural(); - } - - bool IsDeterminer(int word_lower) const { - std::map::const_iterator it = - all_determiners_.find(word_lower); - return it != all_determiners_.end(); - } - - CoreferenceDeterminer *GetDeterminer(int word_lower) const { - std::map::const_iterator it = - all_determiners_.find(word_lower); - if (it == all_determiners_.end()) return NULL; - return it->second; - } - - bool IsMaleDeterminer(int word_lower) const { - CoreferenceDeterminer *determiner = 
GetDeterminer(word_lower); - if (!determiner) return false; - return determiner->IsGenderMale(); - } - - bool IsFemaleDeterminer(int word_lower) const { - CoreferenceDeterminer *determiner = GetDeterminer(word_lower); - if (!determiner) return false; - return determiner->IsGenderFemale(); - } - - bool IsNeutralDeterminer(int word_lower) const { - CoreferenceDeterminer *determiner = GetDeterminer(word_lower); - if (!determiner) return false; - return determiner->IsGenderNeutral(); - } - - bool IsSingularDeterminer(int word_lower) const { - CoreferenceDeterminer *determiner = GetDeterminer(word_lower); - if (!determiner) return false; - return determiner->IsNumberSingular(); - } - - bool IsPluralDeterminer(int word_lower) const { - CoreferenceDeterminer *determiner = GetDeterminer(word_lower); - if (!determiner) return false; - return determiner->IsNumberPlural(); - } - - // TODO(atm): this should not be here, but let us keep it for now... - void ComputeDependencyAncestryStrings( - DependencyInstance *instance, - int i, - std::string *unigram_ancestry_string, - std::string *bigram_ancestry_string) const; - -protected: - void DeleteAllPronouns() { - for (std::map::iterator it = - all_pronouns_.begin(); - it != all_pronouns_.end(); - ++it) { - delete it->second; - } - all_pronouns_.clear(); - } - - void DeleteAllDeterminers() { - for (std::map::iterator it = - all_determiners_.begin(); - it != all_determiners_.end(); - ++it) { - delete it->second; - } - all_determiners_.clear(); - } - -protected: - Pipe *pipe_; - TokenDictionary *token_dictionary_; - DependencyDictionary *dependency_dictionary_; - SemanticDictionary *semantic_dictionary_; - Alphabet entity_alphabet_; - Alphabet constituent_alphabet_; - // The two form alphabets below come in addition to the TokenDictionary's - // form alphabet. We have these additional alphabets here since we do not want - // a cutoff and we want to allow loading a lexicon (for gender/number - // computation). 
- Alphabet word_alphabet_; - Alphabet word_lower_alphabet_; - Alphabet unigram_ancestry_alphabet_; - Alphabet bigram_ancestry_alphabet_; - GenderNumberStatistics gender_number_statistics_; - std::map all_pronouns_; - std::map all_determiners_; - std::set named_entity_tags_; - std::set person_entity_tags_; - std::set noun_tags_; - std::set noun_phrase_tags_; - std::set proper_noun_tags_; - std::set pronominal_tags_; - //Alphabet tag_alphabet_; -}; - -#endif /* COREFERENCEDICTIONARY_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef COREFERENCEDICTIONARY_H_ +#define COREFERENCEDICTIONARY_H_ + +#include +#include "Dictionary.h" +#include "TokenDictionary.h" +#include "DependencyDictionary.h" +#include "SemanticDictionary.h" +#include "SerializationUtils.h" +#include "CoreferenceReader.h" +#include "CoreferencePronoun.h" +#include "CoreferenceDeterminer.h" + +class Pipe; + +class GenderNumberStatistics { +public: + GenderNumberStatistics() {} + virtual ~GenderNumberStatistics() { Clear(); } + + void Clear() { phrase_counts_.clear(); } + + void Save(FILE *fs) { + bool success; + success = WriteInteger(fs, (int) phrase_counts_.size()); + CHECK(success); + for (std::map, std::vector >::iterator it = + phrase_counts_.begin(); + it != phrase_counts_.end(); + ++it) { + success = WriteIntegerVector(fs, it->first); + CHECK(success); + success = WriteIntegerVector(fs, it->second); + CHECK(success); + } + } + + void Load(FILE *fs) { + bool success; + int length; + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + std::vector phrase; + success = ReadIntegerVector(fs, &phrase); + CHECK(success); + std::vector counts; + success = ReadIntegerVector(fs, &counts); + CHECK(success); + AddPhrase(phrase, counts); + } + } + + bool AddPhrase(const std::vector &phrase, + const std::vector &counts) { + if (phrase_counts_.find(phrase) == phrase_counts_.end()) { + phrase_counts_[phrase] = counts; + return true; + } else { + return false; + } + } + + int ComputeNumber(const std::vector &phrase, int head_index) const; + int ComputeGender(const std::vector &phrase, int head_index) const; + +protected: + std::map, std::vector > phrase_counts_; +}; + +class CoreferenceDictionary : public Dictionary { +public: + CoreferenceDictionary() {} + CoreferenceDictionary(Pipe* pipe) : pipe_(pipe) {} + virtual ~CoreferenceDictionary() { Clear(); } + + void Clear() { + // Don't clear token_dictionary, since this class does not own it. 
+ entity_alphabet_.clear(); + constituent_alphabet_.clear(); + word_alphabet_.clear(); + word_lower_alphabet_.clear(); + unigram_ancestry_alphabet_.clear(); + bigram_ancestry_alphabet_.clear(); + + // TODO(atm): clear all the other stuff!!! + } + + void Save(FILE *fs) { + if (0 > entity_alphabet_.Save(fs)) CHECK(false); + if (0 > constituent_alphabet_.Save(fs)) CHECK(false); + if (0 > word_alphabet_.Save(fs)) CHECK(false); + if (0 > word_lower_alphabet_.Save(fs)) CHECK(false); + if (0 > unigram_ancestry_alphabet_.Save(fs)) CHECK(false); + if (0 > bigram_ancestry_alphabet_.Save(fs)) CHECK(false); + + // Save gender/number statistics. + gender_number_statistics_.Save(fs); + + // Save pronouns. + bool success; + int length = (int)all_pronouns_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (std::map::iterator it = + all_pronouns_.begin(); + it != all_pronouns_.end(); + ++it) { + int id = it->first; + CoreferencePronoun *pronoun = it->second; + success = WriteInteger(fs, id); + CHECK(success); + pronoun->Save(fs); + } + + // Save determiners. + length = (int)all_determiners_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (std::map::iterator it = + all_determiners_.begin(); + it != all_determiners_.end(); + ++it) { + int id = it->first; + CoreferenceDeterminer *determiner = it->second; + success = WriteInteger(fs, id); + CHECK(success); + determiner->Save(fs); + } + + // Save various tags. 
+ length = (int)named_entity_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (std::set::iterator it = named_entity_tags_.begin(); + it != named_entity_tags_.end(); + ++it) { + int id = *it; + success = WriteInteger(fs, id); + CHECK(success); + } + + length = (int)person_entity_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (std::set::iterator it = person_entity_tags_.begin(); + it != person_entity_tags_.end(); + ++it) { + int id = *it; + success = WriteInteger(fs, id); + CHECK(success); + } + + length = (int)noun_phrase_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (std::set::iterator it = noun_phrase_tags_.begin(); + it != noun_phrase_tags_.end(); + ++it) { + int id = *it; + success = WriteInteger(fs, id); + CHECK(success); + } + + length = (int)proper_noun_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (std::set::iterator it = proper_noun_tags_.begin(); + it != proper_noun_tags_.end(); + ++it) { + int id = *it; + success = WriteInteger(fs, id); + CHECK(success); + } + + length = (int)noun_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (std::set::iterator it = noun_tags_.begin(); + it != noun_tags_.end(); + ++it) { + int id = *it; + success = WriteInteger(fs, id); + CHECK(success); + } + + length = (int)pronominal_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (std::set::iterator it = pronominal_tags_.begin(); + it != pronominal_tags_.end(); + ++it) { + int id = *it; + success = WriteInteger(fs, id); + CHECK(success); + } + } + + void Load(FILE *fs) { + if (0 > entity_alphabet_.Load(fs)) CHECK(false); + if (0 > constituent_alphabet_.Load(fs)) CHECK(false); + if (0 > word_alphabet_.Load(fs)) CHECK(false); + if (0 > word_lower_alphabet_.Load(fs)) CHECK(false); + if (0 > unigram_ancestry_alphabet_.Load(fs)) CHECK(false); + if (0 > bigram_ancestry_alphabet_.Load(fs)) CHECK(false); + 
entity_alphabet_.BuildNames(); + constituent_alphabet_.BuildNames(); + // TODO(atm): Remove this for memory efficiency. + word_alphabet_.BuildNames(); + word_lower_alphabet_.BuildNames(); + unigram_ancestry_alphabet_.BuildNames(); + bigram_ancestry_alphabet_.BuildNames(); + + // Load gender/number statistics. + gender_number_statistics_.Load(fs); + + // Load pronouns. + bool success; + int length; + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int id; + CoreferencePronoun *pronoun = new CoreferencePronoun; + success = ReadInteger(fs, &id); + CHECK(success); + pronoun->Load(fs); + all_pronouns_[id] = pronoun; + } + + // Load determiners. + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int id; + CoreferenceDeterminer *determiner = new CoreferenceDeterminer; + success = ReadInteger(fs, &id); + CHECK(success); + determiner->Load(fs); + all_determiners_[id] = determiner; + } + + // Load various tags. 
+ success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int id; + success = ReadInteger(fs, &id); + CHECK(success); + named_entity_tags_.insert(id); + } + + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int id; + success = ReadInteger(fs, &id); + CHECK(success); + person_entity_tags_.insert(id); + } + + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int id; + success = ReadInteger(fs, &id); + CHECK(success); + noun_phrase_tags_.insert(id); + } + + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int id; + success = ReadInteger(fs, &id); + CHECK(success); + proper_noun_tags_.insert(id); + } + + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int id; + success = ReadInteger(fs, &id); + CHECK(success); + noun_tags_.insert(id); + } + + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int id; + success = ReadInteger(fs, &id); + CHECK(success); + pronominal_tags_.insert(id); + } + } + + void AllowGrowth() { + entity_alphabet_.AllowGrowth(); + constituent_alphabet_.AllowGrowth(); + word_alphabet_.AllowGrowth(); + word_lower_alphabet_.AllowGrowth(); + unigram_ancestry_alphabet_.AllowGrowth(); + bigram_ancestry_alphabet_.AllowGrowth(); + token_dictionary_->AllowGrowth(); + dependency_dictionary_->AllowGrowth(); + semantic_dictionary_->AllowGrowth(); + } + void StopGrowth() { + entity_alphabet_.StopGrowth(); + constituent_alphabet_.StopGrowth(); + word_alphabet_.StopGrowth(); + word_lower_alphabet_.StopGrowth(); + unigram_ancestry_alphabet_.StopGrowth(); + bigram_ancestry_alphabet_.StopGrowth(); + token_dictionary_->StopGrowth(); + dependency_dictionary_->StopGrowth(); + semantic_dictionary_->StopGrowth(); + } + + void CreateEntityDictionary(CoreferenceSentenceReader *reader); + + void 
CreateConstituentDictionary(CoreferenceSentenceReader *reader); + + void CreateWordDictionaries(CoreferenceSentenceReader *reader); + + void CreateAncestryDictionaries(CoreferenceSentenceReader *reader); + + void BuildEntityNames() { + entity_alphabet_.BuildNames(); + } + + void BuildConstituentNames() { + constituent_alphabet_.BuildNames(); + } + + void BuildWordNames() { + word_alphabet_.BuildNames(); + word_lower_alphabet_.BuildNames(); + } + + void BuildAncestryNames() { + unigram_ancestry_alphabet_.BuildNames(); + bigram_ancestry_alphabet_.BuildNames(); + } + + const string &GetEntityName(int tag) const { + return entity_alphabet_.GetName(tag); + } + + const string &GetConstituentName(int tag) const { + return constituent_alphabet_.GetName(tag); + } + + const string &GetWord(int word) const { + return word_alphabet_.GetName(word); + } + + const string &GetWordLower(int word) const { + return word_lower_alphabet_.GetName(word); + } + + const string &GetUnigramAncestry(int ancestry) const { + return unigram_ancestry_alphabet_.GetName(ancestry); + } + + const string &GetBigramAncestry(int ancestry) const { + return bigram_ancestry_alphabet_.GetName(ancestry); + } + + Pipe *GetPipe() const { return pipe_; } + + TokenDictionary *GetTokenDictionary() const { return token_dictionary_; } + DependencyDictionary *GetDependencyDictionary() const { + return dependency_dictionary_; + } + SemanticDictionary *GetSemanticDictionary() const { + return semantic_dictionary_; + } + void SetTokenDictionary(TokenDictionary *token_dictionary) { + token_dictionary_ = token_dictionary; + } + void SetDependencyDictionary(DependencyDictionary *dependency_dictionary) { + dependency_dictionary_ = dependency_dictionary; + } + void SetSemanticDictionary(SemanticDictionary *semantic_dictionary) { + semantic_dictionary_ = semantic_dictionary; + } + + const Alphabet &GetConstituentAlphabet() const { + return constituent_alphabet_; + }; + + const Alphabet &GetEntityAlphabet() const { + return 
entity_alphabet_; + }; + + const Alphabet &GetWordAlphabet() const { + return word_alphabet_; + }; + + const Alphabet &GetWordLowerAlphabet() const { + return word_lower_alphabet_; + }; + + const Alphabet &GetUnigramAncestryAlphabet() const { + return unigram_ancestry_alphabet_; + }; + + const Alphabet &GetBigramAncestryAlphabet() const { + return bigram_ancestry_alphabet_; + }; + + const GenderNumberStatistics &GetGenderNumberStatistics() const { + return gender_number_statistics_; + }; + + void ReadGenderNumberStatistics(); + void ReadMentionTags(); + void ReadPronouns(); + void ReadDeterminers(); + + bool IsNamedEntity(int entity_tag) const { + return named_entity_tags_.find(entity_tag) != named_entity_tags_.end(); + } + + bool IsPersonEntity(int entity_tag) const { + return person_entity_tags_.find(entity_tag) != person_entity_tags_.end(); + } + + bool IsNounPhrase(int constituent_tag) const { + return noun_phrase_tags_.find(constituent_tag) != noun_phrase_tags_.end(); + } + + bool IsProperNoun(int pos_tag) const { + return proper_noun_tags_.find(pos_tag) != proper_noun_tags_.end(); + } + + bool IsNoun(int pos_tag) const { + return noun_tags_.find(pos_tag) != noun_tags_.end(); + } + + bool IsPronounTag(int pos_tag) const { + return pronominal_tags_.find(pos_tag) != pronominal_tags_.end(); + } + + bool IsPronoun(int word_lower) const { + std::map::const_iterator it = + all_pronouns_.find(word_lower); + return it != all_pronouns_.end(); + } + + CoreferencePronoun *GetPronoun(int word_lower) const { + std::map::const_iterator it = + all_pronouns_.find(word_lower); + if (it == all_pronouns_.end()) return NULL; + return it->second; + } + + bool IsMalePronoun(int word_lower) const { + CoreferencePronoun *pronoun = GetPronoun(word_lower); + if (!pronoun) return false; + return pronoun->IsGenderMale(); + } + + bool IsFemalePronoun(int word_lower) const { + CoreferencePronoun *pronoun = GetPronoun(word_lower); + if (!pronoun) return false; + return 
pronoun->IsGenderFemale(); + } + + bool IsNeutralPronoun(int word_lower) const { + CoreferencePronoun *pronoun = GetPronoun(word_lower); + if (!pronoun) return false; + return pronoun->IsGenderNeutral(); + } + + bool IsSingularPronoun(int word_lower) const { + CoreferencePronoun *pronoun = GetPronoun(word_lower); + if (!pronoun) return false; + return pronoun->IsNumberSingular(); + } + + bool IsPluralPronoun(int word_lower) const { + CoreferencePronoun *pronoun = GetPronoun(word_lower); + if (!pronoun) return false; + return pronoun->IsNumberPlural(); + } + + bool IsDeterminer(int word_lower) const { + std::map::const_iterator it = + all_determiners_.find(word_lower); + return it != all_determiners_.end(); + } + + CoreferenceDeterminer *GetDeterminer(int word_lower) const { + std::map::const_iterator it = + all_determiners_.find(word_lower); + if (it == all_determiners_.end()) return NULL; + return it->second; + } + + bool IsMaleDeterminer(int word_lower) const { + CoreferenceDeterminer *determiner = GetDeterminer(word_lower); + if (!determiner) return false; + return determiner->IsGenderMale(); + } + + bool IsFemaleDeterminer(int word_lower) const { + CoreferenceDeterminer *determiner = GetDeterminer(word_lower); + if (!determiner) return false; + return determiner->IsGenderFemale(); + } + + bool IsNeutralDeterminer(int word_lower) const { + CoreferenceDeterminer *determiner = GetDeterminer(word_lower); + if (!determiner) return false; + return determiner->IsGenderNeutral(); + } + + bool IsSingularDeterminer(int word_lower) const { + CoreferenceDeterminer *determiner = GetDeterminer(word_lower); + if (!determiner) return false; + return determiner->IsNumberSingular(); + } + + bool IsPluralDeterminer(int word_lower) const { + CoreferenceDeterminer *determiner = GetDeterminer(word_lower); + if (!determiner) return false; + return determiner->IsNumberPlural(); + } + + // TODO(atm): this should not be here, but let us keep it for now... 
+ void ComputeDependencyAncestryStrings( + DependencyInstance *instance, + int i, + std::string *unigram_ancestry_string, + std::string *bigram_ancestry_string) const; + +protected: + void DeleteAllPronouns() { + for (std::map::iterator it = + all_pronouns_.begin(); + it != all_pronouns_.end(); + ++it) { + delete it->second; + } + all_pronouns_.clear(); + } + + void DeleteAllDeterminers() { + for (std::map::iterator it = + all_determiners_.begin(); + it != all_determiners_.end(); + ++it) { + delete it->second; + } + all_determiners_.clear(); + } + +protected: + Pipe *pipe_; + TokenDictionary *token_dictionary_; + DependencyDictionary *dependency_dictionary_; + SemanticDictionary *semantic_dictionary_; + Alphabet entity_alphabet_; + Alphabet constituent_alphabet_; + // The two form alphabets below come in addition to the TokenDictionary's + // form alphabet. We have these additional alphabets here since we do not want + // a cutoff and we want to allow loading a lexicon (for gender/number + // computation). + Alphabet word_alphabet_; + Alphabet word_lower_alphabet_; + Alphabet unigram_ancestry_alphabet_; + Alphabet bigram_ancestry_alphabet_; + GenderNumberStatistics gender_number_statistics_; + std::map all_pronouns_; + std::map all_determiners_; + std::set named_entity_tags_; + std::set person_entity_tags_; + std::set noun_tags_; + std::set noun_phrase_tags_; + std::set proper_noun_tags_; + std::set pronominal_tags_; + //Alphabet tag_alphabet_; +}; + +#endif /* COREFERENCEDICTIONARY_H_ */ diff --git a/src/coreference_resolver/CoreferenceDocument.h b/src/coreference_resolver/CoreferenceDocument.h index 9c0b078..25339f1 100644 --- a/src/coreference_resolver/CoreferenceDocument.h +++ b/src/coreference_resolver/CoreferenceDocument.h @@ -1,63 +1,63 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef COREFERENCEDOCUMENT_H_ -#define COREFERENCEDOCUMENT_H_ - -#include -#include -#include "CoreferenceSentence.h" - -class CoreferenceDocument : public Instance { -public: - CoreferenceDocument() { conversation_ = false; } - virtual ~CoreferenceDocument() { DeleteAllSentences(); } - - Instance* Copy() { - CoreferenceDocument* document = new CoreferenceDocument(); - document->Initialize(name_, part_number_, sentences_); - return static_cast(document); - } - - void Initialize(const std::string &name, - int part_number, - const std::vector &sentences); - - int GetNumSentences() { return sentences_.size(); } - CoreferenceSentence *GetSentence(int i) { return sentences_[i]; } - - const std::string &name() { return name_; } - int part_number() { return part_number_; } - bool is_conversation() { return conversation_; } - -protected: - void DeleteAllSentences(); - -protected: - // Document name and part number. - std::string name_; - int part_number_; - - // True if conversation. - bool conversation_; - - // List of sentences composing this document. - std::vector sentences_; -}; - -#endif /* COREFERENCEDOCUMENT_H_*/ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. 
+// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef COREFERENCEDOCUMENT_H_ +#define COREFERENCEDOCUMENT_H_ + +#include +#include +#include "CoreferenceSentence.h" + +class CoreferenceDocument : public Instance { +public: + CoreferenceDocument() { conversation_ = false; } + virtual ~CoreferenceDocument() { DeleteAllSentences(); } + + Instance* Copy() { + CoreferenceDocument* document = new CoreferenceDocument(); + document->Initialize(name_, part_number_, sentences_); + return static_cast(document); + } + + void Initialize(const std::string &name, + int part_number, + const std::vector &sentences); + + int GetNumSentences() { return (int)sentences_.size(); } + CoreferenceSentence *GetSentence(int i) { return sentences_[i]; } + + const std::string &name() { return name_; } + int part_number() { return part_number_; } + bool is_conversation() { return conversation_; } + +protected: + void DeleteAllSentences(); + +protected: + // Document name and part number. + std::string name_; + int part_number_; + + // True if conversation. + bool conversation_; + + // List of sentences composing this document. 
+ std::vector sentences_; +}; + +#endif /* COREFERENCEDOCUMENT_H_*/ diff --git a/src/coreference_resolver/CoreferenceDocumentNumeric.h b/src/coreference_resolver/CoreferenceDocumentNumeric.h index 44f10c0..f7e434c 100644 --- a/src/coreference_resolver/CoreferenceDocumentNumeric.h +++ b/src/coreference_resolver/CoreferenceDocumentNumeric.h @@ -1,128 +1,128 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef COREFERENCEDOCUMENTNUMERIC_H_ -#define COREFERENCEDOCUMENTNUMERIC_H_ - -#include "CoreferenceSentenceNumeric.h" -#include "CoreferenceDocument.h" -#include "CoreferenceDictionary.h" - -class CoreferenceDocumentNumeric : public Instance { -public: - CoreferenceDocumentNumeric() { conversation_ = false; }; - virtual ~CoreferenceDocumentNumeric() { Clear(); }; - - Instance* Copy() { - CHECK(false) << "Not implemented."; - return NULL; - } - - void Clear() { - sentence_cumulative_lengths_.clear(); - for (int i = 0; i < coreference_spans_.size(); ++i) { - delete coreference_spans_[i]; - } - coreference_spans_.clear(); - // TODO(atm): if document owns mentions, they should be deleted here. 
- mentions_.clear(); - entity_clusters_.clear(); - DeleteAllSentences(); - } - - void Initialize(const CoreferenceDictionary &dictionary, - CoreferenceDocument *instance, - bool add_gold_mentions); - - // True if document is a conversation. - bool is_conversation() { return conversation_; } - - // Returns the number of sentences in the document. - int GetNumSentences() { return sentences_.size(); } - - // Returns the number of words in the document. - int GetNumWords() { return sentence_cumulative_lengths_.back(); } - - // Returns the i-th sentence. - CoreferenceSentenceNumeric *GetSentence(int i) { return sentences_[i]; } - - // Returns the mentions. - const std::vector &GetMentions() { return mentions_; } - - // Returns the mentions. - const std::vector > &GetEntityClusters() { - return entity_clusters_; - } - - bool IsMentionAnaphoric(int j) { - return mentions_[j]->id() >= 0 && - entity_clusters_[mentions_[j]->id()][0] != j; - } - - // Get the sentence to which a word belongs. - // Note: this takes linear time w.r.t. the number of sentences. - void FindSentencePosition(int word_document_index, int *sentence_index, - int *word_sentence_index) { - *sentence_index = -1; - for (int i = 0; i < sentence_cumulative_lengths_.size(); ++i) { - ++(*sentence_index); - if (word_document_index < sentence_cumulative_lengths_[i]) break; - } - if (*sentence_index > 0) { - *word_sentence_index = - word_document_index - sentence_cumulative_lengths_[*sentence_index - 1]; - } else { - *word_sentence_index = word_document_index; - } - // Increment the word index since sentences have a start symbol. - ++(*word_sentence_index); - } - - // Return the global word position at document level. - int GetDocumentPosition(int sentence_index, int word_sentence_index) { - // Subtract 1 since sentences have a start symbol. 
- if (sentence_index == 0) { - return word_sentence_index - 1; - } else { - return word_sentence_index - 1 + - sentence_cumulative_lengths_[sentence_index - 1]; - } - } - -protected: - void DeleteAllSentences() { - for (int i = 0; i < sentences_.size(); ++i) { - delete sentences_[i]; - } - sentences_.clear(); - } - - void ComputeEntityClusters(); - - void ComputeGlobalWordPositions(CoreferenceDocument* instance); - -private: - bool conversation_; - std::vector sentences_; - std::vector sentence_cumulative_lengths_; - std::vector coreference_spans_; - std::vector mentions_; - std::vector > entity_clusters_; -}; - -#endif /* COREFERENCEDOCUMENTNUMERIC_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef COREFERENCEDOCUMENTNUMERIC_H_ +#define COREFERENCEDOCUMENTNUMERIC_H_ + +#include "CoreferenceSentenceNumeric.h" +#include "CoreferenceDocument.h" +#include "CoreferenceDictionary.h" + +class CoreferenceDocumentNumeric : public Instance { +public: + CoreferenceDocumentNumeric() { conversation_ = false; }; + virtual ~CoreferenceDocumentNumeric() { Clear(); }; + + Instance* Copy() { + CHECK(false) << "Not implemented."; + return NULL; + } + + void Clear() { + sentence_cumulative_lengths_.clear(); + for (int i = 0; i < coreference_spans_.size(); ++i) { + delete coreference_spans_[i]; + } + coreference_spans_.clear(); + // TODO(atm): if document owns mentions, they should be deleted here. + mentions_.clear(); + entity_clusters_.clear(); + DeleteAllSentences(); + } + + void Initialize(const CoreferenceDictionary &dictionary, + CoreferenceDocument *instance, + bool add_gold_mentions); + + // True if document is a conversation. + bool is_conversation() { return conversation_; } + + // Returns the number of sentences in the document. + int GetNumSentences() { return (int)sentences_.size(); } + + // Returns the number of words in the document. + int GetNumWords() { return sentence_cumulative_lengths_.back(); } + + // Returns the i-th sentence. + CoreferenceSentenceNumeric *GetSentence(int i) { return sentences_[i]; } + + // Returns the mentions. + const std::vector &GetMentions() { return mentions_; } + + // Returns the mentions. + const std::vector > &GetEntityClusters() { + return entity_clusters_; + } + + bool IsMentionAnaphoric(int j) { + return mentions_[j]->id() >= 0 && + entity_clusters_[mentions_[j]->id()][0] != j; + } + + // Get the sentence to which a word belongs. + // Note: this takes linear time w.r.t. the number of sentences. 
+ void FindSentencePosition(int word_document_index, int *sentence_index, + int *word_sentence_index) { + *sentence_index = -1; + for (int i = 0; i < sentence_cumulative_lengths_.size(); ++i) { + ++(*sentence_index); + if (word_document_index < sentence_cumulative_lengths_[i]) break; + } + if (*sentence_index > 0) { + *word_sentence_index = + word_document_index - sentence_cumulative_lengths_[*sentence_index - 1]; + } else { + *word_sentence_index = word_document_index; + } + // Increment the word index since sentences have a start symbol. + ++(*word_sentence_index); + } + + // Return the global word position at document level. + int GetDocumentPosition(int sentence_index, int word_sentence_index) { + // Subtract 1 since sentences have a start symbol. + if (sentence_index == 0) { + return word_sentence_index - 1; + } else { + return word_sentence_index - 1 + + sentence_cumulative_lengths_[sentence_index - 1]; + } + } + +protected: + void DeleteAllSentences() { + for (int i = 0; i < sentences_.size(); ++i) { + delete sentences_[i]; + } + sentences_.clear(); + } + + void ComputeEntityClusters(); + + void ComputeGlobalWordPositions(CoreferenceDocument* instance); + +private: + bool conversation_; + std::vector sentences_; + std::vector sentence_cumulative_lengths_; + std::vector coreference_spans_; + std::vector mentions_; + std::vector > entity_clusters_; +}; + +#endif /* COREFERENCEDOCUMENTNUMERIC_H_ */ diff --git a/src/coreference_resolver/CoreferenceFeatures.h b/src/coreference_resolver/CoreferenceFeatures.h index 4a2d53c..d9f5944 100644 --- a/src/coreference_resolver/CoreferenceFeatures.h +++ b/src/coreference_resolver/CoreferenceFeatures.h @@ -1,81 +1,81 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef COREFERENCEFEATURES_H_ -#define COREFERENCEFEATURES_H_ - -#include "Features.h" -#include "CoreferenceDocumentNumeric.h" -#include "FeatureEncoder.h" - -class CoreferenceOptions; - -class CoreferenceFeatures : public Features { -public: - CoreferenceFeatures() {}; - CoreferenceFeatures(Pipe* pipe) { pipe_ = pipe; } - virtual ~CoreferenceFeatures() { Clear(); } - -public: - void Clear() { - for (int r = 0; r < input_features_.size(); ++r) { - if (!input_features_[r]) continue; - input_features_[r]->clear(); - delete input_features_[r]; - input_features_[r] = NULL; - } - input_features_.clear(); - } - - void Initialize(Instance *instance, Parts *parts) { - Clear(); - input_features_.resize(parts->size(), static_cast(NULL)); - } - - int GetNumPartFeatures(int r) const { - return (NULL == input_features_[r]) ? 
0 : input_features_[r]->size(); - }; - - int GetPartFeature(int r, int j) const { - return (*input_features_[r])[j]; - } - - const BinaryFeatures &GetPartFeatures(int r) const { - return *(input_features_[r]); - }; - - BinaryFeatures *GetMutablePartFeatures(int r) const { - return input_features_[r]; - }; - -public: - void AddArcFeatures(CoreferenceDocumentNumeric *document, - int r, - int parent_mention, - int child_mention); - - void AddFeature(uint64_t fkey, BinaryFeatures* features) { - features->push_back(fkey); - } - -protected: - vector input_features_; // Vector of input features. - FeatureEncoder encoder_; // Encoder that converts features into a codeword. -}; - -#endif /* COREFERENCEFEATURES_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef COREFERENCEFEATURES_H_ +#define COREFERENCEFEATURES_H_ + +#include "Features.h" +#include "CoreferenceDocumentNumeric.h" +#include "FeatureEncoder.h" + +class CoreferenceOptions; + +class CoreferenceFeatures : public Features { +public: + CoreferenceFeatures() {}; + CoreferenceFeatures(Pipe* pipe) { pipe_ = pipe; } + virtual ~CoreferenceFeatures() { Clear(); } + +public: + void Clear() { + for (int r = 0; r < input_features_.size(); ++r) { + if (!input_features_[r]) continue; + input_features_[r]->clear(); + delete input_features_[r]; + input_features_[r] = NULL; + } + input_features_.clear(); + } + + void Initialize(Instance *instance, Parts *parts) { + Clear(); + input_features_.resize(parts->size(), static_cast(NULL)); + } + + int GetNumPartFeatures(int r) const { + return (NULL == input_features_[r]) ? 0 : (int)(input_features_[r]->size()); + }; + + int GetPartFeature(int r, int j) const { + return (*input_features_[r])[j]; + } + + const BinaryFeatures &GetPartFeatures(int r) const { + return *(input_features_[r]); + }; + + BinaryFeatures *GetMutablePartFeatures(int r) const { + return input_features_[r]; + }; + +public: + void AddArcFeatures(CoreferenceDocumentNumeric *document, + int r, + int parent_mention, + int child_mention); + + void AddFeature(uint64_t fkey, BinaryFeatures* features) { + features->push_back(fkey); + } + +protected: + vector input_features_; // Vector of input features. + FeatureEncoder encoder_; // Encoder that converts features into a codeword. +}; + +#endif /* COREFERENCEFEATURES_H_ */ diff --git a/src/coreference_resolver/CoreferencePart.cpp b/src/coreference_resolver/CoreferencePart.cpp index 3cc388b..4f461ea 100644 --- a/src/coreference_resolver/CoreferencePart.cpp +++ b/src/coreference_resolver/CoreferencePart.cpp @@ -1,55 +1,55 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "CoreferencePart.h" - -void CoreferenceParts::DeleteAll() { - DeleteIndices(); - - for (iterator iter = begin(); iter != end(); iter++) { - if ((*iter) != NULL) { - delete (*iter); - *iter = NULL; - } - } - - clear(); -} - -void CoreferenceParts::DeleteIndices() { - for (int i = 0; i < index_.size(); ++i) { - index_[i].clear(); - } - index_.clear(); -} - -void CoreferenceParts::BuildIndices(int num_mentions) { - DeleteIndices(); - index_.resize(num_mentions); - - int num_arcs = size(); - int offset = 0; - for (int r = 0; r < num_arcs; ++r) { - Part *part = (*this)[offset + r]; - CHECK(part->type() == COREFERENCEPART_ARC); - int i = static_cast(part)->child_mention(); - CHECK_GE(i, 0); - CHECK_LT(i, num_mentions); - index_[i].push_back(offset + r); - } -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "CoreferencePart.h" + +void CoreferenceParts::DeleteAll() { + DeleteIndices(); + + for (iterator iter = begin(); iter != end(); iter++) { + if ((*iter) != NULL) { + delete (*iter); + *iter = NULL; + } + } + + clear(); +} + +void CoreferenceParts::DeleteIndices() { + for (int i = 0; i < index_.size(); ++i) { + index_[i].clear(); + } + index_.clear(); +} + +void CoreferenceParts::BuildIndices(int num_mentions) { + DeleteIndices(); + index_.resize(num_mentions); + + int num_arcs = (int)size(); + int offset = 0; + for (int r = 0; r < num_arcs; ++r) { + Part *part = (*this)[offset + r]; + CHECK(part->type() == COREFERENCEPART_ARC); + int i = static_cast(part)->child_mention(); + CHECK_GE(i, 0); + CHECK_LT(i, num_mentions); + index_[i].push_back(offset + r); + } +} diff --git a/src/coreference_resolver/CoreferencePipe.cpp b/src/coreference_resolver/CoreferencePipe.cpp index b966836..b5e995f 100644 --- a/src/coreference_resolver/CoreferencePipe.cpp +++ b/src/coreference_resolver/CoreferencePipe.cpp @@ -1,311 +1,309 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. 
-// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "CoreferencePipe.h" -#include "logval.h" -#include -#include -#include - -// Define the current model version and the oldest back-compatible version. -// The format is AAAA.BBBB.CCCC, e.g., 2 0003 0000 means "2.3.0". -const uint64_t kCoreferenceModelVersion = 200040000; -const uint64_t kOldestCompatibleCoreferenceModelVersion = 200040000; -const uint64_t kCoreferenceModelCheck = 1234567890; - -void CoreferencePipe::SaveModel(FILE* fs) { - bool success; - success = WriteUINT64(fs, kCoreferenceModelCheck); - CHECK(success); - success = WriteUINT64(fs, kCoreferenceModelVersion); - CHECK(success); - token_dictionary_->Save(fs); - dependency_dictionary_->Save(fs); - semantic_dictionary_->Save(fs); - Pipe::SaveModel(fs); -} - -void CoreferencePipe::LoadModel(FILE* fs) { - bool success; - uint64_t model_check; - uint64_t model_version; - success = ReadUINT64(fs, &model_check); - CHECK(success); - CHECK_EQ(model_check, kCoreferenceModelCheck) - << "The model file is too old and not supported anymore."; - success = ReadUINT64(fs, &model_version); - CHECK(success); - CHECK_GE(model_version, kOldestCompatibleCoreferenceModelVersion) - << "The model file is too old and not supported anymore."; - - delete token_dictionary_; - CreateTokenDictionary(); - token_dictionary_->Load(fs); - - delete dependency_dictionary_; - CreateDependencyDictionary(); - dependency_dictionary_->SetTokenDictionary(token_dictionary_); - dependency_dictionary_->Load(fs); - - delete semantic_dictionary_; - CreateSemanticDictionary(); - semantic_dictionary_->SetTokenDictionary(token_dictionary_); - 
semantic_dictionary_->SetDependencyDictionary(dependency_dictionary_); - semantic_dictionary_->Load(fs); - - GetCoreferenceDictionary()->SetTokenDictionary(token_dictionary_); - GetCoreferenceDictionary()->SetDependencyDictionary(dependency_dictionary_); - GetCoreferenceDictionary()->SetSemanticDictionary(semantic_dictionary_); - - Pipe::LoadModel(fs); -} - -void CoreferencePipe::PreprocessData() { - delete token_dictionary_; - CreateTokenDictionary(); - static_cast(token_dictionary_)->Initialize(GetCoreferenceSentenceReader()); - - delete dependency_dictionary_; - CreateDependencyDictionary(); - dependency_dictionary_->SetTokenDictionary(token_dictionary_); - dependency_dictionary_->CreateLabelDictionary(GetCoreferenceSentenceReader()); - - delete semantic_dictionary_; - CreateSemanticDictionary(); - semantic_dictionary_->SetTokenDictionary(token_dictionary_); - semantic_dictionary_->SetDependencyDictionary(dependency_dictionary_); - semantic_dictionary_-> - CreatePredicateRoleDictionaries(GetCoreferenceSentenceReader()); - - GetCoreferenceDictionary()->SetTokenDictionary(token_dictionary_); - GetCoreferenceDictionary()->SetDependencyDictionary(dependency_dictionary_); - GetCoreferenceDictionary()->SetSemanticDictionary(semantic_dictionary_); - GetCoreferenceDictionary()-> - CreateEntityDictionary(GetCoreferenceSentenceReader()); - GetCoreferenceDictionary()-> - CreateConstituentDictionary(GetCoreferenceSentenceReader()); - GetCoreferenceDictionary()-> - CreateWordDictionaries(GetCoreferenceSentenceReader()); - GetCoreferenceDictionary()-> - CreateAncestryDictionaries(GetCoreferenceSentenceReader()); - - GetCoreferenceDictionary()->ReadMentionTags(); - GetCoreferenceDictionary()->ReadPronouns(); - GetCoreferenceDictionary()->ReadDeterminers(); - GetCoreferenceDictionary()->ReadGenderNumberStatistics(); -} - -void CoreferencePipe::ComputeScores(Instance *instance, Parts *parts, - Features *features, - std::vector *scores) { - Pipe::ComputeScores(instance, parts, 
features, scores); -} - -void CoreferencePipe::MakeGradientStep( - Parts *parts, - Features *features, - double eta, - int iteration, - const std::vector &gold_output, - const std::vector &predicted_output) { - Pipe::MakeGradientStep(parts, features, eta, iteration, gold_output, - predicted_output); -} - -void CoreferencePipe::MakeFeatureDifference( - Parts *parts, - Features *features, - const std::vector &gold_output, - const std::vector &predicted_output, - FeatureVector *difference) { - Pipe::MakeFeatureDifference(parts, features, gold_output, predicted_output, - difference); -} - -void CoreferencePipe::TransformGold(Instance *instance, - Parts *parts, - const std::vector &scores, - std::vector *gold_output, - double *loss_inner) { - CoreferenceOptions *options = static_cast(options_); - if (options->train_with_closest_antecedent()) { - *loss_inner = 0.0; - } else { - double log_partition_function_inner; - double entropy_inner; - std::vector copied_scores = scores; - for (int r = 0; r < parts->size(); ++r) { - if ((*gold_output)[r] < 0.5) { - copied_scores[r] = -std::numeric_limits::infinity(); - } else { - CoreferencePartArc *arc = static_cast((*parts)[r]); - //LOG(INFO) << "Part[" << arc->parent_mention() << ", " - // << arc->child_mention() << "] is gold."; - } - } - static_cast(decoder_)-> - DecodeBasicMarginals(instance, parts, copied_scores, gold_output, - &log_partition_function_inner, &entropy_inner); - *loss_inner = entropy_inner; - } -} - -void CoreferencePipe::MakeParts(Instance *instance, - Parts *parts, - std::vector *gold_outputs) { - CoreferenceDocumentNumeric *document = - static_cast(instance); - CoreferenceOptions *options = static_cast(options_); - - CoreferenceParts *coreference_parts = static_cast(parts); - coreference_parts->Initialize(); - bool make_gold = (gold_outputs != NULL); - if (make_gold) gold_outputs->clear(); - - const std::vector &mentions = document->GetMentions(); - //std::set entities; - - // Create arc parts departing from 
the artifical root (non-anaphoric - // mentions). - for (int j = 0; j < mentions.size(); ++j) { - Part *part = coreference_parts->CreatePartArc(-1, j); - coreference_parts->push_back(part); - if (make_gold) { - if (!document->IsMentionAnaphoric(j)) { - gold_outputs->push_back(1.0); - } else { - gold_outputs->push_back(0.0); - } - } - } - - // Create arc parts involving two mentions. - int mention_distance_threshold = -1; //100; // TODO(atm): put this in the options. - for (int j = 0; j < mentions.size(); ++j) { - bool found_closest = false; - for (int k = j + 1; k < mentions.size(); ++k) { - if (mention_distance_threshold >= 0 && - k - j > mention_distance_threshold && - !(make_gold && (mentions[j]->id() >= 0 && - mentions[j]->id() == mentions[k]->id()))) { - continue; - } - Part *part = coreference_parts->CreatePartArc(j, k); - coreference_parts->push_back(part); - if (make_gold) { - if (mentions[j]->id() >= 0 && mentions[j]->id() == mentions[k]->id()) { - //LOG(INFO) << "Found coreferent mentions: " << j << ", " << k; - if (!options->train_with_closest_antecedent() || !found_closest) { - gold_outputs->push_back(1.0); - found_closest = true; - } else { - gold_outputs->push_back(0.0); - } - } else { - gold_outputs->push_back(0.0); - } - } - } - } - - coreference_parts->BuildIndices(mentions.size()); - // Necessary to store this information here for LabelInstance at test time. - coreference_parts->SetMentions(mentions); -} - -void CoreferencePipe::MakeSelectedFeatures( - Instance *instance, - Parts *parts, - const std::vector &selected_parts, - Features *features) { - CoreferenceDocumentNumeric *document = - static_cast(instance); - CoreferenceFeatures *coreference_features = - static_cast(features); - - CoreferenceParts *coreference_parts = static_cast(parts); - const std::vector &mentions = document->GetMentions(); - - coreference_features->Initialize(instance, parts); - - // Build features for coreference arcs. 
- for (int r = 0; r < coreference_parts->size(); ++r) { - CoreferencePartArc *arc = - static_cast((*coreference_parts)[r]); - coreference_features->AddArcFeatures(document, r, arc->parent_mention(), - arc->child_mention()); - } -} - -void CoreferencePipe::LabelInstance(Parts *parts, - const std::vector &output, - Instance *instance) { - CoreferenceDocument *document = - static_cast(instance); - CoreferenceParts *coreference_parts = static_cast(parts); - - const std::vector &mentions = coreference_parts->GetMentions(); - std::vector mention_clusters(mentions.size(), -1); - std::vector > entities; - - double threshold = 0.5; - for (int r = 0; r < coreference_parts->size(); ++r) { - if (output[r] < threshold) continue; - CoreferencePartArc *arc = static_cast((*parts)[r]); - CHECK_EQ(mention_clusters[arc->child_mention()], -1); - if (arc->parent_mention() < 0) { - // Non-anaphoric mention; create its own cluster. - mention_clusters[arc->child_mention()] = entities.size(); - entities.push_back(std::vector(1, arc->child_mention())); - } else { - int k = mention_clusters[arc->parent_mention()]; - mention_clusters[arc->child_mention()] = k; - entities[k].push_back(arc->child_mention()); - } - } - - // Clear gold coreference spans, if any. - for (int i = 0; i < document->GetNumSentences(); ++i) { - CoreferenceSentence *sentence = document->GetSentence(i); - sentence->ClearCoreferenceSpans(); - } - - // Add predicted coreference spans. - int num_entities = 0; - for (int k = 0; k < entities.size(); ++k) { - if (entities[k].size() > 1) { - ++num_entities; - } - } - for (int j = 0; j < mentions.size(); ++j) { - int k = mention_clusters[j]; - if (entities[k].size() > 1) { - // Not a singleton cluster; add coreference span. 
- std::ostringstream ss; - ss << k; - const std::string &name(ss.str()); - NamedSpan span(mentions[j]->start(), mentions[j]->end(), name); - int i = mentions[j]->sentence_index(); - CoreferenceSentence *sentence = document->GetSentence(i); - sentence->AddCoreferenceSpan(span); - } - } - - LOG(INFO) << "Predicted " << num_entities << " entities for " << mentions.size() - << " mentions."; -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "CoreferencePipe.h" +#include "logval.h" +#include +#include +#include + +// Define the current model version and the oldest back-compatible version. +// The format is AAAA.BBBB.CCCC, e.g., 2 0003 0000 means "2.3.0". 
+const uint64_t kCoreferenceModelVersion = 200040000; +const uint64_t kOldestCompatibleCoreferenceModelVersion = 200040000; +const uint64_t kCoreferenceModelCheck = 1234567890; + +void CoreferencePipe::SaveModel(FILE* fs) { + bool success; + success = WriteUINT64(fs, kCoreferenceModelCheck); + CHECK(success); + success = WriteUINT64(fs, kCoreferenceModelVersion); + CHECK(success); + token_dictionary_->Save(fs); + dependency_dictionary_->Save(fs); + semantic_dictionary_->Save(fs); + Pipe::SaveModel(fs); +} + +void CoreferencePipe::LoadModel(FILE* fs) { + bool success; + success = ReadUINT64(fs, &model_check_); + CHECK(success); + CHECK_EQ(model_check_, kCoreferenceModelCheck) + << "The model file is too old and not supported anymore."; + success = ReadUINT64(fs, &model_version_); + CHECK(success); + CHECK_GE(model_version_, kOldestCompatibleCoreferenceModelVersion) + << "The model file is too old and not supported anymore."; + + delete token_dictionary_; + CreateTokenDictionary(); + token_dictionary_->Load(fs); + + delete dependency_dictionary_; + CreateDependencyDictionary(); + dependency_dictionary_->SetTokenDictionary(token_dictionary_); + dependency_dictionary_->Load(fs); + + delete semantic_dictionary_; + CreateSemanticDictionary(); + semantic_dictionary_->SetTokenDictionary(token_dictionary_); + semantic_dictionary_->SetDependencyDictionary(dependency_dictionary_); + semantic_dictionary_->Load(fs); + + GetCoreferenceDictionary()->SetTokenDictionary(token_dictionary_); + GetCoreferenceDictionary()->SetDependencyDictionary(dependency_dictionary_); + GetCoreferenceDictionary()->SetSemanticDictionary(semantic_dictionary_); + + Pipe::LoadModel(fs); +} + +void CoreferencePipe::PreprocessData() { + delete token_dictionary_; + CreateTokenDictionary(); + static_cast(token_dictionary_)->Initialize(GetCoreferenceSentenceReader()); + + delete dependency_dictionary_; + CreateDependencyDictionary(); + dependency_dictionary_->SetTokenDictionary(token_dictionary_); + 
dependency_dictionary_->CreateLabelDictionary(GetCoreferenceSentenceReader()); + + delete semantic_dictionary_; + CreateSemanticDictionary(); + semantic_dictionary_->SetTokenDictionary(token_dictionary_); + semantic_dictionary_->SetDependencyDictionary(dependency_dictionary_); + semantic_dictionary_-> + CreatePredicateRoleDictionaries(GetCoreferenceSentenceReader()); + + GetCoreferenceDictionary()->SetTokenDictionary(token_dictionary_); + GetCoreferenceDictionary()->SetDependencyDictionary(dependency_dictionary_); + GetCoreferenceDictionary()->SetSemanticDictionary(semantic_dictionary_); + GetCoreferenceDictionary()-> + CreateEntityDictionary(GetCoreferenceSentenceReader()); + GetCoreferenceDictionary()-> + CreateConstituentDictionary(GetCoreferenceSentenceReader()); + GetCoreferenceDictionary()-> + CreateWordDictionaries(GetCoreferenceSentenceReader()); + GetCoreferenceDictionary()-> + CreateAncestryDictionaries(GetCoreferenceSentenceReader()); + + GetCoreferenceDictionary()->ReadMentionTags(); + GetCoreferenceDictionary()->ReadPronouns(); + GetCoreferenceDictionary()->ReadDeterminers(); + GetCoreferenceDictionary()->ReadGenderNumberStatistics(); +} + +void CoreferencePipe::ComputeScores(Instance *instance, Parts *parts, + Features *features, + std::vector *scores) { + Pipe::ComputeScores(instance, parts, features, scores); +} + +void CoreferencePipe::MakeGradientStep( + Parts *parts, + Features *features, + double eta, + int iteration, + const std::vector &gold_output, + const std::vector &predicted_output) { + Pipe::MakeGradientStep(parts, features, eta, iteration, gold_output, + predicted_output); +} + +void CoreferencePipe::MakeFeatureDifference( + Parts *parts, + Features *features, + const std::vector &gold_output, + const std::vector &predicted_output, + FeatureVector *difference) { + Pipe::MakeFeatureDifference(parts, features, gold_output, predicted_output, + difference); +} + +void CoreferencePipe::TransformGold(Instance *instance, + Parts *parts, + 
const std::vector &scores, + std::vector *gold_output, + double *loss_inner) { + CoreferenceOptions *options = static_cast(options_); + if (options->train_with_closest_antecedent()) { + *loss_inner = 0.0; + } else { + double log_partition_function_inner; + double entropy_inner; + std::vector copied_scores = scores; + for (int r = 0; r < parts->size(); ++r) { + if ((*gold_output)[r] < 0.5) { + copied_scores[r] = -std::numeric_limits::infinity(); + } else { + CoreferencePartArc *arc = static_cast((*parts)[r]); + //LOG(INFO) << "Part[" << arc->parent_mention() << ", " + // << arc->child_mention() << "] is gold."; + } + } + static_cast(decoder_)-> + DecodeBasicMarginals(instance, parts, copied_scores, gold_output, + &log_partition_function_inner, &entropy_inner); + *loss_inner = entropy_inner; + } +} + +void CoreferencePipe::MakeParts(Instance *instance, + Parts *parts, + std::vector *gold_outputs) { + CoreferenceDocumentNumeric *document = + static_cast(instance); + CoreferenceOptions *options = static_cast(options_); + + CoreferenceParts *coreference_parts = static_cast(parts); + coreference_parts->Initialize(); + bool make_gold = (gold_outputs != NULL); + if (make_gold) gold_outputs->clear(); + + const std::vector &mentions = document->GetMentions(); + //std::set entities; + + // Create arc parts departing from the artifical root (non-anaphoric + // mentions). + for (int j = 0; j < mentions.size(); ++j) { + Part *part = coreference_parts->CreatePartArc(-1, j); + coreference_parts->push_back(part); + if (make_gold) { + if (!document->IsMentionAnaphoric(j)) { + gold_outputs->push_back(1.0); + } else { + gold_outputs->push_back(0.0); + } + } + } + + // Create arc parts involving two mentions. + int mention_distance_threshold = -1; //100; // TODO(atm): put this in the options. 
+ for (int j = 0; j < mentions.size(); ++j) { + bool found_closest = false; + for (int k = j + 1; k < mentions.size(); ++k) { + if (mention_distance_threshold >= 0 && + k - j > mention_distance_threshold && + !(make_gold && (mentions[j]->id() >= 0 && + mentions[j]->id() == mentions[k]->id()))) { + continue; + } + Part *part = coreference_parts->CreatePartArc(j, k); + coreference_parts->push_back(part); + if (make_gold) { + if (mentions[j]->id() >= 0 && mentions[j]->id() == mentions[k]->id()) { + //LOG(INFO) << "Found coreferent mentions: " << j << ", " << k; + if (!options->train_with_closest_antecedent() || !found_closest) { + gold_outputs->push_back(1.0); + found_closest = true; + } else { + gold_outputs->push_back(0.0); + } + } else { + gold_outputs->push_back(0.0); + } + } + } + } + + coreference_parts->BuildIndices((int)mentions.size()); + // Necessary to store this information here for LabelInstance at test time. + coreference_parts->SetMentions(mentions); +} + +void CoreferencePipe::MakeSelectedFeatures( + Instance *instance, + Parts *parts, + const std::vector &selected_parts, + Features *features) { + CoreferenceDocumentNumeric *document = + static_cast(instance); + CoreferenceFeatures *coreference_features = + static_cast(features); + + CoreferenceParts *coreference_parts = static_cast(parts); + const std::vector &mentions = document->GetMentions(); + + coreference_features->Initialize(instance, parts); + + // Build features for coreference arcs. 
+ for (int r = 0; r < coreference_parts->size(); ++r) { + CoreferencePartArc *arc = + static_cast((*coreference_parts)[r]); + coreference_features->AddArcFeatures(document, r, arc->parent_mention(), + arc->child_mention()); + } +} + +void CoreferencePipe::LabelInstance(Parts *parts, + const std::vector &output, + Instance *instance) { + CoreferenceDocument *document = + static_cast(instance); + CoreferenceParts *coreference_parts = static_cast(parts); + + const std::vector &mentions = coreference_parts->GetMentions(); + std::vector mention_clusters(mentions.size(), -1); + std::vector > entities; + + double threshold = 0.5; + for (int r = 0; r < coreference_parts->size(); ++r) { + if (output[r] < threshold) continue; + CoreferencePartArc *arc = static_cast((*parts)[r]); + CHECK_EQ(mention_clusters[arc->child_mention()], -1); + if (arc->parent_mention() < 0) { + // Non-anaphoric mention; create its own cluster. + mention_clusters[arc->child_mention()] = (int)entities.size(); + entities.push_back(std::vector(1, arc->child_mention())); + } else { + int k = mention_clusters[arc->parent_mention()]; + mention_clusters[arc->child_mention()] = k; + entities[k].push_back(arc->child_mention()); + } + } + + // Clear gold coreference spans, if any. + for (int i = 0; i < document->GetNumSentences(); ++i) { + CoreferenceSentence *sentence = document->GetSentence(i); + sentence->ClearCoreferenceSpans(); + } + + // Add predicted coreference spans. + int num_entities = 0; + for (int k = 0; k < entities.size(); ++k) { + if (entities[k].size() > 1) { + ++num_entities; + } + } + for (int j = 0; j < mentions.size(); ++j) { + int k = mention_clusters[j]; + if (entities[k].size() > 1) { + // Not a singleton cluster; add coreference span. 
+ std::ostringstream ss; + ss << k; + const std::string &name(ss.str()); + NamedSpan span(mentions[j]->start(), mentions[j]->end(), name); + int i = mentions[j]->sentence_index(); + CoreferenceSentence *sentence = document->GetSentence(i); + sentence->AddCoreferenceSpan(span); + } + } + + LOG(INFO) << "Predicted " << num_entities << " entities for " << mentions.size() + << " mentions."; +} diff --git a/src/coreference_resolver/CoreferencePipe.h b/src/coreference_resolver/CoreferencePipe.h index 8805603..4f71d27 100644 --- a/src/coreference_resolver/CoreferencePipe.h +++ b/src/coreference_resolver/CoreferencePipe.h @@ -1,175 +1,174 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef COREFERENCEPIPE_H_ -#define COREFERENCEPIPE_H_ - -#include "Pipe.h" -#include "CoreferenceOptions.h" -#include "CoreferenceReader.h" -#include "CoreferenceDictionary.h" -#include "TokenDictionary.h" -#include "CoreferenceDocumentNumeric.h" -#include "CoreferenceWriter.h" -#include "CoreferencePart.h" -#include "CoreferenceFeatures.h" -#include "CoreferenceDecoder.h" - -class CoreferencePipe : public Pipe { -public: - CoreferencePipe(Options* options) : Pipe(options) { - token_dictionary_ = NULL; - dependency_dictionary_ = NULL; - semantic_dictionary_ = NULL; - } - virtual ~CoreferencePipe() { - delete token_dictionary_; - delete dependency_dictionary_; - delete semantic_dictionary_; - } - - CoreferenceOptions *GetCoreferenceOptions() { - return static_cast(options_); - }; - CoreferenceReader *GetCoreferenceReader() { - return static_cast(reader_); - }; - CoreferenceSentenceReader *GetCoreferenceSentenceReader() { - return GetCoreferenceReader()->GetSentenceReader(); - }; - CoreferenceDictionary *GetCoreferenceDictionary() { - return static_cast(dictionary_); - }; - SemanticDictionary *GetSemanticDictionary() { - return static_cast(semantic_dictionary_); - }; - DependencyDictionary *GetDependencyDictionary() { - return static_cast(dependency_dictionary_); - }; - -protected: - void CreateDictionary() { - dictionary_ = new CoreferenceDictionary(this); - GetCoreferenceDictionary()->SetTokenDictionary(token_dictionary_); - GetCoreferenceDictionary()->SetDependencyDictionary(dependency_dictionary_); - GetCoreferenceDictionary()->SetSemanticDictionary(semantic_dictionary_); - } - void CreateReader() { reader_ = new CoreferenceReader(options_); } - void CreateWriter() { writer_ = new CoreferenceWriter(options_); } - void CreateDecoder() { decoder_ = new CoreferenceDecoder(this); } - Parts *CreateParts() { return new CoreferenceParts; } - Features *CreateFeatures() { return new CoreferenceFeatures(this); } - - void CreateTokenDictionary() { - token_dictionary_ = 
new TokenDictionary(this); - } - - void CreateDependencyDictionary() { - dependency_dictionary_ = new DependencyDictionary(this); - } - - void CreateSemanticDictionary() { - semantic_dictionary_ = new SemanticDictionary(this); - } - - void PreprocessData(); - - Instance *GetFormattedInstance(Instance *instance) { - CoreferenceDocumentNumeric *instance_numeric = - new CoreferenceDocumentNumeric; - // Only add gold mentions as candidates if we're training. - bool add_gold_mentions = options_->train(); - instance_numeric->Initialize(*GetCoreferenceDictionary(), - static_cast(instance), - add_gold_mentions); - return instance_numeric; - } - -protected: - void SaveModel(FILE* fs); - void LoadModel(FILE* fs); - - void MakeParts(Instance *instance, Parts *parts, - std::vector *gold_outputs); - - void MakeSelectedFeatures(Instance *instance, Parts *parts, - const std::vector &selected_parts, - Features *features); - - void ComputeScores(Instance *instance, Parts *parts, Features *features, - std::vector *scores); - - void MakeFeatureDifference(Parts *parts, - Features *features, - const std::vector &gold_output, - const std::vector &predicted_output, - FeatureVector *difference); - - void MakeGradientStep(Parts *parts, - Features *features, - double eta, - int iteration, - const std::vector &gold_output, - const std::vector &predicted_output); - - void TransformGold(Instance *instance, - Parts *parts, - const std::vector &scores, - std::vector *gold_output, - double *loss_inner); - - void LabelInstance(Parts *parts, const std::vector &output, - Instance *instance); - - void BeginEvaluation() { +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef COREFERENCEPIPE_H_ +#define COREFERENCEPIPE_H_ + +#include "Pipe.h" +#include "TimeUtils.h" +#include "CoreferenceOptions.h" +#include "CoreferenceReader.h" +#include "CoreferenceDictionary.h" +#include "TokenDictionary.h" +#include "CoreferenceDocumentNumeric.h" +#include "CoreferenceWriter.h" +#include "CoreferencePart.h" +#include "CoreferenceFeatures.h" +#include "CoreferenceDecoder.h" + +class CoreferencePipe : public Pipe { +public: + CoreferencePipe(Options* options) : Pipe(options) { + token_dictionary_ = NULL; + dependency_dictionary_ = NULL; + semantic_dictionary_ = NULL; + } + virtual ~CoreferencePipe() { + delete token_dictionary_; + delete dependency_dictionary_; + delete semantic_dictionary_; + } + + CoreferenceOptions *GetCoreferenceOptions() { + return static_cast(options_); + }; + CoreferenceReader *GetCoreferenceReader() { + return static_cast(reader_); + }; + CoreferenceSentenceReader *GetCoreferenceSentenceReader() { + return GetCoreferenceReader()->GetSentenceReader(); + }; + CoreferenceDictionary *GetCoreferenceDictionary() { + return static_cast(dictionary_); + }; + SemanticDictionary *GetSemanticDictionary() { + return static_cast(semantic_dictionary_); + }; + DependencyDictionary *GetDependencyDictionary() { + return static_cast(dependency_dictionary_); + }; + +protected: + void CreateDictionary() { + dictionary_ = new CoreferenceDictionary(this); + GetCoreferenceDictionary()->SetTokenDictionary(token_dictionary_); + GetCoreferenceDictionary()->SetDependencyDictionary(dependency_dictionary_); + 
GetCoreferenceDictionary()->SetSemanticDictionary(semantic_dictionary_); + } + void CreateReader() { reader_ = new CoreferenceReader(options_); } + void CreateWriter() { writer_ = new CoreferenceWriter(options_); } + void CreateDecoder() { decoder_ = new CoreferenceDecoder(this); } + Parts *CreateParts() { return new CoreferenceParts; } + Features *CreateFeatures() { return new CoreferenceFeatures(this); } + + void CreateTokenDictionary() { + token_dictionary_ = new TokenDictionary(this); + } + + void CreateDependencyDictionary() { + dependency_dictionary_ = new DependencyDictionary(this); + } + + void CreateSemanticDictionary() { + semantic_dictionary_ = new SemanticDictionary(this); + } + + void PreprocessData(); + + Instance *GetFormattedInstance(Instance *instance) { + CoreferenceDocumentNumeric *instance_numeric = + new CoreferenceDocumentNumeric; + // Only add gold mentions as candidates if we're training. + bool add_gold_mentions = options_->train(); + instance_numeric->Initialize(*GetCoreferenceDictionary(), + static_cast(instance), + add_gold_mentions); + return instance_numeric; + } + +protected: + void SaveModel(FILE* fs); + void LoadModel(FILE* fs); + + void MakeParts(Instance *instance, Parts *parts, + std::vector *gold_outputs); + + void MakeSelectedFeatures(Instance *instance, Parts *parts, + const std::vector &selected_parts, + Features *features); + + void ComputeScores(Instance *instance, Parts *parts, Features *features, + std::vector *scores); + + void MakeFeatureDifference(Parts *parts, + Features *features, + const std::vector &gold_output, + const std::vector &predicted_output, + FeatureVector *difference); + + void MakeGradientStep(Parts *parts, + Features *features, + double eta, + int iteration, + const std::vector &gold_output, + const std::vector &predicted_output); + + void TransformGold(Instance *instance, + Parts *parts, + const std::vector &scores, + std::vector *gold_output, + double *loss_inner); + + void LabelInstance(Parts 
*parts, const std::vector &output, + Instance *instance); + + void BeginEvaluation() { num_tokens_ = 0; - gettimeofday(&start_clock_, NULL); - } - - void EvaluateInstance(Instance *instance, - Instance *output_instance, - Parts *parts, - const vector &gold_outputs, - const vector &predicted_outputs) { - CoreferenceDocument *document = - static_cast(instance); - for (int i = 0; i < document->GetNumSentences(); ++i) { - num_tokens_ += document->GetSentence(i)->size() - 1; - } - } - + chrono.GetTime(); + } + + void EvaluateInstance(Instance *instance, + Instance *output_instance, + Parts *parts, + const vector &gold_outputs, + const vector &predicted_outputs) { + CoreferenceDocument *document = + static_cast(instance); + for (int i = 0; i < document->GetNumSentences(); ++i) { + num_tokens_ += document->GetSentence(i)->size() - 1; + } + } + void EndEvaluation() { - timeval end_clock; - gettimeofday(&end_clock, NULL); - double num_seconds = - static_cast(diff_ms(end_clock, start_clock_)) / 1000.0; - double tokens_per_second = static_cast(num_tokens_) / num_seconds; - LOG(INFO) << "Speed: " - << tokens_per_second << " tokens per second."; - } - -protected: - TokenDictionary *token_dictionary_; - DependencyDictionary *dependency_dictionary_; - SemanticDictionary *semantic_dictionary_; - //int num_tag_mistakes_; + chrono.StopTime(); + double num_seconds = chrono.GetElapsedTime(); + double tokens_per_second = static_cast(num_tokens_) / num_seconds; + LOG(INFO) << "Speed: " + << tokens_per_second << " tokens per second."; + } + +protected: + TokenDictionary *token_dictionary_; + DependencyDictionary *dependency_dictionary_; + SemanticDictionary *semantic_dictionary_; + //int num_tag_mistakes_; int num_tokens_; - timeval start_clock_; -}; - -#endif /* COREFERENCEPIPE_H_ */ + chronowrap::Chronometer chrono; +}; + +#endif /* COREFERENCEPIPE_H_ */ diff --git a/src/coreference_resolver/CoreferencePronoun.h b/src/coreference_resolver/CoreferencePronoun.h index 14c7084..76768e0 
100644 --- a/src/coreference_resolver/CoreferencePronoun.h +++ b/src/coreference_resolver/CoreferencePronoun.h @@ -1,204 +1,204 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef COREFERENCEPRONOUN_H_ -#define COREFERENCEPRONOUN_H_ - -struct CoreferencePronounPerson { - enum types { - FIRST = 0, - SECOND, - THIRD, - UNDEFINED, - COUNT - }; -}; - -struct CoreferencePronounNumber { - enum types { - SINGULAR = 0, - PLURAL, - UNDEFINED, - COUNT - }; -}; - -struct CoreferencePronounGender { - enum types { - MALE = 0, - FEMALE, - NEUTRAL, - UNDEFINED, - COUNT - }; -}; - -class CoreferencePronoun { -public: - CoreferencePronoun() { ClearFlags(); } - CoreferencePronoun(const std::string &code_flags) { SetFlags(code_flags); } - virtual ~CoreferencePronoun() {} - - void Save(FILE *fs) { - bool success; - success = WriteUINT8(fs, person_flag_); - CHECK(success); - success = WriteUINT8(fs, number_flag_); - CHECK(success); - success = WriteUINT8(fs, gender_flag_); - CHECK(success); - } - - void Load(FILE *fs) { - bool success; - success = ReadUINT8(fs, &person_flag_); - CHECK(success); - success = ReadUINT8(fs, &number_flag_); - CHECK(success); - success = ReadUINT8(fs, &gender_flag_); - CHECK(success); - } - - uint8_t person_flag() { return person_flag_; 
} - uint8_t number_flag() { return number_flag_; } - uint8_t gender_flag() { return gender_flag_; } - - void ClearFlags() { - person_flag_ = 0x0; - number_flag_ = 0x0; - gender_flag_ = 0x0; - } - - void SetFlags(const std::string &code_flags) { - CHECK_EQ(code_flags.length(), 3); - - ClearFlags(); - char ch = code_flags[0]; // Person flag. - if (ch == '1') { - SetPersonFirst(); - } else if (ch == '2') { - SetPersonSecond(); - } else if (ch == '3') { - SetPersonThird(); - } else if (ch == 'x') { - SetPersonUndefined(); - } else { - CHECK(false) << "Invalid person flag: " << ch; - } - - ch = code_flags[1]; // Number flag. - if (ch == 's') { - SetNumberSingular(); - } else if (ch == 'p') { - SetNumberPlural(); - } else if (ch == 'x') { - SetNumberUndefined(); - } else { - CHECK(false) << "Invalid number flag: " << ch; - } - - ch = code_flags[2]; // Gender flag. - if (ch == 'm') { - SetGenderMale(); - } else if (ch == 'f') { - SetGenderFemale(); - } else if (ch == 'n') { - SetGenderNeutral(); - } else if (ch == 'x') { - SetGenderUndefined(); - } else { - CHECK(false) << "Invalid gender flag: " << ch; - } - } - -public: - bool IsPersonFirst() { - return person_flag_ & (0x1 << CoreferencePronounPerson::FIRST); - } - bool IsPersonSecond() { - return person_flag_ & (0x1 << CoreferencePronounPerson::SECOND); - } - bool IsPersonThird() { - return person_flag_ & (0x1 << CoreferencePronounPerson::THIRD); - } - bool IsPersonUndefined() { - return person_flag_ & (0x1 << CoreferencePronounPerson::UNDEFINED); - } - bool IsNumberSingular() { - return number_flag_ & (0x1 << CoreferencePronounNumber::SINGULAR); - } - bool IsNumberPlural() { - return number_flag_ & (0x1 << CoreferencePronounNumber::PLURAL); - } - bool IsNumberUndefined() { - return number_flag_ & (0x1 << CoreferencePronounNumber::UNDEFINED); - } - bool IsGenderMale() { - return gender_flag_ & (0x1 << CoreferencePronounGender::MALE); - } - bool IsGenderFemale() { - return gender_flag_ & (0x1 << 
CoreferencePronounGender::FEMALE); - } - bool IsGenderNeutral() { - return gender_flag_ & (0x1 << CoreferencePronounGender::NEUTRAL); - } - bool IsGenderUndefined() { - return gender_flag_ & (0x1 << CoreferencePronounGender::UNDEFINED); - } - - void SetPersonFirst() { - person_flag_ |= (0x1 << CoreferencePronounPerson::FIRST); - } - void SetPersonSecond() { - person_flag_ |= (0x1 << CoreferencePronounPerson::SECOND); - } - void SetPersonThird() { - person_flag_ |= (0x1 << CoreferencePronounPerson::THIRD); - } - void SetPersonUndefined() { - person_flag_ |= (0x1 << CoreferencePronounPerson::UNDEFINED); - } - void SetNumberSingular() { - number_flag_ |= (0x1 << CoreferencePronounNumber::SINGULAR); - } - void SetNumberPlural() { - number_flag_ |= (0x1 << CoreferencePronounNumber::PLURAL); - } - void SetNumberUndefined() { - number_flag_ |= (0x1 << CoreferencePronounNumber::UNDEFINED); - } - void SetGenderMale() { - gender_flag_ |= (0x1 << CoreferencePronounGender::MALE); - } - void SetGenderFemale() { - gender_flag_ |= (0x1 << CoreferencePronounGender::FEMALE); - } - void SetGenderNeutral() { - gender_flag_ |= (0x1 << CoreferencePronounGender::NEUTRAL); - } - void SetGenderUndefined() { - gender_flag_ |= (0x1 << CoreferencePronounGender::UNDEFINED); - } - -protected: - uint8_t person_flag_; - uint8_t number_flag_; - uint8_t gender_flag_; -}; - -#endif /* COREFERENCEPRONOUN_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef COREFERENCEPRONOUN_H_ +#define COREFERENCEPRONOUN_H_ + +struct CoreferencePronounPerson { + enum types { + FIRST = 0, + SECOND, + THIRD, + UNDEFINED, + COUNT + }; +}; + +struct CoreferencePronounNumber { + enum types { + SINGULAR = 0, + PLURAL, + UNDEFINED, + COUNT + }; +}; + +struct CoreferencePronounGender { + enum types { + MALE = 0, + FEMALE, + NEUTRAL, + UNDEFINED, + COUNT + }; +}; + +class CoreferencePronoun { +public: + CoreferencePronoun() { ClearFlags(); } + CoreferencePronoun(const std::string &code_flags) { SetFlags(code_flags); } + virtual ~CoreferencePronoun() {} + + void Save(FILE *fs) { + bool success; + success = WriteUINT8(fs, person_flag_); + CHECK(success); + success = WriteUINT8(fs, number_flag_); + CHECK(success); + success = WriteUINT8(fs, gender_flag_); + CHECK(success); + } + + void Load(FILE *fs) { + bool success; + success = ReadUINT8(fs, &person_flag_); + CHECK(success); + success = ReadUINT8(fs, &number_flag_); + CHECK(success); + success = ReadUINT8(fs, &gender_flag_); + CHECK(success); + } + + uint8_t person_flag() { return person_flag_; } + uint8_t number_flag() { return number_flag_; } + uint8_t gender_flag() { return gender_flag_; } + + void ClearFlags() { + person_flag_ = 0x0; + number_flag_ = 0x0; + gender_flag_ = 0x0; + } + + void SetFlags(const std::string &code_flags) { + CHECK_EQ(code_flags.length(), 3); + + ClearFlags(); + char ch = code_flags[0]; // Person flag. + if (ch == '1') { + SetPersonFirst(); + } else if (ch == '2') { + SetPersonSecond(); + } else if (ch == '3') { + SetPersonThird(); + } else if (ch == 'x') { + SetPersonUndefined(); + } else { + CHECK(false) << "Invalid person flag: " << ch; + } + + ch = code_flags[1]; // Number flag. 
+ if (ch == 's') { + SetNumberSingular(); + } else if (ch == 'p') { + SetNumberPlural(); + } else if (ch == 'x') { + SetNumberUndefined(); + } else { + CHECK(false) << "Invalid number flag: " << ch; + } + + ch = code_flags[2]; // Gender flag. + if (ch == 'm') { + SetGenderMale(); + } else if (ch == 'f') { + SetGenderFemale(); + } else if (ch == 'n') { + SetGenderNeutral(); + } else if (ch == 'x') { + SetGenderUndefined(); + } else { + CHECK(false) << "Invalid gender flag: " << ch; + } + } + +public: + bool IsPersonFirst() { + return ( person_flag_ & (0x1 << CoreferencePronounPerson::FIRST) ) != 0; + } + bool IsPersonSecond() { + return (person_flag_ & (0x1 << CoreferencePronounPerson::SECOND) ) != 0; + } + bool IsPersonThird() { + return (person_flag_ & (0x1 << CoreferencePronounPerson::THIRD) ) != 0; + } + bool IsPersonUndefined() { + return (person_flag_ & (0x1 << CoreferencePronounPerson::UNDEFINED) ) != 0; + } + bool IsNumberSingular() { + return (number_flag_ & (0x1 << CoreferencePronounNumber::SINGULAR) ) != 0; + } + bool IsNumberPlural() { + return (number_flag_ & (0x1 << CoreferencePronounNumber::PLURAL) ) != 0; + } + bool IsNumberUndefined() { + return (number_flag_ & (0x1 << CoreferencePronounNumber::UNDEFINED) ) != 0; + } + bool IsGenderMale() { + return (gender_flag_ & (0x1 << CoreferencePronounGender::MALE) ) != 0; + } + bool IsGenderFemale() { + return (gender_flag_ & (0x1 << CoreferencePronounGender::FEMALE) ) != 0; + } + bool IsGenderNeutral() { + return (gender_flag_ & (0x1 << CoreferencePronounGender::NEUTRAL) ) != 0; + } + bool IsGenderUndefined() { + return (gender_flag_ & (0x1 << CoreferencePronounGender::UNDEFINED) ) != 0; + } + + void SetPersonFirst() { + person_flag_ |= (0x1 << CoreferencePronounPerson::FIRST); + } + void SetPersonSecond() { + person_flag_ |= (0x1 << CoreferencePronounPerson::SECOND); + } + void SetPersonThird() { + person_flag_ |= (0x1 << CoreferencePronounPerson::THIRD); + } + void SetPersonUndefined() { + person_flag_ |= 
(0x1 << CoreferencePronounPerson::UNDEFINED); + } + void SetNumberSingular() { + number_flag_ |= (0x1 << CoreferencePronounNumber::SINGULAR); + } + void SetNumberPlural() { + number_flag_ |= (0x1 << CoreferencePronounNumber::PLURAL); + } + void SetNumberUndefined() { + number_flag_ |= (0x1 << CoreferencePronounNumber::UNDEFINED); + } + void SetGenderMale() { + gender_flag_ |= (0x1 << CoreferencePronounGender::MALE); + } + void SetGenderFemale() { + gender_flag_ |= (0x1 << CoreferencePronounGender::FEMALE); + } + void SetGenderNeutral() { + gender_flag_ |= (0x1 << CoreferencePronounGender::NEUTRAL); + } + void SetGenderUndefined() { + gender_flag_ |= (0x1 << CoreferencePronounGender::UNDEFINED); + } + +protected: + uint8_t person_flag_; + uint8_t number_flag_; + uint8_t gender_flag_; +}; + +#endif /* COREFERENCEPRONOUN_H_ */ diff --git a/src/coreference_resolver/CoreferenceReader.cpp b/src/coreference_resolver/CoreferenceReader.cpp index ed44d8c..8fd7855 100644 --- a/src/coreference_resolver/CoreferenceReader.cpp +++ b/src/coreference_resolver/CoreferenceReader.cpp @@ -1,408 +1,408 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#include "CoreferenceReader.h" -#include "CoreferenceOptions.h" -#include "Utils.h" -#include -#include -#include - -Instance *CoreferenceReader::GetNext() { - int num_sentences = 0; - std::string line; - std::string name = ""; - int part_number = 0; - bool found_begin = false; - bool found_end = false; - if (is_.is_open()) { - while (!is_.eof()) { - getline(is_, line); - // E.g. "#begin document (nw/wsj/02/wsj_0242); part 000" - if (0 == line.substr(0, 6).compare("#begin")) { - name = line; - part_number = 0; - // Extract name. - size_t start = line.find("("); - size_t end = line.find(")"); - CHECK_NE(start, std::string::npos); - CHECK_NE(end, std::string::npos); - CHECK_LT(start, end); - CHECK_LT(end + 1, line.size()); - CHECK_EQ(line[end + 1], ';'); - name = line.substr(start + 1, end - start - 1); // Document id. - // Extract part number. - start = line.find("part "); - CHECK_NE(start, std::string::npos); - start += 5; - end = line.find_first_not_of("0123456789", start); - if (end == std::string::npos) end = line.length(); - CHECK_LT(start, end); - std::string part_name = line.substr(start, end - start); - std::stringstream ss(part_name); - ss >> part_number; // Document part number. - - //LOG(INFO) << "Document: " << name << " Part: " << part_number; - CHECK(!found_begin); - CHECK(!found_end); - found_begin = true; - } else if (0 == line.substr(0, 4).compare("#end")) { - // End of document. - CHECK(found_begin); - CHECK(!found_end); - found_end = true; - break; - } else if (line.length() == 0) { - // End of sentence; update counter. - if (found_begin) ++num_sentences; - } - } - } - - // Now read all the sentences in this document. - std::vector sentences; - for (int i = 0; i < num_sentences; ++i) { - // Start reading sentences. - //LOG(INFO) << "Reading sentence #" << i << "..."; - CoreferenceSentence *sentence = - static_cast(sentence_reader_.GetNext()); - CHECK(sentence); - sentences.push_back(sentence); - } - - // Create document instance. 
- if (num_sentences == 0 && is_.is_open() && is_.eof()) return NULL; - - //LOG(INFO) << "Found " << num_sentences << " sentences in document " << name << "."; - - CoreferenceDocument *instance = new CoreferenceDocument; - instance->Initialize(name, part_number, sentences); - - return static_cast(instance); -} - -Instance *CoreferenceSentenceReader::GetNext() { - // Fill all fields for the entire sentence. - std::string name = ""; - //int part_number = 0; - std::vector > sentence_fields; - std::string line; - - if (is_.is_open()) { - while (!is_.eof()) { - getline(is_, line); - if (line.length() <= 0) break; - if (0 == line.substr(0, 1).compare("#")) { - continue; - } - //LOG(INFO) << line; - std::vector fields; - StringSplit(line, "\t ", &fields, true); - sentence_fields.push_back(fields); - } - } - - bool read_next_sentence = false; - if (!is_.eof()) read_next_sentence = true; - - // Sentence length. - int length = sentence_fields.size(); - - //LOG(INFO) << "Sentence has length " << length; - - // Convert to array of forms, lemmas, etc. - // Note: the first token is the root symbol. - std::vector forms(length + 1); - std::vector lemmas(length + 1); - std::vector cpos(length + 1); - std::vector pos(length + 1); - std::vector > feats(length + 1); - std::vector deprels(length + 1); - std::vector heads(length + 1); - std::vector predicate_names; // Names of predicates (e.g. "take.01"). - std::vector predicate_indices; // Positions of each predicate in the sentence. - std::vector > argument_roles; // Semantic roles. - std::vector > argument_indices; // Positions of each argument. 
- std::vector parse_info(length + 1); - std::vector author_info(length + 1); - std::vector entity_info(length + 1); - std::vector coreference_info(length + 1); - - forms[0] = "_root_"; - lemmas[0] = "_root_"; - cpos[0] = "_root_"; - pos[0] = "_root_"; - deprels[0] = "_root_"; - heads[0] = -1; - feats[0] = std::vector(1, "_root_"); - parse_info[0] = "*"; - author_info[0] = "__"; - entity_info[0] = "*"; - coreference_info[0] = "-"; - - int num_predicates = 0; - for (int i = 0; i < length; i++) { - const vector &info = sentence_fields[i]; - - int offset = 0; - if (i == 0) { - name = info[offset] + "\t" + info[offset + 1]; // Document name and part. - } - offset += 3; - - // Use splitted forms. - forms[i + 1] = info[offset]; - lemmas[i + 1] = "_"; - ++offset; - cpos[i + 1] = info[offset]; - pos[i + 1] = info[offset]; // No distiction between pos and cpos. - ++offset; - parse_info[i + 1] = info[offset]; // Parse span (e.g. "(NP (NN *") - ++offset; - - // No morpho-syntactic information. - feats[i + 1].clear(); - - // Dependency syntactic information. - if (false) { //!semantic_options->use_dependency_syntactic_features()) { - heads[i + 1] = 0; - deprels[i + 1] = "NULL"; - } else { - std::stringstream ss(info[offset]); - ++offset; - ss >> heads[i + 1]; - //++heads[i+1]; // Note: heads start at -1 here! - deprels[i + 1] = info[offset]; - ++offset; - - if (heads[i + 1] < 0 || heads[i + 1] > length) { - LOG(INFO) << "Invalid value of head (" << heads[i + 1] - << ") not in range [0.." << length - << "] - attaching to the root."; - heads[i + 1] = 0; - } - } - - // Semantic role labeling information. - bool is_predicate = false; - std::string predicate_name = info[offset]; - ++offset; - std::string predicate_sense = info[offset]; - ++offset; - // Added the comparison below due to the CONLL 2012 data. - if (0 != predicate_sense.compare("_") && - 0 != predicate_sense.compare("-")) { - predicate_name += "." + predicate_sense; // Predicate lemma+sense in PropBank. 
- is_predicate = true; - } - // if (0 != predicate_name.compare("_") && - // 0 != predicate_name.compare("-")) { - //} - - std::string word_sense = info[offset]; - ++offset; - author_info[i + 1] = info[offset]; - ++offset; - entity_info[i + 1] = info[offset]; - ++offset; - - if (i == 0) { - // Allocate space for predicates. - num_predicates = info.size() - 1 - offset; - //LOG(INFO) << num_predicates; - // Top nodes will be considered arguments of a special root node. - if (use_top_nodes_) ++num_predicates; - //LOG(INFO) << num_predicates; - predicate_names.resize(num_predicates); - predicate_indices.resize(num_predicates); - argument_roles.resize(num_predicates); - argument_indices.resize(num_predicates); - num_predicates = 0; - if (use_top_nodes_) { - predicate_names[num_predicates] = "__ROOT__"; - predicate_indices[num_predicates] = 0; - ++num_predicates; - } - } - - if (is_predicate) { - //LOG(INFO) << predicate_name; - CHECK_LT(num_predicates, predicate_names.size()); - predicate_names[num_predicates] = predicate_name; - predicate_indices[num_predicates] = i + 1; - ++num_predicates; - } - - // Note: this is assuming arguments are encoded as dependents. - // However, in some datasets (e.g. Ontonotes) arguments are encoded as - // spans. - for (int j = offset; j < info.size() - 1; ++j) { - string argument_role = info[j]; - bool is_argument = false; - if (0 != argument_role.compare("_")) is_argument = true; - if (is_argument) { - int k = j - offset; - if (use_top_nodes_) ++k; - argument_roles[k].push_back(argument_role); - argument_indices[k].push_back(i + 1); - } - } - offset = info.size() - 1; - - // Add coreference information. - coreference_info[i + 1] = info[offset]; - ++offset; - } - - CHECK_EQ(num_predicates, predicate_names.size()); - - // TODO: Create spans... 
- std::vector entity_spans; - std::vector constituent_spans; - std::vector coreference_spans; - - ConstructSpansFromText(entity_info, &entity_spans); - ConstructSpansFromText(parse_info, &constituent_spans); - ConstructCoreferenceSpansFromText(coreference_info, &coreference_spans); - -#if 0 - for (int k = 0; k < entity_spans.size(); ++k) { - LOG(INFO) << "Entity " << entity_spans[k]->name() - << "(" << entity_spans[k]->start() - << "," << entity_spans[k]->end() - << ")"; - } - for (int k = 0; k < constituent_spans.size(); ++k) { - LOG(INFO) << "Constituent " << constituent_spans[k]->name() - << "(" << constituent_spans[k]->start() - << "," << constituent_spans[k]->end() - << ")"; - } - for (int k = 0; k < coreference_spans.size(); ++k) { - LOG(INFO) << "Mention " << coreference_spans[k]->name() - << " (" << coreference_spans[k]->start() - << "," << coreference_spans[k]->end() - << ")"; - } -#endif - - CoreferenceSentence *instance = NULL; - if (read_next_sentence && length >= 0) { - instance = new CoreferenceSentence; - instance->Initialize(name, forms, lemmas, cpos, pos, feats, deprels, heads, - predicate_names, predicate_indices, argument_roles, - argument_indices, author_info, entity_spans, - constituent_spans, coreference_spans); - } - - return static_cast(instance); -} - -void CoreferenceSentenceReader::ConstructSpansFromText( - const std::vector &span_lines, - std::vector *spans) { - char left_bracket = '('; - char right_bracket = ')'; - std::string characters_to_ignore = "*-"; - std::string name = ""; - std::stack span_names_stack; - std::stack span_start_stack; - - for (int i = 0; i < span_lines.size(); ++i) { - std::string line = span_lines[i]; - //LOG(INFO) << line; - for (int j = 0; j < line.length(); ++j) { - char ch = line[j]; - if (ch == left_bracket) { - name = ""; - span_start_stack.push(i); - } else if (ch == right_bracket) { - name = span_names_stack.top(); - int start_position = span_start_stack.top(); - int end_position = i; - 
span_names_stack.pop(); - span_start_stack.pop(); - NamedSpan *span = new NamedSpan(start_position, end_position, name); - spans->push_back(span); - } else if (characters_to_ignore.find(ch) != std::string::npos) { - continue; - } else { - name += ch; - if (j + 1 >= line.length() || - line[j + 1] == left_bracket || line[j + 1] == right_bracket || - characters_to_ignore.find(line[j + 1]) != std::string::npos) { - span_names_stack.push(name); - } - } - } - } - - CHECK_EQ(span_names_stack.size(), 0); - CHECK_EQ(span_start_stack.size(), 0); -} - -void CoreferenceSentenceReader::ConstructCoreferenceSpansFromText( - const std::vector &span_lines, - std::vector *spans) { - char left_bracket = '('; - char right_bracket = ')'; - std::string characters_to_ignore = "*-"; - std::string name = ""; - - for (int i = 0; i < span_lines.size(); ++i) { - std::string line = span_lines[i]; - std::vector fields; - StringSplit(line, "|", &fields, true); - //LOG(INFO) << "Span " << i << ": " << line; - for (int j = 0; j < fields.size(); ++j) { - std::string field = fields[j]; - CHECK_GE(field.length(), 1); - char first_ch = field[0]; - char last_ch = field[field.length() - 1]; - if (first_ch == left_bracket && last_ch == right_bracket) { - int start_position = i; - int end_position = i; - name = field.substr(1, field.length() - 2); - NamedSpan *span = new NamedSpan(start_position, end_position, name); - spans->push_back(span); - } else if (first_ch == left_bracket) { - int start_position = i; - int end_position = -1; - name = field.substr(1, field.length() - 1); - NamedSpan *span = new NamedSpan(start_position, end_position, name); - spans->push_back(span); - } else if (last_ch == right_bracket) { - name = field.substr(0, field.length() - 1); - NamedSpan *selected_span = NULL; - for (int k = spans->size() - 1; k >= 0; --k) { - if ((*spans)[k]->name() == name && (*spans)[k]->end() < 0) { - CHECK(!selected_span); - selected_span = (*spans)[k]; - break; - } - } - CHECK(selected_span); - 
selected_span->set_end(i); - } - } - } - - for (int k = 0; k < spans->size(); ++k) { - CHECK_GE((*spans)[k]->end(), 0); - } -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "CoreferenceReader.h" +#include "CoreferenceOptions.h" +#include "Utils.h" +#include +#include +#include + +Instance *CoreferenceReader::GetNext() { + int num_sentences = 0; + std::string line; + std::string name = ""; + int part_number = 0; + bool found_begin = false; + bool found_end = false; + if (is_.is_open()) { + while (!is_.eof()) { + getline(is_, line); + // E.g. "#begin document (nw/wsj/02/wsj_0242); part 000" + if (0 == line.substr(0, 6).compare("#begin")) { + name = line; + part_number = 0; + // Extract name. + size_t start = line.find("("); + size_t end = line.find(")"); + CHECK_NE(start, std::string::npos); + CHECK_NE(end, std::string::npos); + CHECK_LT(start, end); + CHECK_LT(end + 1, line.size()); + CHECK_EQ(line[end + 1], ';'); + name = line.substr(start + 1, end - start - 1); // Document id. + // Extract part number. 
+ start = line.find("part "); + CHECK_NE(start, std::string::npos); + start += 5; + end = line.find_first_not_of("0123456789", start); + if (end == std::string::npos) end = line.length(); + CHECK_LT(start, end); + std::string part_name = line.substr(start, end - start); + std::stringstream ss(part_name); + ss >> part_number; // Document part number. + + //LOG(INFO) << "Document: " << name << " Part: " << part_number; + CHECK(!found_begin); + CHECK(!found_end); + found_begin = true; + } else if (0 == line.substr(0, 4).compare("#end")) { + // End of document. + CHECK(found_begin); + CHECK(!found_end); + found_end = true; + break; + } else if (line.length() == 0) { + // End of sentence; update counter. + if (found_begin) ++num_sentences; + } + } + } + + // Now read all the sentences in this document. + std::vector sentences; + for (int i = 0; i < num_sentences; ++i) { + // Start reading sentences. + //LOG(INFO) << "Reading sentence #" << i << "..."; + CoreferenceSentence *sentence = + static_cast(sentence_reader_.GetNext()); + CHECK(sentence); + sentences.push_back(sentence); + } + + // Create document instance. + if (num_sentences == 0 && is_.is_open() && is_.eof()) return NULL; + + //LOG(INFO) << "Found " << num_sentences << " sentences in document " << name << "."; + + CoreferenceDocument *instance = new CoreferenceDocument; + instance->Initialize(name, part_number, sentences); + + return static_cast(instance); +} + +Instance *CoreferenceSentenceReader::GetNext() { + // Fill all fields for the entire sentence. 
+ std::string name = ""; + //int part_number = 0; + std::vector > sentence_fields; + std::string line; + + if (is_.is_open()) { + while (!is_.eof()) { + getline(is_, line); + if (line.length() <= 0) break; + if (0 == line.substr(0, 1).compare("#")) { + continue; + } + //LOG(INFO) << line; + std::vector fields; + StringSplit(line, "\t ", &fields, true); + sentence_fields.push_back(fields); + } + } + + bool read_next_sentence = false; + if (!is_.eof()) read_next_sentence = true; + + // Sentence length. + int length = (int)sentence_fields.size(); + + //LOG(INFO) << "Sentence has length " << length; + + // Convert to array of forms, lemmas, etc. + // Note: the first token is the root symbol. + std::vector forms(length + 1); + std::vector lemmas(length + 1); + std::vector cpos(length + 1); + std::vector pos(length + 1); + std::vector > feats(length + 1); + std::vector deprels(length + 1); + std::vector heads(length + 1); + std::vector predicate_names; // Names of predicates (e.g. "take.01"). + std::vector predicate_indices; // Positions of each predicate in the sentence. + std::vector > argument_roles; // Semantic roles. + std::vector > argument_indices; // Positions of each argument. + std::vector parse_info(length + 1); + std::vector author_info(length + 1); + std::vector entity_info(length + 1); + std::vector coreference_info(length + 1); + + forms[0] = "_root_"; + lemmas[0] = "_root_"; + cpos[0] = "_root_"; + pos[0] = "_root_"; + deprels[0] = "_root_"; + heads[0] = -1; + feats[0] = std::vector(1, "_root_"); + parse_info[0] = "*"; + author_info[0] = "__"; + entity_info[0] = "*"; + coreference_info[0] = "-"; + + int num_predicates = 0; + for (int i = 0; i < length; i++) { + const vector &info = sentence_fields[i]; + + int offset = 0; + if (i == 0) { + name = info[offset] + "\t" + info[offset + 1]; // Document name and part. + } + offset += 3; + + // Use splitted forms. 
+ forms[i + 1] = info[offset]; + lemmas[i + 1] = "_"; + ++offset; + cpos[i + 1] = info[offset]; + pos[i + 1] = info[offset]; // No distiction between pos and cpos. + ++offset; + parse_info[i + 1] = info[offset]; // Parse span (e.g. "(NP (NN *") + ++offset; + + // No morpho-syntactic information. + feats[i + 1].clear(); + + // Dependency syntactic information. + if (false) { //!semantic_options->use_dependency_syntactic_features()) { + heads[i + 1] = 0; + deprels[i + 1] = "NULL"; + } else { + std::stringstream ss(info[offset]); + ++offset; + ss >> heads[i + 1]; + //++heads[i+1]; // Note: heads start at -1 here! + deprels[i + 1] = info[offset]; + ++offset; + + if (heads[i + 1] < 0 || heads[i + 1] > length) { + LOG(INFO) << "Invalid value of head (" << heads[i + 1] + << ") not in range [0.." << length + << "] - attaching to the root."; + heads[i + 1] = 0; + } + } + + // Semantic role labeling information. + bool is_predicate = false; + std::string predicate_name = info[offset]; + ++offset; + std::string predicate_sense = info[offset]; + ++offset; + // Added the comparison below due to the CONLL 2012 data. + if (0 != predicate_sense.compare("_") && + 0 != predicate_sense.compare("-")) { + predicate_name += "." + predicate_sense; // Predicate lemma+sense in PropBank. + is_predicate = true; + } + // if (0 != predicate_name.compare("_") && + // 0 != predicate_name.compare("-")) { + //} + + std::string word_sense = info[offset]; + ++offset; + author_info[i + 1] = info[offset]; + ++offset; + entity_info[i + 1] = info[offset]; + ++offset; + + if (i == 0) { + // Allocate space for predicates. + num_predicates = (int)info.size() - 1 - offset; + //LOG(INFO) << num_predicates; + // Top nodes will be considered arguments of a special root node. 
+ if (use_top_nodes_) ++num_predicates; + //LOG(INFO) << num_predicates; + predicate_names.resize(num_predicates); + predicate_indices.resize(num_predicates); + argument_roles.resize(num_predicates); + argument_indices.resize(num_predicates); + num_predicates = 0; + if (use_top_nodes_) { + predicate_names[num_predicates] = "__ROOT__"; + predicate_indices[num_predicates] = 0; + ++num_predicates; + } + } + + if (is_predicate) { + //LOG(INFO) << predicate_name; + CHECK_LT(num_predicates, predicate_names.size()); + predicate_names[num_predicates] = predicate_name; + predicate_indices[num_predicates] = i + 1; + ++num_predicates; + } + + // Note: this is assuming arguments are encoded as dependents. + // However, in some datasets (e.g. Ontonotes) arguments are encoded as + // spans. + for (int j = offset; j < info.size() - 1; ++j) { + string argument_role = info[j]; + bool is_argument = false; + if (0 != argument_role.compare("_")) is_argument = true; + if (is_argument) { + int k = j - offset; + if (use_top_nodes_) ++k; + argument_roles[k].push_back(argument_role); + argument_indices[k].push_back(i + 1); + } + } + offset = (int)info.size() - 1; + + // Add coreference information. + coreference_info[i + 1] = info[offset]; + ++offset; + } + + CHECK_EQ(num_predicates, predicate_names.size()); + + // TODO: Create spans... 
+ std::vector entity_spans; + std::vector constituent_spans; + std::vector coreference_spans; + + ConstructSpansFromText(entity_info, &entity_spans); + ConstructSpansFromText(parse_info, &constituent_spans); + ConstructCoreferenceSpansFromText(coreference_info, &coreference_spans); + +#if 0 + for (int k = 0; k < entity_spans.size(); ++k) { + LOG(INFO) << "Entity " << entity_spans[k]->name() + << "(" << entity_spans[k]->start() + << "," << entity_spans[k]->end() + << ")"; + } + for (int k = 0; k < constituent_spans.size(); ++k) { + LOG(INFO) << "Constituent " << constituent_spans[k]->name() + << "(" << constituent_spans[k]->start() + << "," << constituent_spans[k]->end() + << ")"; + } + for (int k = 0; k < coreference_spans.size(); ++k) { + LOG(INFO) << "Mention " << coreference_spans[k]->name() + << " (" << coreference_spans[k]->start() + << "," << coreference_spans[k]->end() + << ")"; + } +#endif + + CoreferenceSentence *instance = NULL; + if (read_next_sentence && length >= 0) { + instance = new CoreferenceSentence; + instance->Initialize(name, forms, lemmas, cpos, pos, feats, deprels, heads, + predicate_names, predicate_indices, argument_roles, + argument_indices, author_info, entity_spans, + constituent_spans, coreference_spans); + } + + return static_cast(instance); +} + +void CoreferenceSentenceReader::ConstructSpansFromText( + const std::vector &span_lines, + std::vector *spans) { + char left_bracket = '('; + char right_bracket = ')'; + std::string characters_to_ignore = "*-"; + std::string name = ""; + std::stack span_names_stack; + std::stack span_start_stack; + + for (int i = 0; i < span_lines.size(); ++i) { + std::string line = span_lines[i]; + //LOG(INFO) << line; + for (int j = 0; j < line.length(); ++j) { + char ch = line[j]; + if (ch == left_bracket) { + name = ""; + span_start_stack.push(i); + } else if (ch == right_bracket) { + name = span_names_stack.top(); + int start_position = span_start_stack.top(); + int end_position = i; + 
span_names_stack.pop(); + span_start_stack.pop(); + NamedSpan *span = new NamedSpan(start_position, end_position, name); + spans->push_back(span); + } else if (characters_to_ignore.find(ch) != std::string::npos) { + continue; + } else { + name += ch; + if (j + 1 >= line.length() || + line[j + 1] == left_bracket || line[j + 1] == right_bracket || + characters_to_ignore.find(line[j + 1]) != std::string::npos) { + span_names_stack.push(name); + } + } + } + } + + CHECK_EQ(span_names_stack.size(), 0); + CHECK_EQ(span_start_stack.size(), 0); +} + +void CoreferenceSentenceReader::ConstructCoreferenceSpansFromText( + const std::vector &span_lines, + std::vector *spans) { + char left_bracket = '('; + char right_bracket = ')'; + std::string characters_to_ignore = "*-"; + std::string name = ""; + + for (int i = 0; i < span_lines.size(); ++i) { + std::string line = span_lines[i]; + std::vector fields; + StringSplit(line, "|", &fields, true); + //LOG(INFO) << "Span " << i << ": " << line; + for (int j = 0; j < fields.size(); ++j) { + std::string field = fields[j]; + CHECK_GE(field.length(), 1); + char first_ch = field[0]; + char last_ch = field[field.length() - 1]; + if (first_ch == left_bracket && last_ch == right_bracket) { + int start_position = i; + int end_position = i; + name = field.substr(1, field.length() - 2); + NamedSpan *span = new NamedSpan(start_position, end_position, name); + spans->push_back(span); + } else if (first_ch == left_bracket) { + int start_position = i; + int end_position = -1; + name = field.substr(1, field.length() - 1); + NamedSpan *span = new NamedSpan(start_position, end_position, name); + spans->push_back(span); + } else if (last_ch == right_bracket) { + name = field.substr(0, field.length() - 1); + NamedSpan *selected_span = NULL; + for (int k = spans->size() - 1; k >= 0; --k) { + if ((*spans)[k]->name() == name && (*spans)[k]->end() < 0) { + CHECK(!selected_span); + selected_span = (*spans)[k]; + break; + } + } + CHECK(selected_span); + 
selected_span->set_end(i); + } + } + } + + for (int k = 0; k < spans->size(); ++k) { + CHECK_GE((*spans)[k]->end(), 0); + } +} diff --git a/src/coreference_resolver/CoreferenceWriter.cpp b/src/coreference_resolver/CoreferenceWriter.cpp index a913fd1..92fd854 100644 --- a/src/coreference_resolver/CoreferenceWriter.cpp +++ b/src/coreference_resolver/CoreferenceWriter.cpp @@ -43,6 +43,8 @@ void CoreferenceWriter::Write(Instance *instance) { os_ << "#end document" << std::endl; } +void CoreferenceWriter::WriteFormatted(Pipe * pipe, Instance *instance) {} + void CoreferenceSentenceWriter::Write(Instance *instance) { CoreferenceSentence *sentence = static_cast(instance); std::ofstream *os = (external_os_) ? external_os_ : &os_; @@ -85,6 +87,8 @@ void CoreferenceSentenceWriter::Write(Instance *instance) { *os << std::endl; } +void CoreferenceSentenceWriter::WriteFormatted(Pipe * pipe, Instance *instance) {} + #if 0 void CoreferenceSentenceReader::ConstructSpansFromText( const std::vector &span_lines, diff --git a/src/coreference_resolver/CoreferenceWriter.h b/src/coreference_resolver/CoreferenceWriter.h index 1279e79..eccfdbb 100644 --- a/src/coreference_resolver/CoreferenceWriter.h +++ b/src/coreference_resolver/CoreferenceWriter.h @@ -41,6 +41,7 @@ class CoreferenceSentenceWriter : public SemanticWriter { public: void SetOutputStream(std::ofstream *os) { external_os_ = os; } void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); void set_options(Options *options) { options_ = options; } protected: @@ -71,6 +72,7 @@ class CoreferenceWriter : public Writer { Writer::Close(); } void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); CoreferenceSentenceWriter *GetSentenceWriter() { return &sentence_writer_; } diff --git a/src/coreference_resolver/Mention.cpp b/src/coreference_resolver/Mention.cpp index 1ebc548..1930586 100644 --- a/src/coreference_resolver/Mention.cpp +++ b/src/coreference_resolver/Mention.cpp 
@@ -58,8 +58,8 @@ void Mention::ComputeProperties(const CoreferenceDictionary &dictionary, const std::string &word = instance->GetForm(head_index_); std::string word_lower(word); - transform(word_lower.begin(), word_lower.end(), word_lower.begin(), - ::tolower); + std::transform(word_lower.begin(), word_lower.end(), word_lower.begin(), + ::tolower); int id = dictionary.GetWordLowerAlphabet().Lookup(word_lower); //if (id < 0) id = TOKEN_UNKNOWN; int head_word = id; // This uses the extended word dictionary. @@ -106,8 +106,8 @@ void Mention::ComputeProperties(const CoreferenceDictionary &dictionary, if (i >= 0) { const std::string &word = instance->GetForm(i); std::string word_lower(word); - transform(word_lower.begin(), word_lower.end(), word_lower.begin(), - ::tolower); + std::transform(word_lower.begin(), word_lower.end(), word_lower.begin(), + ::tolower); int word_lower_id = dictionary.GetWordLowerAlphabet().Lookup(word_lower); number_ = MentionNumber::UNKNOWN; @@ -141,8 +141,8 @@ void Mention::ComputeProperties(const CoreferenceDictionary &dictionary, for (int i = start_; i <= end_; ++i) { const std::string &word = instance->GetForm(i); std::string word_lower(word); - transform(word_lower.begin(), word_lower.end(), word_lower.begin(), - ::tolower); + std::transform(word_lower.begin(), word_lower.end(), word_lower.begin(), + ::tolower); int id = dictionary.GetWordAlphabet().Lookup(word); //if (id < 0) id = TOKEN_UNKNOWN; phrase.push_back(id); @@ -274,4 +274,4 @@ void Mention::ComputeHead() { if (!ContainsIndex(sentence_->GetHead(i))) head_index_ = i; } if (head_index_ < 0) head_index_ = end_; -} +} diff --git a/src/coreference_resolver/Mention.h b/src/coreference_resolver/Mention.h index f782d85..03042f3 100644 --- a/src/coreference_resolver/Mention.h +++ b/src/coreference_resolver/Mention.h @@ -1,187 +1,187 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef MENTION_H_ -#define MENTION_H_ - -#include "EntitySpan.h" -#include "CoreferenceDictionary.h" - -struct MentionType { - enum { - PRONOMINAL = 0, - PROPER, - NOMINAL, - NUM_MENTION_TYPES - }; -}; - -struct MentionGender { - enum { - MALE = 0, - FEMALE, - NEUTRAL, - UNKNOWN, - NUM_MENTION_GENDERS - }; -}; - -struct MentionNumber { - enum { - SINGULAR = 0, - PLURAL, - UNKNOWN, - NUM_MENTION_NUMBERS - }; -}; - -class CoreferenceSentenceNumeric; -class CoreferenceSentence; - -class Mention : public NumericSpan { -public: - Mention() { sentence_ = NULL; } - Mention(int start, int end, int id) : NumericSpan(start, end, id) {} - virtual ~Mention() {} - - int type() const { return type_; } - int gender() const { return gender_; } - int number() const { return number_; } - int unigram_ancestry() const { return unigram_ancestry_; } - int bigram_ancestry() const { return bigram_ancestry_; } - - int head_index() const { return head_index_; } - void set_head_index(int head_index) { head_index_ = head_index; } - - int sentence_index() const { return sentence_index_; } - void set_sentence_index(int sentence_index) { - sentence_index_ = sentence_index; - } - - int offset() const { return offset_; } - void set_offset(int offset) { offset_ = offset; } - - int global_start() const { return offset_ + start_; } - int 
global_end() const { return offset_ + end_; } - int global_head_index() { return offset_ + head_index_; } - - int speaker_id() const { return speaker_id_; } - void set_speaker_id(int speaker_id) { - speaker_id_ = speaker_id; - } - - int head_string_id() const { return head_string_id_; } - void set_head_string_id(int head_string_id) { - head_string_id_ = head_string_id; - } - - int phrase_string_id() const { return phrase_string_id_; } - void set_phrase_string_id(int phrase_string_id) { - phrase_string_id_ = phrase_string_id; - } - - const std::vector &all_word_string_ids() const { - return all_word_string_ids_; - } - void set_all_word_string_ids(const std::vector &all_word_string_ids) { - all_word_string_ids_ = all_word_string_ids; - } - - CoreferencePronoun *pronoun() const { return pronoun_; } - -public: - void ComputeProperties(const CoreferenceDictionary &dictionary, - CoreferenceSentence* instance, - CoreferenceSentenceNumeric *sentence); - - bool ContainsMentionHead(const Mention &mention) const { - const std::vector &all_word_string_ids = mention.all_word_string_ids(); - int head_string_id = all_word_string_ids[mention.head_index() - start_]; - for (int i = 0; i < all_word_string_ids_.size(); ++i) { - if (head_string_id == all_word_string_ids_[i]) return true; - } - return false; - } - - bool ContainsMentionString(const Mention &mention) const { - const std::vector &all_word_string_ids = mention.all_word_string_ids(); - int maximum_start = - all_word_string_ids_.size() - all_word_string_ids.size(); - for (int i = 0; i <= maximum_start; ++i) { - bool found_match = true; - for (int j = 0; j < all_word_string_ids.size(); ++j) { - if (all_word_string_ids[j] != all_word_string_ids_[i + j]) { - found_match = false; - break; - } - } - if (found_match) return true; - } - return false; - } - - // Print debug information about this mention. 
- void Print(const CoreferenceDictionary &dictionary, - CoreferenceSentence *instance); - void GetSpeaker(CoreferenceSentence *instance, - std::string *speaker); - void GetPhraseString(CoreferenceSentence *instance, - std::string *phrase_string); - void GetHeadString(CoreferenceSentence *instance, - std::string *head_string); - -protected: - void ComputeHead(); - int ComputeNumber(const std::vector &words, - const std::vector &words_lower, - int head_index) { - return MentionNumber::SINGULAR; - } - int ComputePersonGender(const std::vector &words, - const std::vector &words_lower, - int head_index) { - return MentionGender::MALE; - } - int ComputeNonPersonGender(const std::vector &words, - const std::vector &words_lower, - int head_index) { - return MentionGender::MALE; - } - -protected: - CoreferenceSentenceNumeric *sentence_; - int type_; // Type of mention (pronominal, proper, or nominal). - int entity_tag_; // Entity tag, if applicable (otherwise, -1). - CoreferencePronoun *pronoun_; // Pronoun information, if applicable. - int gender_; // Gender (male, female, neutral, unknown). - int number_; // Number (singular, plural, unknown). - int head_index_; // Position of the head word. - int unigram_ancestry_; // Dependency (syntactic) unigram ancestry. - int bigram_ancestry_; // Dependency (syntactic) bigram ancestry. - std::vector words_; // Mention words. - std::vector words_lower_; // Mention words in lower case. - std::vector tags_; // Mention POS tags. - int offset_; // Global offset position (start of sentence at document level). - int sentence_index_; // Index of the sentence to which this mention belongs. - int speaker_id_; - int head_string_id_; // ID of head word to test head match w/ other mentions. - int phrase_string_id_; // ID of the entire phrase to test exact match. - std::vector all_word_string_ids_; // IDs of the all mention words. -}; - -#endif /* MENTION_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. 
+// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef MENTION_H_ +#define MENTION_H_ + +#include "EntitySpan.h" +#include "CoreferenceDictionary.h" + +struct MentionType { + enum { + PRONOMINAL = 0, + PROPER, + NOMINAL, + NUM_MENTION_TYPES + }; +}; + +struct MentionGender { + enum { + MALE = 0, + FEMALE, + NEUTRAL, + UNKNOWN, + NUM_MENTION_GENDERS + }; +}; + +struct MentionNumber { + enum { + SINGULAR = 0, + PLURAL, + UNKNOWN, + NUM_MENTION_NUMBERS + }; +}; + +class CoreferenceSentenceNumeric; +class CoreferenceSentence; + +class Mention : public NumericSpan { +public: + Mention() { sentence_ = NULL; } + Mention(int start, int end, int id) : NumericSpan(start, end, id) {} + virtual ~Mention() {} + + int type() const { return type_; } + int gender() const { return gender_; } + int number() const { return number_; } + int unigram_ancestry() const { return unigram_ancestry_; } + int bigram_ancestry() const { return bigram_ancestry_; } + + int head_index() const { return head_index_; } + void set_head_index(int head_index) { head_index_ = head_index; } + + int sentence_index() const { return sentence_index_; } + void set_sentence_index(int sentence_index) { + sentence_index_ = sentence_index; + } + + int offset() const { return offset_; } + void set_offset(int offset) { offset_ = offset; } + + int global_start() const 
{ return offset_ + start_; } + int global_end() const { return offset_ + end_; } + int global_head_index() { return offset_ + head_index_; } + + int speaker_id() const { return speaker_id_; } + void set_speaker_id(int speaker_id) { + speaker_id_ = speaker_id; + } + + int head_string_id() const { return head_string_id_; } + void set_head_string_id(int head_string_id) { + head_string_id_ = head_string_id; + } + + int phrase_string_id() const { return phrase_string_id_; } + void set_phrase_string_id(int phrase_string_id) { + phrase_string_id_ = phrase_string_id; + } + + const std::vector &all_word_string_ids() const { + return all_word_string_ids_; + } + void set_all_word_string_ids(const std::vector &all_word_string_ids) { + all_word_string_ids_ = all_word_string_ids; + } + + CoreferencePronoun *pronoun() const { return pronoun_; } + +public: + void ComputeProperties(const CoreferenceDictionary &dictionary, + CoreferenceSentence* instance, + CoreferenceSentenceNumeric *sentence); + + bool ContainsMentionHead(const Mention &mention) const { + const std::vector &all_word_string_ids = mention.all_word_string_ids(); + int head_string_id = all_word_string_ids[mention.head_index() - start_]; + for (int i = 0; i < all_word_string_ids_.size(); ++i) { + if (head_string_id == all_word_string_ids_[i]) return true; + } + return false; + } + + bool ContainsMentionString(const Mention &mention) const { + const std::vector &all_word_string_ids = mention.all_word_string_ids(); + int maximum_start = + (int)(all_word_string_ids_.size() - all_word_string_ids.size()); + for (int i = 0; i <= maximum_start; ++i) { + bool found_match = true; + for (int j = 0; j < all_word_string_ids.size(); ++j) { + if (all_word_string_ids[j] != all_word_string_ids_[i + j]) { + found_match = false; + break; + } + } + if (found_match) return true; + } + return false; + } + + // Print debug information about this mention. 
+ void Print(const CoreferenceDictionary &dictionary, + CoreferenceSentence *instance); + void GetSpeaker(CoreferenceSentence *instance, + std::string *speaker); + void GetPhraseString(CoreferenceSentence *instance, + std::string *phrase_string); + void GetHeadString(CoreferenceSentence *instance, + std::string *head_string); + +protected: + void ComputeHead(); + int ComputeNumber(const std::vector &words, + const std::vector &words_lower, + int head_index) { + return MentionNumber::SINGULAR; + } + int ComputePersonGender(const std::vector &words, + const std::vector &words_lower, + int head_index) { + return MentionGender::MALE; + } + int ComputeNonPersonGender(const std::vector &words, + const std::vector &words_lower, + int head_index) { + return MentionGender::MALE; + } + +protected: + CoreferenceSentenceNumeric *sentence_; + int type_; // Type of mention (pronominal, proper, or nominal). + int entity_tag_; // Entity tag, if applicable (otherwise, -1). + CoreferencePronoun *pronoun_; // Pronoun information, if applicable. + int gender_; // Gender (male, female, neutral, unknown). + int number_; // Number (singular, plural, unknown). + int head_index_; // Position of the head word. + int unigram_ancestry_; // Dependency (syntactic) unigram ancestry. + int bigram_ancestry_; // Dependency (syntactic) bigram ancestry. + std::vector words_; // Mention words. + std::vector words_lower_; // Mention words in lower case. + std::vector tags_; // Mention POS tags. + int offset_; // Global offset position (start of sentence at document level). + int sentence_index_; // Index of the sentence to which this mention belongs. + int speaker_id_; + int head_string_id_; // ID of head word to test head match w/ other mentions. + int phrase_string_id_; // ID of the entire phrase to test exact match. + std::vector all_word_string_ids_; // IDs of the all mention words. 
+}; + +#endif /* MENTION_H_ */ diff --git a/src/coreference_resolver/TurboCoreferenceResolver.cpp b/src/coreference_resolver/TurboCoreferenceResolver.cpp index 5c95f30..772540b 100644 --- a/src/coreference_resolver/TurboCoreferenceResolver.cpp +++ b/src/coreference_resolver/TurboCoreferenceResolver.cpp @@ -1,101 +1,99 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "CoreferencePipe.h" - -void TrainCoreferenceResolver(); -void TestCoreferenceResolver(); - -int main(int argc, char** argv) { - // Initialize Google's logging library. - google::InitGoogleLogging(argv[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_train) { - LOG(INFO) << "Training coreference resolver..." << endl; - TrainCoreferenceResolver(); - } else if (FLAGS_test) { - LOG(INFO) << "Running coreference resolver..." << endl; - TestCoreferenceResolver(); - } - - // Destroy allocated memory regarding line flags. 
- google::ShutDownCommandLineFlags(); - google::ShutdownGoogleLogging(); - return 0; -} - -void TrainCoreferenceResolver() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - CoreferenceOptions *options = new CoreferenceOptions; - options->Initialize(); - - CoreferencePipe *pipe = new CoreferencePipe(options); - pipe->Initialize(); - - LOG(INFO) << "Training the coreference resolver..."; - pipe->Train(); - pipe->SaveModelFile(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Training took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TestCoreferenceResolver() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - CoreferenceOptions *options = new CoreferenceOptions; - options->Initialize(); - - CoreferencePipe *pipe = new CoreferencePipe(options); - pipe->Initialize(); - pipe->LoadModelFile(); - pipe->Run(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Testing took " << static_cast(time) / 1000.0 - << " sec." << endl; -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "CoreferencePipe.h" + +void TrainCoreferenceResolver(); +void TestCoreferenceResolver(); + +int main(int argc, char** argv) { + // Initialize Google's logging library. + google::InitGoogleLogging(argv[0]); + + // Parse command line flags. + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_train) { + LOG(INFO) << "Training coreference resolver..." << endl; + TrainCoreferenceResolver(); + } else if (FLAGS_test) { + LOG(INFO) << "Running coreference resolver..." << endl; + TestCoreferenceResolver(); + } + + // Destroy allocated memory regarding line flags. + google::ShutDownCommandLineFlags(); + google::ShutdownGoogleLogging(); + return 0; +} + +void TrainCoreferenceResolver() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + CoreferenceOptions *options = new CoreferenceOptions; + options->Initialize(); + + CoreferencePipe *pipe = new CoreferencePipe(options); + pipe->Initialize(); + + LOG(INFO) << "Training the coreference resolver..."; + pipe->Train(); + pipe->SaveModelFile(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Training took " << time << " sec." << endl; +} + +void TestCoreferenceResolver() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + CoreferenceOptions *options = new CoreferenceOptions; + options->Initialize(); + + CoreferencePipe *pipe = new CoreferencePipe(options); + pipe->Initialize(); + pipe->LoadModelFile(); + pipe->Run(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Testing took " << time << " sec." 
<< endl; +} diff --git a/src/dependency_labeler/DependencyLabelerPipe.cpp b/src/dependency_labeler/DependencyLabelerPipe.cpp index 54af8cd..34d9b93 100644 --- a/src/dependency_labeler/DependencyLabelerPipe.cpp +++ b/src/dependency_labeler/DependencyLabelerPipe.cpp @@ -1,528 +1,526 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "DependencyLabelerPipe.h" -#include -#include -#include -#ifdef _WIN32 -#include -#else -#include -#endif - -// Define the current model version and the oldest back-compatible version. -// The format is AAAA.BBBB.CCCC, e.g., 2 0003 0000 means "2.3.0". 
-const uint64_t kDependencyLabelerModelVersion = 200030000; -const uint64_t kOldestCompatibleDependencyLabelerModelVersion = 200030000; -const uint64_t kDependencyLabelerModelCheck = 1234567890; - -void DependencyLabelerPipe::SaveModel(FILE* fs) { - bool success; - success = WriteUINT64(fs, kDependencyLabelerModelCheck); - CHECK(success); - success = WriteUINT64(fs, kDependencyLabelerModelVersion); - CHECK(success); - token_dictionary_->Save(fs); - Pipe::SaveModel(fs); - //pruner_parameters_->Save(fs); -} - -void DependencyLabelerPipe::LoadModel(FILE* fs) { - bool success; - uint64_t model_check; - uint64_t model_version; - success = ReadUINT64(fs, &model_check); - CHECK(success); - CHECK_EQ(model_check, kDependencyLabelerModelCheck) - << "The model file is too old and not supported anymore."; - success = ReadUINT64(fs, &model_version); - CHECK(success); - CHECK_GE(model_version, kOldestCompatibleDependencyLabelerModelVersion) - << "The model file is too old and not supported anymore."; - delete token_dictionary_; - CreateTokenDictionary(); - static_cast(dictionary_)-> - SetTokenDictionary(token_dictionary_); - token_dictionary_->Load(fs); - Pipe::LoadModel(fs); - //pruner_parameters_->Load(fs); -} - -void DependencyLabelerPipe::PreprocessData() { - delete token_dictionary_; - CreateTokenDictionary(); - static_cast(dictionary_)->SetTokenDictionary(token_dictionary_); - static_cast(token_dictionary_)->Initialize(GetDependencyReader()); - static_cast(dictionary_)->CreateLabelDictionary(GetDependencyReader()); -} - -void DependencyLabelerPipe::ComputeScores(Instance *instance, Parts *parts, - Features *features, - std::vector *scores) { - //LOG(INFO) << "ComputeScores"; - Parameters *parameters = parameters_; - scores->resize(parts->size()); - DependencyLabelerParts *dependency_parts = - static_cast(parts); - DependencyLabelerFeatures *dependency_features = - static_cast(features); - DependencyInstanceNumeric *sentence = - static_cast(instance); - 
DependencyLabelerOptions *dependency_options = GetDependencyLabelerOptions(); - const std::vector &heads = sentence->GetHeads(); - const std::vector > &siblings = dependency_parts->siblings(); - - for (int m = 1; m < sentence->size(); ++m) { - // Conjoin arc features with the label. - const BinaryFeatures &arc_features = dependency_features->GetArcFeatures(m); - const std::vector &index_arc_parts = - dependency_parts->FindArcs(m); - std::vector allowed_labels(index_arc_parts.size()); - for (int k = 0; k < index_arc_parts.size(); ++k) { - DependencyLabelerPartArc *arc = - static_cast((*parts)[index_arc_parts[k]]); - allowed_labels[k] = arc->label(); - } - std::vector label_scores; - parameters_->ComputeLabelScores(arc_features, allowed_labels, - &label_scores); - for (int k = 0; k < index_arc_parts.size(); ++k) { - (*scores)[index_arc_parts[k]] = label_scores[k]; - } - } - - if (dependency_options->use_sibling_parts()) { - for (int h = 0; h < sentence->size(); ++h) { - if (siblings[h].size() == 0) continue; - for (int i = 0; i < siblings[h].size() + 1; ++i) { - const BinaryFeatures &sibling_features = - dependency_features->GetSiblingFeatures(h, i); - const std::vector &index_sibling_parts = - dependency_parts->FindSiblings(h, i); - std::vector sibling_labels(index_sibling_parts.size()); - for (int k = 0; k < index_sibling_parts.size(); ++k) { - DependencyLabelerPartSibling *sibling = - static_cast( - (*parts)[index_sibling_parts[k]]); - sibling_labels[k] = GetSiblingLabel(sibling->sibling_label(), - sibling->modifier_label()); - } - std::vector label_scores; - parameters_->ComputeLabelScores(sibling_features, sibling_labels, - &label_scores); - for (int k = 0; k < index_sibling_parts.size(); ++k) { - (*scores)[index_sibling_parts[k]] = label_scores[k]; - } - } - } - } - //LOG(INFO) << "End ComputeScores"; -} - -void DependencyLabelerPipe::MakeGradientStep( - Parts *parts, - Features *features, - double eta, - int iteration, - const std::vector &gold_output, - const 
std::vector &predicted_output) { - //LOG(INFO) << "MakeGradientStep"; - DependencyLabelerParts *dependency_parts = - static_cast(parts); - DependencyLabelerFeatures *dependency_features = - static_cast(features); - Parameters *parameters = GetTrainingParameters(); - DependencyLabelerOptions *dependency_options = GetDependencyLabelerOptions(); - - for (int r = 0; r < parts->size(); ++r) { - if (predicted_output[r] == gold_output[r]) continue; - - // Labeled arcs will be treated by looking at the unlabeled arcs and - // conjoining with the label. - if ((*parts)[r]->type() == DEPENDENCYLABELERPART_ARC) { - DependencyLabelerPartArc *arc = - static_cast((*parts)[r]); - const BinaryFeatures &arc_features = - dependency_features->GetArcFeatures(arc->modifier()); - - parameters->MakeLabelGradientStep(arc_features, eta, iteration, - arc->label(), - predicted_output[r] - gold_output[r]); - } else if ((*parts)[r]->type() == DEPENDENCYLABELERPART_SIBLING) { - DependencyLabelerPartSibling *sibling = - static_cast((*parts)[r]); - int sibling_index = dependency_parts-> - GetSiblingIndex(sibling->head(), sibling->modifier()); - const BinaryFeatures &sibling_features = - dependency_features->GetSiblingFeatures(sibling->head(), - sibling_index); - int sibling_label = GetSiblingLabel(sibling->sibling_label(), - sibling->modifier_label()); - parameters->MakeLabelGradientStep(sibling_features, eta, iteration, - sibling_label, - predicted_output[r] - gold_output[r]); - } else { - CHECK(false); - } - } - //LOG(INFO) << "End MakeGradientStep"; -} - -void DependencyLabelerPipe::MakeFeatureDifference( - Parts *parts, - Features *features, - const std::vector &gold_output, - const std::vector &predicted_output, - FeatureVector *difference) { - //LOG(INFO) << "MakeFeatureDifference"; - DependencyLabelerParts *dependency_parts = - static_cast(parts); - DependencyLabelerFeatures *dependency_features = - static_cast(features); - DependencyLabelerOptions *dependency_options = 
GetDependencyLabelerOptions(); - - for (int r = 0; r < parts->size(); ++r) { - if (predicted_output[r] == gold_output[r]) continue; - - // Labeled arcs will be treated by looking at the unlabeled arcs and - // conjoining with the label. - if ((*parts)[r]->type() == DEPENDENCYLABELERPART_ARC) { - DependencyLabelerPartArc *arc = - static_cast((*parts)[r]); - const BinaryFeatures &arc_features = - dependency_features->GetArcFeatures(arc->modifier()); - for (int j = 0; j < arc_features.size(); ++j) { - difference->mutable_labeled_weights()->Add(arc_features[j], - arc->label(), predicted_output[r] - gold_output[r]); - } - } else if ((*parts)[r]->type() == DEPENDENCYLABELERPART_SIBLING) { - DependencyLabelerPartSibling *sibling = - static_cast((*parts)[r]); - int sibling_index = dependency_parts-> - GetSiblingIndex(sibling->head(), sibling->modifier()); - const BinaryFeatures &sibling_features = - dependency_features->GetSiblingFeatures(sibling->head(), - sibling_index); - int sibling_label = GetSiblingLabel(sibling->sibling_label(), - sibling->modifier_label()); - for (int j = 0; j < sibling_features.size(); ++j) { - difference->mutable_labeled_weights()->Add(sibling_features[j], - sibling_label, predicted_output[r] - gold_output[r]); - } - } else { - CHECK(false); - } - } - //LOG(INFO) << "End MakeFeatureDifference"; -} - -void DependencyLabelerPipe::MakeParts(Instance *instance, - Parts *parts, - std::vector *gold_outputs) { - DependencyInstanceNumeric *sentence = - static_cast(instance); - DependencyLabelerParts *dependency_parts = - static_cast(parts); - dependency_parts->Initialize(); - bool make_gold = (gold_outputs != NULL); - if (make_gold) gold_outputs->clear(); - - // Make labeled arc parts and compute indices. - MakeArcParts(instance, parts, gold_outputs); - dependency_parts->BuildArcIndices(sentence->GetHeads()); - dependency_parts->ComputeSiblings(sentence->GetHeads()); - - // Make sibling parts. 
- if (GetDependencyLabelerOptions()->use_sibling_parts()) { - MakeSiblingParts(instance, parts, gold_outputs); - dependency_parts->BuildSiblingIndices(sentence->GetHeads()); - } - - dependency_parts->BuildOffsets(); -} - -void DependencyLabelerPipe::MakeArcParts(Instance *instance, - Parts *parts, - std::vector *gold_outputs) { - DependencyInstanceNumeric *sentence = - static_cast(instance); - DependencyLabelerParts *dependency_parts = - static_cast(parts); - DependencyDictionary *dependency_dictionary = GetDependencyDictionary(); - DependencyLabelerOptions *dependency_options = GetDependencyLabelerOptions(); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - bool prune_labels = dependency_options->prune_labels(); - vector allowed_labels; - - if (!prune_labels) { - allowed_labels.resize(dependency_dictionary->GetLabelAlphabet().size()); - for (int i = 0; i < allowed_labels.size(); ++i) { - allowed_labels[i] = i; - } - } - - const vector &heads = sentence->GetHeads(); - int num_parts_initial = dependency_parts->size(); - - // Add parts for the labeled arcs. - num_parts_initial = dependency_parts->size(); - for (int m = 1; m < sentence_length; ++m) { - int h = heads[m]; - if (prune_labels) { - int modifier_pos_id = sentence->GetPosId(m); - int head_pos_id = sentence->GetPosId(h); - allowed_labels.clear(); - allowed_labels = dependency_dictionary-> - GetExistingLabels(modifier_pos_id, head_pos_id); - } - - // If there is no allowed label for this arc, but the unlabeled arc was - // added, consider all the possible labels. 
- if (allowed_labels.empty()) { - allowed_labels.resize(dependency_dictionary->GetLabelAlphabet().size()); - for (int l = 0; l < allowed_labels.size(); ++l) { - allowed_labels[l] = l; - } - } - for (int k = 0; k < allowed_labels.size(); ++k) { - int l = allowed_labels[k]; - Part *part = dependency_parts->CreatePartArc(h, m, l); - dependency_parts->push_back(part); - if (make_gold) { - if (sentence->GetRelationId(m) == l) { - gold_outputs->push_back(1.0); - } else { - gold_outputs->push_back(0.0); - } - } - } - } - - dependency_parts->SetOffsetArc(num_parts_initial, - dependency_parts->size() - num_parts_initial); -} - -void DependencyLabelerPipe::MakeSiblingParts( - Instance *instance, - Parts *parts, - std::vector *gold_outputs) { - //LOG(INFO) << "MakeSiblingParts"; - DependencyInstanceNumeric *sentence = - static_cast(instance); - DependencyLabelerParts *dependency_parts = - static_cast(parts); - DependencyDictionary *dependency_dictionary = GetDependencyDictionary(); - DependencyLabelerOptions *dependency_options = GetDependencyLabelerOptions(); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - bool prune_labels = dependency_options->prune_labels(); - vector allowed_labels; - const vector &heads = sentence->GetHeads(); - - int num_parts_initial = dependency_parts->size(); - - const std::vector > &siblings = dependency_parts->siblings(); - - for (int h = 0; h < sentence_length; ++h) { - // Don't create parts for heads without modifiers. - if (siblings[h].size() == 0) continue; - - // Start position. - int m = siblings[h][0]; - const std::vector& initial_parts = dependency_parts->FindArcs(m); - for (int j = 0; j < initial_parts.size(); ++j) { - DependencyLabelerPartArc *initial_part = - static_cast( - (*dependency_parts)[initial_parts[j]]); - // TODO: Don't create a bigram part if this bigram is not allowed. 
- Part *part = dependency_parts-> - CreatePartSibling(h, m, -1, initial_part->label(), -1); - dependency_parts->push_back(part); - if (make_gold) { - gold_outputs->push_back((*gold_outputs)[initial_parts[j]]); - } - } - - // Intermediate position. - for (int i = 1; i < siblings[h].size(); ++i) { - int m = siblings[h][i]; - int s = siblings[h][i - 1]; - const std::vector& current_parts = - dependency_parts->FindArcs(m); - const std::vector& previous_parts = - dependency_parts->FindArcs(s); - for (int j = 0; j < current_parts.size(); ++j) { - DependencyLabelerPartArc *current_part = - static_cast( - (*dependency_parts)[current_parts[j]]); - for (int k = 0; k < previous_parts.size(); ++k) { - DependencyLabelerPartArc *previous_part = - static_cast( - (*dependency_parts)[previous_parts[k]]); - // TODO: Don't create a bigram part if this bigram is not allowed. - Part *part = dependency_parts-> - CreatePartSibling(h, m, s, current_part->label(), - previous_part->label()); - dependency_parts->push_back(part); - if (make_gold) { - gold_outputs->push_back( - (*gold_outputs)[current_parts[j]] * - (*gold_outputs)[previous_parts[k]]); - } - } - } - } - - // Final position. - m = siblings[h][siblings[h].size() - 1]; - const std::vector& final_parts = - dependency_parts->FindArcs(m); - for (int j = 0; j < final_parts.size(); ++j) { - DependencyLabelerPartArc *final_part = - static_cast( - (*dependency_parts)[final_parts[j]]); - // TODO: Don't create a bigram part if this bigram is not allowed. 
- Part *part = dependency_parts-> - CreatePartSibling(h, -1, m, -1, final_part->label()); - dependency_parts->push_back(part); - if (make_gold) { - gold_outputs->push_back((*gold_outputs)[final_parts[j]]); - } - } - } - - dependency_parts->SetOffsetSibling( - num_parts_initial, - dependency_parts->size() - num_parts_initial); - //LOG(INFO) << "End MakeSiblingParts"; -} - -void DependencyLabelerPipe::MakeSelectedFeatures( - Instance *instance, - Parts *parts, - const std::vector& selected_parts, - Features *features) { - //LOG(INFO) << "MakeSelectedFeatures"; - DependencyInstanceNumeric *sentence = - static_cast(instance); - DependencyLabelerParts *dependency_parts = - static_cast(parts); - DependencyLabelerFeatures *dependency_features = - static_cast(features); - int sentence_length = sentence->size(); - - // TODO(atm): make this computation of descendents be part of - // DependencyInstanceNumeric or a class that derives from it. - std::vector > descendents; - const std::vector &heads = sentence->GetHeads(); - ComputeDescendents(heads, &descendents); - - const std::vector > &siblings = dependency_parts->siblings(); - dependency_features->Initialize(instance, parts, siblings); - - // Build features for arcs/siblings only. They will later be conjoined with - // the tags. - for (int m = 1; m < sentence_length; ++m) { - dependency_features->AddArcFeatures(sentence, descendents, - siblings, m); - } - - // Make sibling parts. 
- if (GetDependencyLabelerOptions()->use_sibling_parts()) { - for (int h = 0; h < sentence_length; ++h) { - if (siblings[h].size() == 0) continue; - for (int i = 0; i < siblings[h].size() + 1; ++i) { - dependency_features->AddSiblingFeatures(sentence, descendents, - siblings, h, i); - } - } - } - //LOG(INFO) << "End MakeSelectedFeatures"; -} - -void DependencyLabelerPipe::LabelInstance(Parts *parts, - const std::vector &output, - Instance *instance) { - DependencyLabelerParts *dependency_parts = - static_cast(parts); - DependencyInstance *dependency_instance = - static_cast(instance); - int instance_length = dependency_instance->size(); - for (int m = 0; m < instance_length; ++m) { - dependency_instance->SetHead(m, -1); - dependency_instance->SetDependencyRelation(m, "NULL"); - } - double threshold = 0.5; - - int offset, num_labeled_arcs; - dependency_parts->GetOffsetArc(&offset, &num_labeled_arcs); - for (int r = 0; r < num_labeled_arcs; ++r) { - DependencyLabelerPartArc *arc = - static_cast((*dependency_parts)[offset + r]); - if (output[offset + r] >= threshold) { - dependency_instance->SetHead(arc->modifier(), arc->head()); - dependency_instance->SetDependencyRelation(arc->modifier(), - GetDependencyDictionary()-> - GetLabelName(arc->label())); - } - } - - for (int m = 1; m < instance_length; ++m) { - if (dependency_instance->GetHead(m) < 0) { - VLOG(2) << "Word without head."; - dependency_instance->SetHead(m, 0); - dependency_instance->SetDependencyRelation(m, - GetDependencyDictionary()->GetLabelName(0)); - } - } -} - -void DependencyLabelerPipe::ComputeDescendents( - const std::vector &heads, - std::vector >* descendents) const { - //LOG(INFO) << "Computing descendents"; - descendents->resize(heads.size()); - for (int h = 0; h < descendents->size(); ++h) { - (*descendents)[h].clear(); - } - for (int m = 1; m < heads.size(); ++m) { - (*descendents)[m].push_back(m); - std::vector ancestors; - GetAllAncestors(heads, m, &ancestors); - for (int k = 0; k < 
ancestors.size(); ++k) { - int h = ancestors[k]; - CHECK_GE(h, 0); - //LOG(INFO) << h << " " << descendents->size(); - (*descendents)[h].push_back(m); - } - } - //LOG(INFO) << "End computing descendents"; -} - -void DependencyLabelerPipe::GetAllAncestors(const std::vector &heads, - int descend, - std::vector* ancestors) const { - ancestors->clear(); - int h = heads[descend]; - while (h >= 0) { - ancestors->push_back(h); - h = heads[h]; - } -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "DependencyLabelerPipe.h" +#include +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif + +// Define the current model version and the oldest back-compatible version. +// The format is AAAA.BBBB.CCCC, e.g., 2 0003 0000 means "2.3.0". 
+const uint64_t kDependencyLabelerModelVersion = 200030000; +const uint64_t kOldestCompatibleDependencyLabelerModelVersion = 200030000; +const uint64_t kDependencyLabelerModelCheck = 1234567890; + +void DependencyLabelerPipe::SaveModel(FILE* fs) { + bool success; + success = WriteUINT64(fs, kDependencyLabelerModelCheck); + CHECK(success); + success = WriteUINT64(fs, kDependencyLabelerModelVersion); + CHECK(success); + token_dictionary_->Save(fs); + Pipe::SaveModel(fs); + //pruner_parameters_->Save(fs); +} + +void DependencyLabelerPipe::LoadModel(FILE* fs) { + bool success; + success = ReadUINT64(fs, &model_check_); + CHECK(success); + CHECK_EQ(model_check_, kDependencyLabelerModelCheck) + << "The model file is too old and not supported anymore."; + success = ReadUINT64(fs, &model_version_); + CHECK(success); + CHECK_GE(model_version_, kOldestCompatibleDependencyLabelerModelVersion) + << "The model file is too old and not supported anymore."; + delete token_dictionary_; + CreateTokenDictionary(); + static_cast(dictionary_)-> + SetTokenDictionary(token_dictionary_); + token_dictionary_->Load(fs); + Pipe::LoadModel(fs); + //pruner_parameters_->Load(fs); +} + +void DependencyLabelerPipe::PreprocessData() { + delete token_dictionary_; + CreateTokenDictionary(); + static_cast(dictionary_)->SetTokenDictionary(token_dictionary_); + static_cast(token_dictionary_)->Initialize(GetDependencyReader()); + static_cast(dictionary_)->CreateLabelDictionary(GetDependencyReader()); +} + +void DependencyLabelerPipe::ComputeScores(Instance *instance, Parts *parts, + Features *features, + std::vector *scores) { + //LOG(INFO) << "ComputeScores"; + Parameters *parameters = parameters_; + scores->resize(parts->size()); + DependencyLabelerParts *dependency_parts = + static_cast(parts); + DependencyLabelerFeatures *dependency_features = + static_cast(features); + DependencyInstanceNumeric *sentence = + static_cast(instance); + DependencyLabelerOptions *dependency_options = 
GetDependencyLabelerOptions(); + const std::vector &heads = sentence->GetHeads(); + const std::vector > &siblings = dependency_parts->siblings(); + + for (int m = 1; m < sentence->size(); ++m) { + // Conjoin arc features with the label. + const BinaryFeatures &arc_features = dependency_features->GetArcFeatures(m); + const std::vector &index_arc_parts = + dependency_parts->FindArcs(m); + std::vector allowed_labels(index_arc_parts.size()); + for (int k = 0; k < index_arc_parts.size(); ++k) { + DependencyLabelerPartArc *arc = + static_cast((*parts)[index_arc_parts[k]]); + allowed_labels[k] = arc->label(); + } + std::vector label_scores; + parameters_->ComputeLabelScores(arc_features, allowed_labels, + &label_scores); + for (int k = 0; k < index_arc_parts.size(); ++k) { + (*scores)[index_arc_parts[k]] = label_scores[k]; + } + } + + if (dependency_options->use_sibling_parts()) { + for (int h = 0; h < sentence->size(); ++h) { + if (siblings[h].size() == 0) continue; + for (int i = 0; i < siblings[h].size() + 1; ++i) { + const BinaryFeatures &sibling_features = + dependency_features->GetSiblingFeatures(h, i); + const std::vector &index_sibling_parts = + dependency_parts->FindSiblings(h, i); + std::vector sibling_labels(index_sibling_parts.size()); + for (int k = 0; k < index_sibling_parts.size(); ++k) { + DependencyLabelerPartSibling *sibling = + static_cast( + (*parts)[index_sibling_parts[k]]); + sibling_labels[k] = GetSiblingLabel(sibling->sibling_label(), + sibling->modifier_label()); + } + std::vector label_scores; + parameters_->ComputeLabelScores(sibling_features, sibling_labels, + &label_scores); + for (int k = 0; k < index_sibling_parts.size(); ++k) { + (*scores)[index_sibling_parts[k]] = label_scores[k]; + } + } + } + } + //LOG(INFO) << "End ComputeScores"; +} + +void DependencyLabelerPipe::MakeGradientStep( + Parts *parts, + Features *features, + double eta, + int iteration, + const std::vector &gold_output, + const std::vector &predicted_output) { + //LOG(INFO) 
<< "MakeGradientStep"; + DependencyLabelerParts *dependency_parts = + static_cast(parts); + DependencyLabelerFeatures *dependency_features = + static_cast(features); + Parameters *parameters = GetTrainingParameters(); + DependencyLabelerOptions *dependency_options = GetDependencyLabelerOptions(); + + for (int r = 0; r < parts->size(); ++r) { + if (predicted_output[r] == gold_output[r]) continue; + + // Labeled arcs will be treated by looking at the unlabeled arcs and + // conjoining with the label. + if ((*parts)[r]->type() == DEPENDENCYLABELERPART_ARC) { + DependencyLabelerPartArc *arc = + static_cast((*parts)[r]); + const BinaryFeatures &arc_features = + dependency_features->GetArcFeatures(arc->modifier()); + + parameters->MakeLabelGradientStep(arc_features, eta, iteration, + arc->label(), + predicted_output[r] - gold_output[r]); + } else if ((*parts)[r]->type() == DEPENDENCYLABELERPART_SIBLING) { + DependencyLabelerPartSibling *sibling = + static_cast((*parts)[r]); + int sibling_index = dependency_parts-> + GetSiblingIndex(sibling->head(), sibling->modifier()); + const BinaryFeatures &sibling_features = + dependency_features->GetSiblingFeatures(sibling->head(), + sibling_index); + int sibling_label = GetSiblingLabel(sibling->sibling_label(), + sibling->modifier_label()); + parameters->MakeLabelGradientStep(sibling_features, eta, iteration, + sibling_label, + predicted_output[r] - gold_output[r]); + } else { + CHECK(false); + } + } + //LOG(INFO) << "End MakeGradientStep"; +} + +void DependencyLabelerPipe::MakeFeatureDifference( + Parts *parts, + Features *features, + const std::vector &gold_output, + const std::vector &predicted_output, + FeatureVector *difference) { + //LOG(INFO) << "MakeFeatureDifference"; + DependencyLabelerParts *dependency_parts = + static_cast(parts); + DependencyLabelerFeatures *dependency_features = + static_cast(features); + DependencyLabelerOptions *dependency_options = GetDependencyLabelerOptions(); + + for (int r = 0; r < 
parts->size(); ++r) { + if (predicted_output[r] == gold_output[r]) continue; + + // Labeled arcs will be treated by looking at the unlabeled arcs and + // conjoining with the label. + if ((*parts)[r]->type() == DEPENDENCYLABELERPART_ARC) { + DependencyLabelerPartArc *arc = + static_cast((*parts)[r]); + const BinaryFeatures &arc_features = + dependency_features->GetArcFeatures(arc->modifier()); + for (int j = 0; j < arc_features.size(); ++j) { + difference->mutable_labeled_weights()->Add(arc_features[j], + arc->label(), predicted_output[r] - gold_output[r]); + } + } else if ((*parts)[r]->type() == DEPENDENCYLABELERPART_SIBLING) { + DependencyLabelerPartSibling *sibling = + static_cast((*parts)[r]); + int sibling_index = dependency_parts-> + GetSiblingIndex(sibling->head(), sibling->modifier()); + const BinaryFeatures &sibling_features = + dependency_features->GetSiblingFeatures(sibling->head(), + sibling_index); + int sibling_label = GetSiblingLabel(sibling->sibling_label(), + sibling->modifier_label()); + for (int j = 0; j < sibling_features.size(); ++j) { + difference->mutable_labeled_weights()->Add(sibling_features[j], + sibling_label, predicted_output[r] - gold_output[r]); + } + } else { + CHECK(false); + } + } + //LOG(INFO) << "End MakeFeatureDifference"; +} + +void DependencyLabelerPipe::MakeParts(Instance *instance, + Parts *parts, + std::vector *gold_outputs) { + DependencyInstanceNumeric *sentence = + static_cast(instance); + DependencyLabelerParts *dependency_parts = + static_cast(parts); + dependency_parts->Initialize(); + bool make_gold = (gold_outputs != NULL); + if (make_gold) gold_outputs->clear(); + + // Make labeled arc parts and compute indices. + MakeArcParts(instance, parts, gold_outputs); + dependency_parts->BuildArcIndices(sentence->GetHeads()); + dependency_parts->ComputeSiblings(sentence->GetHeads()); + + // Make sibling parts. 
+ if (GetDependencyLabelerOptions()->use_sibling_parts()) { + MakeSiblingParts(instance, parts, gold_outputs); + dependency_parts->BuildSiblingIndices(sentence->GetHeads()); + } + + dependency_parts->BuildOffsets(); +} + +void DependencyLabelerPipe::MakeArcParts(Instance *instance, + Parts *parts, + std::vector *gold_outputs) { + DependencyInstanceNumeric *sentence = + static_cast(instance); + DependencyLabelerParts *dependency_parts = + static_cast(parts); + DependencyDictionary *dependency_dictionary = GetDependencyDictionary(); + DependencyLabelerOptions *dependency_options = GetDependencyLabelerOptions(); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + bool prune_labels = dependency_options->prune_labels(); + vector allowed_labels; + + if (!prune_labels) { + allowed_labels.resize(dependency_dictionary->GetLabelAlphabet().size()); + for (int i = 0; i < allowed_labels.size(); ++i) { + allowed_labels[i] = i; + } + } + + const vector &heads = sentence->GetHeads(); + int num_parts_initial = dependency_parts->size(); + + // Add parts for the labeled arcs. + num_parts_initial = dependency_parts->size(); + for (int m = 1; m < sentence_length; ++m) { + int h = heads[m]; + if (prune_labels) { + int modifier_pos_id = sentence->GetPosId(m); + int head_pos_id = sentence->GetPosId(h); + allowed_labels.clear(); + allowed_labels = dependency_dictionary-> + GetExistingLabels(modifier_pos_id, head_pos_id); + } + + // If there is no allowed label for this arc, but the unlabeled arc was + // added, consider all the possible labels. 
+ if (allowed_labels.empty()) { + allowed_labels.resize(dependency_dictionary->GetLabelAlphabet().size()); + for (int l = 0; l < allowed_labels.size(); ++l) { + allowed_labels[l] = l; + } + } + for (int k = 0; k < allowed_labels.size(); ++k) { + int l = allowed_labels[k]; + Part *part = dependency_parts->CreatePartArc(h, m, l); + dependency_parts->push_back(part); + if (make_gold) { + if (sentence->GetRelationId(m) == l) { + gold_outputs->push_back(1.0); + } else { + gold_outputs->push_back(0.0); + } + } + } + } + + dependency_parts->SetOffsetArc(num_parts_initial, + dependency_parts->size() - num_parts_initial); +} + +void DependencyLabelerPipe::MakeSiblingParts( + Instance *instance, + Parts *parts, + std::vector *gold_outputs) { + //LOG(INFO) << "MakeSiblingParts"; + DependencyInstanceNumeric *sentence = + static_cast(instance); + DependencyLabelerParts *dependency_parts = + static_cast(parts); + DependencyDictionary *dependency_dictionary = GetDependencyDictionary(); + DependencyLabelerOptions *dependency_options = GetDependencyLabelerOptions(); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + bool prune_labels = dependency_options->prune_labels(); + vector allowed_labels; + const vector &heads = sentence->GetHeads(); + + int num_parts_initial = dependency_parts->size(); + + const std::vector > &siblings = dependency_parts->siblings(); + + for (int h = 0; h < sentence_length; ++h) { + // Don't create parts for heads without modifiers. + if (siblings[h].size() == 0) continue; + + // Start position. + int m = siblings[h][0]; + const std::vector& initial_parts = dependency_parts->FindArcs(m); + for (int j = 0; j < initial_parts.size(); ++j) { + DependencyLabelerPartArc *initial_part = + static_cast( + (*dependency_parts)[initial_parts[j]]); + // TODO: Don't create a bigram part if this bigram is not allowed. 
+ Part *part = dependency_parts-> + CreatePartSibling(h, m, -1, initial_part->label(), -1); + dependency_parts->push_back(part); + if (make_gold) { + gold_outputs->push_back((*gold_outputs)[initial_parts[j]]); + } + } + + // Intermediate position. + for (int i = 1; i < siblings[h].size(); ++i) { + int m = siblings[h][i]; + int s = siblings[h][i - 1]; + const std::vector& current_parts = + dependency_parts->FindArcs(m); + const std::vector& previous_parts = + dependency_parts->FindArcs(s); + for (int j = 0; j < current_parts.size(); ++j) { + DependencyLabelerPartArc *current_part = + static_cast( + (*dependency_parts)[current_parts[j]]); + for (int k = 0; k < previous_parts.size(); ++k) { + DependencyLabelerPartArc *previous_part = + static_cast( + (*dependency_parts)[previous_parts[k]]); + // TODO: Don't create a bigram part if this bigram is not allowed. + Part *part = dependency_parts-> + CreatePartSibling(h, m, s, current_part->label(), + previous_part->label()); + dependency_parts->push_back(part); + if (make_gold) { + gold_outputs->push_back( + (*gold_outputs)[current_parts[j]] * + (*gold_outputs)[previous_parts[k]]); + } + } + } + } + + // Final position. + m = siblings[h][siblings[h].size() - 1]; + const std::vector& final_parts = + dependency_parts->FindArcs(m); + for (int j = 0; j < final_parts.size(); ++j) { + DependencyLabelerPartArc *final_part = + static_cast( + (*dependency_parts)[final_parts[j]]); + // TODO: Don't create a bigram part if this bigram is not allowed. 
+ Part *part = dependency_parts-> + CreatePartSibling(h, -1, m, -1, final_part->label()); + dependency_parts->push_back(part); + if (make_gold) { + gold_outputs->push_back((*gold_outputs)[final_parts[j]]); + } + } + } + + dependency_parts->SetOffsetSibling( + num_parts_initial, + dependency_parts->size() - num_parts_initial); + //LOG(INFO) << "End MakeSiblingParts"; +} + +void DependencyLabelerPipe::MakeSelectedFeatures( + Instance *instance, + Parts *parts, + const std::vector& selected_parts, + Features *features) { + //LOG(INFO) << "MakeSelectedFeatures"; + DependencyInstanceNumeric *sentence = + static_cast(instance); + DependencyLabelerParts *dependency_parts = + static_cast(parts); + DependencyLabelerFeatures *dependency_features = + static_cast(features); + int sentence_length = sentence->size(); + + // TODO(atm): make this computation of descendents be part of + // DependencyInstanceNumeric or a class that derives from it. + std::vector > descendents; + const std::vector &heads = sentence->GetHeads(); + ComputeDescendents(heads, &descendents); + + const std::vector > &siblings = dependency_parts->siblings(); + dependency_features->Initialize(instance, parts, siblings); + + // Build features for arcs/siblings only. They will later be conjoined with + // the tags. + for (int m = 1; m < sentence_length; ++m) { + dependency_features->AddArcFeatures(sentence, descendents, + siblings, m); + } + + // Make sibling parts. 
+ if (GetDependencyLabelerOptions()->use_sibling_parts()) { + for (int h = 0; h < sentence_length; ++h) { + if (siblings[h].size() == 0) continue; + for (int i = 0; i < siblings[h].size() + 1; ++i) { + dependency_features->AddSiblingFeatures(sentence, descendents, + siblings, h, i); + } + } + } + //LOG(INFO) << "End MakeSelectedFeatures"; +} + +void DependencyLabelerPipe::LabelInstance(Parts *parts, + const std::vector &output, + Instance *instance) { + DependencyLabelerParts *dependency_parts = + static_cast(parts); + DependencyInstance *dependency_instance = + static_cast(instance); + int instance_length = dependency_instance->size(); + for (int m = 0; m < instance_length; ++m) { + dependency_instance->SetHead(m, -1); + dependency_instance->SetDependencyRelation(m, "NULL"); + } + double threshold = 0.5; + + int offset, num_labeled_arcs; + dependency_parts->GetOffsetArc(&offset, &num_labeled_arcs); + for (int r = 0; r < num_labeled_arcs; ++r) { + DependencyLabelerPartArc *arc = + static_cast((*dependency_parts)[offset + r]); + if (output[offset + r] >= threshold) { + dependency_instance->SetHead(arc->modifier(), arc->head()); + dependency_instance->SetDependencyRelation(arc->modifier(), + GetDependencyDictionary()-> + GetLabelName(arc->label())); + } + } + + for (int m = 1; m < instance_length; ++m) { + if (dependency_instance->GetHead(m) < 0) { + VLOG(2) << "Word without head."; + dependency_instance->SetHead(m, 0); + dependency_instance->SetDependencyRelation(m, + GetDependencyDictionary()->GetLabelName(0)); + } + } +} + +void DependencyLabelerPipe::ComputeDescendents( + const std::vector &heads, + std::vector >* descendents) const { + //LOG(INFO) << "Computing descendents"; + descendents->resize(heads.size()); + for (int h = 0; h < descendents->size(); ++h) { + (*descendents)[h].clear(); + } + for (int m = 1; m < heads.size(); ++m) { + (*descendents)[m].push_back(m); + std::vector ancestors; + GetAllAncestors(heads, m, &ancestors); + for (int k = 0; k < 
ancestors.size(); ++k) { + int h = ancestors[k]; + CHECK_GE(h, 0); + //LOG(INFO) << h << " " << descendents->size(); + (*descendents)[h].push_back(m); + } + } + //LOG(INFO) << "End computing descendents"; +} + +void DependencyLabelerPipe::GetAllAncestors(const std::vector &heads, + int descend, + std::vector* ancestors) const { + ancestors->clear(); + int h = heads[descend]; + while (h >= 0) { + ancestors->push_back(h); + h = heads[h]; + } +} diff --git a/src/dependency_labeler/DependencyLabelerPipe.h b/src/dependency_labeler/DependencyLabelerPipe.h index 10941e2..1a2e021 100644 --- a/src/dependency_labeler/DependencyLabelerPipe.h +++ b/src/dependency_labeler/DependencyLabelerPipe.h @@ -1,236 +1,235 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef DEPENDENCYLABELERPIPE_H_ -#define DEPENDENCYLABELERPIPE_H_ - -#include "Pipe.h" -#include "DependencyLabelerOptions.h" -#include "DependencyReader.h" -#include "DependencyDictionary.h" -#include "TokenDictionary.h" -#include "DependencyInstanceNumeric.h" -#include "DependencyWriter.h" -#include "DependencyLabelerPart.h" -#include "DependencyLabelerFeatures.h" -#include "DependencyLabelerDecoder.h" - -class DependencyLabelerPipe : public Pipe { -public: - DependencyLabelerPipe(Options* options) : Pipe(options) { - token_dictionary_ = NULL; - //pruner_parameters_ = NULL; - //train_pruner_ = false; - } - virtual ~DependencyLabelerPipe() { - //delete token_dictionary_; - //delete pruner_parameters_; - } - - // Same reader as the dependency parser. - DependencyReader *GetDependencyReader() { - return static_cast(reader_); - } - // Same dictionary as the dependency parser. - DependencyDictionary *GetDependencyDictionary() { - return static_cast(dictionary_); - } - DependencyLabelerDecoder *GetDependencyLabelerDecoder() { - return static_cast(decoder_); - } - DependencyLabelerOptions *GetDependencyLabelerOptions() { - return static_cast(options_); - } - - void Initialize() { - Pipe::Initialize(); - //pruner_parameters_ = new Parameters; - } - - //void SetPrunerParameters(Parameters *pruner_parameters) { - // pruner_parameters_ = pruner_parameters; - //} - //void LoadPrunerModelFile() { - // LoadPrunerModelByName(GetDependencyOptions()->GetPrunerModelFilePath()); - //} - -protected: - void CreateDictionary() { - dictionary_ = new DependencyDictionary(this); - GetDependencyDictionary()->SetTokenDictionary(token_dictionary_); - }; - void CreateReader() { reader_ = new DependencyReader; }; - void CreateWriter() { writer_ = new DependencyWriter; }; - void CreateDecoder() { decoder_ = new DependencyLabelerDecoder(this); }; - Parts *CreateParts() { return new DependencyLabelerParts; }; - Features *CreateFeatures() { return new DependencyLabelerFeatures(this); }; - - 
void CreateTokenDictionary() { - token_dictionary_ = new TokenDictionary(this); - }; - - Parameters *GetTrainingParameters() { - //if (train_pruner_) return pruner_parameters_; - return parameters_; - } - - void PreprocessData(); - - Instance *GetFormattedInstance(Instance *instance) { - DependencyInstanceNumeric *instance_numeric = - new DependencyInstanceNumeric; - instance_numeric->Initialize(*GetDependencyDictionary(), - static_cast(instance)); - return instance_numeric; - } - - void SaveModel(FILE* fs); - void LoadModel(FILE* fs); - - //void LoadPrunerModel(FILE* fs); - //void LoadPrunerModelByName(const string &model_name); - - void MakeParts(Instance *instance, Parts *parts, - std::vector *gold_outputs); - void MakeArcParts(Instance *instance, Parts *parts, - std::vector *gold_outputs); - void MakeSiblingParts(Instance *instance, - Parts *parts, - std::vector *gold_outputs); - - void MakeSelectedFeatures(Instance *instance, - Parts *parts, - const std::vector& selected_parts, - Features *features); - - void ComputeScores(Instance *instance, Parts *parts, Features *features, - std::vector *scores); - - void MakeFeatureDifference(Parts *parts, - Features *features, - const std::vector &gold_output, - const std::vector &predicted_output, - FeatureVector *difference); - - void MakeGradientStep(Parts *parts, - Features *features, - double eta, - int iteration, - const std::vector &gold_output, - const std::vector &predicted_output); - - void LabelInstance(Parts *parts, const std::vector &output, - Instance *instance); - - virtual void BeginEvaluation() { - num_head_mistakes_ = 0; - num_head_pruned_mistakes_ = 0; - num_heads_after_pruning_ = 0; - num_tokens_ = 0; - gettimeofday(&start_clock_, NULL); - } - virtual void EvaluateInstance(Instance *instance, - Instance *output_instance, - Parts *parts, - const std::vector &gold_outputs, - const std::vector &predicted_outputs) { - DependencyInstance *dependency_instance = - static_cast(instance); - DependencyInstance 
*dependency_output_instance = - static_cast(output_instance); - DependencyLabelerParts *dependency_parts = - static_cast(parts); - for (int m = 1; m < dependency_instance->size(); ++m) { - int head = -1; - int h = dependency_output_instance->GetHead(m); - int num_possible_heads = 0; - const vector &index_labeled_parts = - dependency_parts->FindArcs(m); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - int r = index_labeled_parts[k]; - if (r < 0) continue; - ++num_possible_heads; - if (gold_outputs[r] >= 0.5) { - CHECK_EQ(gold_outputs[r], 1.0); - if (!NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { - ++num_head_mistakes_; - } - head = h; - //break; - } - } - if (head < 0) { - VLOG(2) << "Pruned gold part..."; - ++num_head_mistakes_; - ++num_head_pruned_mistakes_; - } - ++num_tokens_; - num_heads_after_pruning_ += num_possible_heads; - } - } - virtual void EndEvaluation() { - LOG(INFO) << "Labeling accuracy: " << - static_cast(num_tokens_ - num_head_mistakes_) / - static_cast(num_tokens_); - LOG(INFO) << "Pruning recall: " << - static_cast(num_tokens_ - num_head_pruned_mistakes_) / - static_cast(num_tokens_); - LOG(INFO) << "Pruning efficiency: " << - static_cast(num_heads_after_pruning_) / - static_cast(num_tokens_) +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef DEPENDENCYLABELERPIPE_H_ +#define DEPENDENCYLABELERPIPE_H_ + +#include "Pipe.h" +#include "TimeUtils.h" +#include "DependencyLabelerOptions.h" +#include "DependencyReader.h" +#include "DependencyDictionary.h" +#include "TokenDictionary.h" +#include "DependencyInstanceNumeric.h" +#include "DependencyWriter.h" +#include "DependencyLabelerPart.h" +#include "DependencyLabelerFeatures.h" +#include "DependencyLabelerDecoder.h" + +class DependencyLabelerPipe : public Pipe { +public: + DependencyLabelerPipe(Options* options) : Pipe(options) { + token_dictionary_ = NULL; + //pruner_parameters_ = NULL; + //train_pruner_ = false; + } + virtual ~DependencyLabelerPipe() { + //delete token_dictionary_; + //delete pruner_parameters_; + } + + // Same reader as the dependency parser. + DependencyReader *GetDependencyReader() { + return static_cast(reader_); + } + // Same dictionary as the dependency parser. 
+ DependencyDictionary *GetDependencyDictionary() { + return static_cast(dictionary_); + } + DependencyLabelerDecoder *GetDependencyLabelerDecoder() { + return static_cast(decoder_); + } + DependencyLabelerOptions *GetDependencyLabelerOptions() { + return static_cast(options_); + } + + void Initialize() { + Pipe::Initialize(); + //pruner_parameters_ = new Parameters; + } + + //void SetPrunerParameters(Parameters *pruner_parameters) { + // pruner_parameters_ = pruner_parameters; + //} + //void LoadPrunerModelFile() { + // LoadPrunerModelByName(GetDependencyOptions()->GetPrunerModelFilePath()); + //} + +protected: + void CreateDictionary() { + dictionary_ = new DependencyDictionary(this); + GetDependencyDictionary()->SetTokenDictionary(token_dictionary_); + }; + void CreateReader() { reader_ = new DependencyReader; }; + void CreateWriter() { writer_ = new DependencyWriter; }; + void CreateDecoder() { decoder_ = new DependencyLabelerDecoder(this); }; + Parts *CreateParts() { return new DependencyLabelerParts; }; + Features *CreateFeatures() { return new DependencyLabelerFeatures(this); }; + + void CreateTokenDictionary() { + token_dictionary_ = new TokenDictionary(this); + }; + + Parameters *GetTrainingParameters() { + //if (train_pruner_) return pruner_parameters_; + return parameters_; + } + + void PreprocessData(); + + Instance *GetFormattedInstance(Instance *instance) { + DependencyInstanceNumeric *instance_numeric = + new DependencyInstanceNumeric; + instance_numeric->Initialize(*GetDependencyDictionary(), + static_cast(instance)); + return instance_numeric; + } + + void SaveModel(FILE* fs); + void LoadModel(FILE* fs); + + //void LoadPrunerModel(FILE* fs); + //void LoadPrunerModelByName(const string &model_name); + + void MakeParts(Instance *instance, Parts *parts, + std::vector *gold_outputs); + void MakeArcParts(Instance *instance, Parts *parts, + std::vector *gold_outputs); + void MakeSiblingParts(Instance *instance, + Parts *parts, + std::vector 
*gold_outputs); + + void MakeSelectedFeatures(Instance *instance, + Parts *parts, + const std::vector& selected_parts, + Features *features); + + void ComputeScores(Instance *instance, Parts *parts, Features *features, + std::vector *scores); + + void MakeFeatureDifference(Parts *parts, + Features *features, + const std::vector &gold_output, + const std::vector &predicted_output, + FeatureVector *difference); + + void MakeGradientStep(Parts *parts, + Features *features, + double eta, + int iteration, + const std::vector &gold_output, + const std::vector &predicted_output); + + void LabelInstance(Parts *parts, const std::vector &output, + Instance *instance); + + virtual void BeginEvaluation() { + num_head_mistakes_ = 0; + num_head_pruned_mistakes_ = 0; + num_heads_after_pruning_ = 0; + num_tokens_ = 0; + chrono.GetTime(); + } + virtual void EvaluateInstance(Instance *instance, + Instance *output_instance, + Parts *parts, + const std::vector &gold_outputs, + const std::vector &predicted_outputs) { + DependencyInstance *dependency_instance = + static_cast(instance); + DependencyInstance *dependency_output_instance = + static_cast(output_instance); + DependencyLabelerParts *dependency_parts = + static_cast(parts); + for (int m = 1; m < dependency_instance->size(); ++m) { + int head = -1; + int h = dependency_output_instance->GetHead(m); + int num_possible_heads = 0; + const vector &index_labeled_parts = + dependency_parts->FindArcs(m); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + int r = index_labeled_parts[k]; + if (r < 0) continue; + ++num_possible_heads; + if (gold_outputs[r] >= 0.5) { + CHECK_EQ(gold_outputs[r], 1.0); + if (!NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { + ++num_head_mistakes_; + } + head = h; + //break; + } + } + if (head < 0) { + VLOG(2) << "Pruned gold part..."; + ++num_head_mistakes_; + ++num_head_pruned_mistakes_; + } + ++num_tokens_; + num_heads_after_pruning_ += num_possible_heads; + } + } + virtual void 
EndEvaluation() { + LOG(INFO) << "Labeling accuracy: " << + static_cast(num_tokens_ - num_head_mistakes_) / + static_cast(num_tokens_); + LOG(INFO) << "Pruning recall: " << + static_cast(num_tokens_ - num_head_pruned_mistakes_) / + static_cast(num_tokens_); + LOG(INFO) << "Pruning efficiency: " << + static_cast(num_heads_after_pruning_) / + static_cast(num_tokens_) << " possible labels per token."; - timeval end_clock; - gettimeofday(&end_clock, NULL); - double num_seconds = - static_cast(diff_ms(end_clock, start_clock_)) / 1000.0; - double tokens_per_second = static_cast(num_tokens_) / num_seconds; - LOG(INFO) << "Labeling speed: " - << tokens_per_second << " tokens per second."; - } - - void ComputeDescendents(const std::vector &heads, - std::vector >* descendents) const; - - void GetAllAncestors(const std::vector &heads, - int descend, - std::vector* ancestors) const; - - int GetSiblingLabel(int sibling, int modifier) { - CHECK_GE(sibling, -1); - CHECK_GE(modifier, -1); - int num_labels = GetDependencyDictionary()->GetLabelAlphabet().size(); - return ((1 + sibling) * (1 + num_labels) + (1 + modifier)); - } - - //bool ExistsPath(const vector &heads, - // int ancest, - // int descend) const; - //bool IsProjectiveArc(const vector &heads, int par, int ch) const; - -protected: - TokenDictionary *token_dictionary_; - //bool train_pruner_; - //Parameters *pruner_parameters_; - int num_head_mistakes_; - int num_head_pruned_mistakes_; - int num_heads_after_pruning_; + chrono.StopTime(); + double num_seconds = chrono.GetElapsedTime(); + double tokens_per_second = static_cast(num_tokens_) / num_seconds; + LOG(INFO) << "Labeling speed: " + << tokens_per_second << " tokens per second."; + } + + void ComputeDescendents(const std::vector &heads, + std::vector >* descendents) const; + + void GetAllAncestors(const std::vector &heads, + int descend, + std::vector* ancestors) const; + + int GetSiblingLabel(int sibling, int modifier) { + CHECK_GE(sibling, -1); + CHECK_GE(modifier, 
-1); + int num_labels = GetDependencyDictionary()->GetLabelAlphabet().size(); + return ((1 + sibling) * (1 + num_labels) + (1 + modifier)); + } + + //bool ExistsPath(const vector &heads, + // int ancest, + // int descend) const; + //bool IsProjectiveArc(const vector &heads, int par, int ch) const; + +protected: + TokenDictionary *token_dictionary_; + //bool train_pruner_; + //Parameters *pruner_parameters_; + int num_head_mistakes_; + int num_head_pruned_mistakes_; + int num_heads_after_pruning_; int num_tokens_; - timeval start_clock_; -}; - -#endif /* DEPENDENCYLABELERPIPE_H_ */ + chronowrap::Chronometer chrono; +}; + +#endif /* DEPENDENCYLABELERPIPE_H_ */ diff --git a/src/dependency_labeler/TurboDependencyLabeler.cpp b/src/dependency_labeler/TurboDependencyLabeler.cpp index 58b227d..154c2d8 100644 --- a/src/dependency_labeler/TurboDependencyLabeler.cpp +++ b/src/dependency_labeler/TurboDependencyLabeler.cpp @@ -1,103 +1,101 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "DependencyLabelerPipe.h" - -using namespace std; - -void TrainDependencyLabeler(); -void TestDependencyLabeler(); - -int main(int argc, char** argv) { - // Initialize Google's logging library. - google::InitGoogleLogging(argv[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_train) { - LOG(INFO) << "Training dependency labeler..." << endl; - TrainDependencyLabeler(); - } else if (FLAGS_test) { - LOG(INFO) << "Running dependency labeler..." << endl; - TestDependencyLabeler(); - } - - // Destroy allocated memory regarding line flags. - google::ShutDownCommandLineFlags(); - google::ShutdownGoogleLogging(); - return 0; -} - -void TrainDependencyLabeler() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - DependencyLabelerOptions *options = new DependencyLabelerOptions; - options->Initialize(); - - DependencyLabelerPipe *pipe = new DependencyLabelerPipe(options); - pipe->Initialize(); - - LOG(INFO) << "Training the dependency labeler..."; - pipe->Train(); - pipe->SaveModelFile(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Training took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TestDependencyLabeler() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - DependencyLabelerOptions *options = new DependencyLabelerOptions; - options->Initialize(); - - DependencyLabelerPipe *pipe = new DependencyLabelerPipe(options); - pipe->Initialize(); - pipe->LoadModelFile(); - pipe->Run(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Testing took " << static_cast(time) / 1000.0 - << " sec." << endl; -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. 
+// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "DependencyLabelerPipe.h" + +using namespace std; + +void TrainDependencyLabeler(); +void TestDependencyLabeler(); + +int main(int argc, char** argv) { + // Initialize Google's logging library. + google::InitGoogleLogging(argv[0]); + + // Parse command line flags. + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_train) { + LOG(INFO) << "Training dependency labeler..." << endl; + TrainDependencyLabeler(); + } else if (FLAGS_test) { + LOG(INFO) << "Running dependency labeler..." << endl; + TestDependencyLabeler(); + } + + // Destroy allocated memory regarding line flags. + google::ShutDownCommandLineFlags(); + google::ShutdownGoogleLogging(); + return 0; +} + +void TrainDependencyLabeler() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + DependencyLabelerOptions *options = new DependencyLabelerOptions; + options->Initialize(); + + DependencyLabelerPipe *pipe = new DependencyLabelerPipe(options); + pipe->Initialize(); + + LOG(INFO) << "Training the dependency labeler..."; + pipe->Train(); + pipe->SaveModelFile(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Training took " << time << " sec." 
<< endl; +} + +void TestDependencyLabeler() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + DependencyLabelerOptions *options = new DependencyLabelerOptions; + options->Initialize(); + + DependencyLabelerPipe *pipe = new DependencyLabelerPipe(options); + pipe->Initialize(); + pipe->LoadModelFile(); + pipe->Run(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Testing took " << time << " sec." << endl; +} diff --git a/src/entity_recognizer/EntityDecoder.cpp b/src/entity_recognizer/EntityDecoder.cpp new file mode 100644 index 0000000..9aac8f1 --- /dev/null +++ b/src/entity_recognizer/EntityDecoder.cpp @@ -0,0 +1,93 @@ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "Dictionary.h" +#include "EntityDecoder.h" +#include "SequencePart.h" +#include "EntityPipe.h" +#include // Remove this. 
+ +DEFINE_double(ner_train_cost_false_positives, 0.5, + "Cost for 'false positives' -- penalises recall and favours precision in BIO tagging."); +DEFINE_double(ner_train_cost_false_negatives, 0.5, + "Cost for 'false negatives' -- penalises precision and favours recall in BIO tagging."); + +void EntityDecoder::DecodeCostAugmented(Instance *instance, Parts *parts, + const vector &scores, + const vector &gold_output, + vector *predicted_output, + double *cost, + double *loss) { + + SequenceParts *sequence_parts = static_cast(parts); + int offset_unigrams, num_unigrams; + + sequence_parts->GetOffsetUnigram(&offset_unigrams, &num_unigrams); + + //////////////////////////////////////////////////// + // F1: a = 0.5, b = 0.5. + // Recall: a = 0, b = 1. + // In general: + // p = a - (a+b)*z0 + // q = b*sum(z0) + // p'*z + q = a*sum(z) - (a+b)*z0'*z + b*sum(z0) + // = a*(1-z0)'*z + b*(1-z)'*z0. + //////////////////////////////////////////////////// + + // Penalty for predicting 1 when it is 0 (FP). + double a = FLAGS_ner_train_cost_false_positives; + // Penalty for predicting 0 when it is 1 (FN). 
+ double b = FLAGS_ner_train_cost_false_negatives; + //double b = 1 - a; + + // p = 0.5-z0, q = 0.5'*z0, loss = p'*z + q + double q = 0.0; + vector p(num_unigrams, 0.0); + + vector scores_cost = scores; + + for (int r = 0; r < num_unigrams; ++r) { + + SequenceDictionary *dictionary; + dictionary = static_cast(pipe_->GetSequenceDictionary()); + SequencePartUnigram *unigram_part = (static_cast((*sequence_parts)[r])); + int tag = unigram_part->tag(); + const std::string & tag_name = dictionary->GetTagName(tag); + EntityOptions *entity_options = static_cast(pipe_)->GetEntityOptions(); + if (tag_name[0] != 'O'){ // if inside (not outside) + p[r] = a - (a+b)*gold_output[offset_unigrams + r]; + q += b*gold_output[offset_unigrams + r]; + } else { + p[r] = 0; + } + scores_cost[offset_unigrams + r] += p[r]; + } + + Decode(instance, parts, scores_cost, predicted_output); + + *cost = q; + for (int r = 0; r < num_unigrams; ++r) { + *cost += p[r] * (*predicted_output)[offset_unigrams + r]; + } + + *loss = *cost; + for (int r = 0; r < parts->size(); ++r) { + *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); + } + +} diff --git a/src/entity_recognizer/EntityDecoder.h b/src/entity_recognizer/EntityDecoder.h new file mode 100644 index 0000000..2ca6d6d --- /dev/null +++ b/src/entity_recognizer/EntityDecoder.h @@ -0,0 +1,39 @@ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef ENTITYDECODER_H_ +#define ENTITYDECODER_H_ + +#include "../sequence/SequenceDecoder.h" + + +class EntityDecoder : public SequenceDecoder { +public: + EntityDecoder() {}; + EntityDecoder(SequencePipe *pipe) : SequenceDecoder(pipe) {}; // EntityPipe SequencePipe?? + virtual ~EntityDecoder() {}; + + virtual void DecodeCostAugmented(Instance *instance, Parts *parts, + const vector &scores, + const vector &gold_output, + vector *predicted_output, + double *cost, + double *loss); +}; + +#endif /* ENTITYDECODER_H_ */ diff --git a/src/entity_recognizer/EntityDictionary.cpp b/src/entity_recognizer/EntityDictionary.cpp index b87d7af..90ea704 100644 --- a/src/entity_recognizer/EntityDictionary.cpp +++ b/src/entity_recognizer/EntityDictionary.cpp @@ -1,292 +1,434 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "EntityDictionary.h" -#include "EntityOptions.h" -#include "EntityPipe.h" -#include - -void EntityDictionary::CreateTagDictionary(SequenceReader *reader) { - SequenceDictionary::CreateTagDictionary(reader); - - // TODO: the SplitEntityTag function should probably be elsewhere and not on - // EntityInstance. 
- EntityInstance instance; - Alphabet entities; - - // Display information about the entity tags. - LOG(INFO) << "Found " << tag_alphabet_.size() << " entity tags:"; - for (Alphabet::iterator it = tag_alphabet_.begin(); - it != tag_alphabet_.end(); ++it) { - std::string entity_tag = it->first; - LOG(INFO) << entity_tag; - - std::string prefix, entity; - instance.SplitEntityTag(it->first, &prefix, &entity); - if (entity != "") entities.Insert(entity); - } - - LOG(INFO) << "Entities:"; - for (Alphabet::iterator it = entities.begin(); it != entities.end(); ++it) { - LOG(INFO) << it->first; - } - - LOG(INFO) << "Computing allowed bigrams..."; - // Every bigram is allowed by default. - allowed_bigrams_.assign(1 + tag_alphabet_.size(), - std::vector(1 + tag_alphabet_.size(), true)); - // Now add the BIO-like constraints. - for (Alphabet::iterator it = entities.begin(); it != entities.end(); ++it) { - std::string entity = it->first; - LOG(INFO) << "Processing entity " << entity << "..."; - if (static_cast(pipe_)->GetEntityOptions()->tagging_scheme() == - EntityTaggingSchemes::BIO) { - int tag_begin = tag_alphabet_.Lookup("B-" + entity); - int tag_inside = tag_alphabet_.Lookup("I-" + entity); - if (tag_inside < 0) continue; - // An I-tag can only occur after a B-tag or another I-tag of the same - // entity. - for (int left_tag = -1; left_tag < tag_alphabet_.size(); ++left_tag) { - if (left_tag != tag_begin && left_tag != tag_inside) { - allowed_bigrams_[1 + tag_inside][1 + left_tag] = false; - } - } - } else if (static_cast(pipe_)->GetEntityOptions()-> - tagging_scheme() == EntityTaggingSchemes::BILOU) { - int tag_begin = tag_alphabet_.Lookup("B-" + entity); - int tag_inside = tag_alphabet_.Lookup("I-" + entity); - int tag_last = tag_alphabet_.Lookup("L-" + entity); - // I-tags and L-tags can only occur after a B-tag or an I-tag of the same - // entity. 
- for (int left_tag = -1; left_tag < tag_alphabet_.size(); ++left_tag) { - if (left_tag != tag_begin && left_tag != tag_inside) { - if (tag_inside >= 0) { - allowed_bigrams_[1 + tag_inside][1 + left_tag] = false; - } - if (tag_last >= 0) { - allowed_bigrams_[1 + tag_last][1 + left_tag] = false; - } - } - } - // I-tags and B-tags can only occur before an I-tag or an L-tag of the - // same entity. - for (int right_tag = -1; right_tag < tag_alphabet_.size(); ++right_tag) { - if (right_tag != tag_last && right_tag != tag_inside) { - if (tag_inside >= 0) { - allowed_bigrams_[1 + right_tag][1 + tag_inside] = false; - } - if (tag_begin >= 0) { - allowed_bigrams_[1 + right_tag][1 + tag_begin] = false; - } - } - } - } - } - - tag_alphabet_.BuildNames(); // Just to be able to plot readable information... - int num_allowed_bigrams = 0; - for (int tag = -1; tag < tag_alphabet_.size(); ++tag) { - for (int left_tag = -1; left_tag < tag_alphabet_.size(); ++left_tag) { - if (IsAllowedBigram(left_tag, tag)) { - std::string left_tag_name = (left_tag >= 0) ? - tag_alphabet_.GetName(left_tag) : "START"; - std::string tag_name = (tag >= 0) ? - tag_alphabet_.GetName(tag) : "STOP"; - - LOG(INFO) << "Allowed bigram: " - << left_tag_name - << " -> " - << tag_name; - - ++num_allowed_bigrams; - } - } - } - - LOG(INFO) << "Total allowed bigrams: " << num_allowed_bigrams; - - ReadGazetteerFiles(); -} - -void EntityDictionary::ReadGazetteerFiles() { - EntityOptions *options = - static_cast(pipe_->GetOptions()); - - gazetteer_word_alphabet_.AllowGrowth(); - gazetteer_entity_tag_alphabet_.AllowGrowth(); - - if (options->file_gazetteer() != "") { - LOG(INFO) << "Loading gazetteer file " - << options->file_gazetteer() << "..."; - std::ifstream is; - std::string line; - - // Do a first pass just to count the words and create the - // dictionaries. 
- is.open(options->file_gazetteer().c_str(), ifstream::in); - CHECK(is.good()) << "Could not open " - << options->file_gazetteer() << "."; - if (is.is_open()) { - while (!is.eof()) { - getline(is, line); - if (line == "") continue; // Ignore blank lines. - std::vector fields; - StringSplit(line, " \t", &fields, true); // Break on tabs or spaces. - if (fields.size() < 2) continue; - const std::string &entity_type = fields[0]; - gazetteer_entity_tag_alphabet_.Insert("B-" + entity_type); - gazetteer_entity_tag_alphabet_.Insert("I-" + entity_type); - gazetteer_entity_tag_alphabet_.Insert("L-" + entity_type); - gazetteer_entity_tag_alphabet_.Insert("U-" + entity_type); - for (int k = 1; k < fields.size(); ++k) { - const std::string &word = fields[k]; - gazetteer_word_alphabet_.Insert(word); - } - } - } - is.close(); - - // Now do the second pass to actually fill in the data. - gazetteer_word_entity_tags_.clear(); - gazetteer_word_entity_tags_.resize(gazetteer_word_alphabet_.size()); - is.open(options->file_gazetteer().c_str(), ifstream::in); - CHECK(is.good()) << "Could not open " - << options->file_gazetteer() << "."; - if (is.is_open()) { - while (!is.eof()) { - getline(is, line); - if (line == "") continue; // Ignore blank lines. - std::vector fields; - StringSplit(line, " \t", &fields, true); // Break on tabs or spaces. 
- if (fields.size() < 2) continue; - const std::string &entity_type = fields[0]; - int entity_type_begin_id = - gazetteer_entity_tag_alphabet_.Lookup("B-" + entity_type); - int entity_type_inside_id = - gazetteer_entity_tag_alphabet_.Lookup("I-" + entity_type); - int entity_type_last_id = - gazetteer_entity_tag_alphabet_.Lookup("L-" + entity_type); - int entity_type_unique_id = - gazetteer_entity_tag_alphabet_.Lookup("U-" + entity_type); - for (int k = 1; k < fields.size(); ++k) { - const std::string &word = fields[k]; - int word_id = gazetteer_word_alphabet_.Lookup(word); - CHECK_GE(word_id, 0); - CHECK_LT(word_id, gazetteer_word_entity_tags_.size()); - int entity_type_id = -1; - if (fields.size() == 2) { - entity_type_id = entity_type_unique_id; - } else if (k == 1) { - entity_type_id = entity_type_begin_id; - } else if (k == fields.size() - 1) { - entity_type_id = entity_type_last_id; - } else { - entity_type_id = entity_type_inside_id; - } - int l = -1; - for (l = 0; l < gazetteer_word_entity_tags_[word_id].size(); ++l) { - if (gazetteer_word_entity_tags_[word_id][l] == entity_type_id) { - break; - } - } - if (l == gazetteer_word_entity_tags_[word_id].size()) { - gazetteer_word_entity_tags_[word_id]. 
- push_back(entity_type_id); - } - } - } - } - is.close(); - } - - gazetteer_word_alphabet_.StopGrowth(); - gazetteer_entity_tag_alphabet_.StopGrowth(); - LOG(INFO) << "Number of gazetteer words: " - << gazetteer_word_alphabet_.size(); - LOG(INFO) << "Number of gazetteer entity tags: " - << gazetteer_entity_tag_alphabet_.size(); -} - -void EntityTokenDictionary::Initialize(EntityReader *reader) { - this->TokenDictionary::Initialize(reader); - - std::vector pos_freqs; - Alphabet pos_alphabet; - - std::string special_symbols[NUM_SPECIAL_TOKENS]; - special_symbols[TOKEN_UNKNOWN] = kTokenUnknown; - special_symbols[TOKEN_START] = kTokenStart; - special_symbols[TOKEN_STOP] = kTokenStop; - - for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { - pos_alphabet.Insert(special_symbols[i]); - - // Counts of special symbols are set to -1: - pos_freqs.push_back(-1); - } - - // Go through the corpus and build the dictionaries, - // counting the frequencies. - reader->Open(pipe_->GetOptions()->GetTrainingFilePath()); - EntityInstance *instance = - static_cast(reader->GetNext()); - while (instance != NULL) { - int instance_length = instance->size(); - for (int i = 0; i < instance_length; ++i) { - int id; - // Add POS to alphabet. - id = pos_alphabet.Insert(instance->GetPosTag(i)); - if (id >= pos_freqs.size()) { - CHECK_EQ(id, pos_freqs.size()); - pos_freqs.push_back(0); - } - ++pos_freqs[id]; - } - delete instance; - instance = static_cast(reader->GetNext()); - } - reader->Close(); - - // Now adjust the cutoffs if necessary. 
- while (true) { - pos_alphabet_.clear(); - for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { - pos_alphabet_.Insert(special_symbols[i]); - } - for (const auto& pos_token : pos_alphabet) { - if (pos_freqs[pos_token.second] > pos_cutoff) { - pos_alphabet_.Insert(pos_token.first); - } - } - if (pos_alphabet_.size() < kMaxPosAlphabetSize) break; - ++pos_cutoff; - LOG(INFO) << "Incrementing POS cutoff to " << pos_cutoff << "..."; - } - - form_alphabet_.StopGrowth(); - form_lower_alphabet_.StopGrowth(); - lemma_alphabet_.StopGrowth(); - prefix_alphabet_.StopGrowth(); - suffix_alphabet_.StopGrowth(); - feats_alphabet_.StopGrowth(); - pos_alphabet_.StopGrowth(); - cpos_alphabet_.StopGrowth(); - - LOG(INFO) << "Number of pos: " << pos_alphabet_.size(); - CHECK_LT(pos_alphabet_.size(), 0xff); -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#include "EntityDictionary.h" +#include "EntityOptions.h" +#include "EntityPipe.h" +#include +#include + +EntityDictionary::EntityDictionary(Pipe* pipe) : SequenceDictionary(pipe) { + EntityOptions *options = + static_cast(pipe->GetOptions()); + gazetteer_case_sensitive_ = options->gazetteer_case_sensitive(); +} + +void EntityDictionary::CreateTagDictionary(SequenceReader *reader) { + SequenceDictionary::CreateTagDictionary(reader); + + // TODO: the SplitEntityTag function should probably be elsewhere and not on + // EntityInstance. + EntityInstance instance; + Alphabet entities; + + // Display information about the entity tags. + LOG(INFO) << "Found " << tag_alphabet_.size() << " entity tags:"; + for (Alphabet::iterator it = tag_alphabet_.begin(); + it != tag_alphabet_.end(); ++it) { + std::string entity_tag = it->first; + LOG(INFO) << entity_tag; + + std::string prefix, entity; + instance.SplitEntityTag(it->first, &prefix, &entity); + if (entity != "") entities.Insert(entity); + } + + LOG(INFO) << "Entities:"; + for (Alphabet::iterator it = entities.begin(); it != entities.end(); ++it) { + LOG(INFO) << it->first; + } + + LOG(INFO) << "Computing allowed bigrams..."; + // Every bigram is allowed by default. + allowed_bigrams_.assign(1 + tag_alphabet_.size(), + std::vector(1 + tag_alphabet_.size(), true)); + // Now add the BIO-like constraints. + for (Alphabet::iterator it = entities.begin(); it != entities.end(); ++it) { + std::string entity = it->first; + LOG(INFO) << "Processing entity " << entity << "..."; + if (static_cast(pipe_)->GetEntityOptions()->tagging_scheme() == + EntityTaggingSchemes::BIO) { + int tag_begin = tag_alphabet_.Lookup("B-" + entity); + int tag_inside = tag_alphabet_.Lookup("I-" + entity); + if (tag_inside < 0) continue; + // An I-tag can only occur after a B-tag or another I-tag of the same + // entity. 
+ for (int left_tag = -1; left_tag < tag_alphabet_.size(); ++left_tag) { + if (left_tag != tag_begin && left_tag != tag_inside) { + allowed_bigrams_[1 + tag_inside][1 + left_tag] = false; + } + } + } else if (static_cast(pipe_)->GetEntityOptions()-> + tagging_scheme() == EntityTaggingSchemes::BILOU) { + int tag_begin = tag_alphabet_.Lookup("B-" + entity); + int tag_inside = tag_alphabet_.Lookup("I-" + entity); + int tag_last = tag_alphabet_.Lookup("L-" + entity); + // I-tags and L-tags can only occur after a B-tag or an I-tag of the same + // entity. + for (int left_tag = -1; left_tag < tag_alphabet_.size(); ++left_tag) { + if (left_tag != tag_begin && left_tag != tag_inside) { + if (tag_inside >= 0) { + allowed_bigrams_[1 + tag_inside][1 + left_tag] = false; + } + if (tag_last >= 0) { + allowed_bigrams_[1 + tag_last][1 + left_tag] = false; + } + } + } + // I-tags and B-tags can only occur before an I-tag or an L-tag of the + // same entity. + for (int right_tag = -1; right_tag < tag_alphabet_.size(); ++right_tag) { + if (right_tag != tag_last && right_tag != tag_inside) { + if (tag_inside >= 0) { + allowed_bigrams_[1 + right_tag][1 + tag_inside] = false; + } + if (tag_begin >= 0) { + allowed_bigrams_[1 + right_tag][1 + tag_begin] = false; + } + } + } + } + } + + tag_alphabet_.BuildNames(); // Just to be able to plot readable information... + int num_allowed_bigrams = 0; + for (int tag = -1; tag < tag_alphabet_.size(); ++tag) { + for (int left_tag = -1; left_tag < tag_alphabet_.size(); ++left_tag) { + if (IsAllowedBigram(left_tag, tag)) { + std::string left_tag_name = (left_tag >= 0) ? + tag_alphabet_.GetName(left_tag) : "START"; + std::string tag_name = (tag >= 0) ? 
+ tag_alphabet_.GetName(tag) : "STOP"; + + LOG(INFO) << "Allowed bigram: " + << left_tag_name + << " -> " + << tag_name; + + ++num_allowed_bigrams; + } + } + } + + LOG(INFO) << "Total allowed bigrams: " << num_allowed_bigrams; + + ReadGazetteerFiles(); +} + +void EntityDictionary::ReadGazetteerFiles() { + EntityOptions *options = + static_cast(pipe_->GetOptions()); + gazetteer_case_sensitive_ = options->gazetteer_case_sensitive(); + + gazetteer_word_alphabet_.AllowGrowth(); + gazetteer_entity_tag_alphabet_.AllowGrowth(); + + if (options->file_gazetteer() != "") { + LOG(INFO) << "Loading gazetteer file " + << options->file_gazetteer() << "..."; + std::ifstream is; + std::string line; + + // Do a first pass just to count the words and create the + // dictionaries. + is.open(options->file_gazetteer().c_str(), ifstream::in); + CHECK(is.good()) << "Could not open " + << options->file_gazetteer() << "."; + if (is.is_open()) { + while (!is.eof()) { + getline(is, line); + if (line == "") continue; // Ignore blank lines. + std::vector fields; + StringSplit(line, " \t", &fields, true); // Break on tabs or spaces. + if (fields.size() < 2) continue; + const std::string &entity_type = fields[0]; + gazetteer_entity_tag_alphabet_.Insert("B-" + entity_type); + gazetteer_entity_tag_alphabet_.Insert("I-" + entity_type); + gazetteer_entity_tag_alphabet_.Insert("L-" + entity_type); + gazetteer_entity_tag_alphabet_.Insert("U-" + entity_type); + for (int k = 1; k < fields.size(); ++k) { + if (!gazetteer_case_sensitive_) { + std::transform(fields[k].begin(), fields[k].end(), + fields[k].begin(), ::tolower); + } + const std::string &word = fields[k]; + gazetteer_word_alphabet_.Insert(word); + } + } + } + is.close(); + + // Now do the second pass to actually fill in the data. 
+ gazetteer_word_entity_tags_.clear(); + gazetteer_word_entity_tags_.resize(gazetteer_word_alphabet_.size()); + is.open(options->file_gazetteer().c_str(), ifstream::in); + CHECK(is.good()) << "Could not open " + << options->file_gazetteer() << "."; + if (is.is_open()) { + while (!is.eof()) { + getline(is, line); + if (line == "") continue; // Ignore blank lines. + std::vector fields; + StringSplit(line, " \t", &fields, true); // Break on tabs or spaces. + if (fields.size() < 2) continue; + const std::string &entity_type = fields[0]; + int entity_type_begin_id = + gazetteer_entity_tag_alphabet_.Lookup("B-" + entity_type); + int entity_type_inside_id = + gazetteer_entity_tag_alphabet_.Lookup("I-" + entity_type); + int entity_type_last_id = + gazetteer_entity_tag_alphabet_.Lookup("L-" + entity_type); + int entity_type_unique_id = + gazetteer_entity_tag_alphabet_.Lookup("U-" + entity_type); + for (int k = 1; k < fields.size(); ++k) { + if (!gazetteer_case_sensitive_) { + std::transform(fields[k].begin(), fields[k].end(), + fields[k].begin(), ::tolower); + } + const std::string &word = fields[k]; + int word_id = gazetteer_word_alphabet_.Lookup(word); + CHECK_GE(word_id, 0); + CHECK_LT(word_id, gazetteer_word_entity_tags_.size()); + int entity_type_id = -1; + if (fields.size() == 2) { + entity_type_id = entity_type_unique_id; + } else if (k == 1) { + entity_type_id = entity_type_begin_id; + } else if (k == fields.size() - 1) { + entity_type_id = entity_type_last_id; + } else { + entity_type_id = entity_type_inside_id; + } + int l = -1; + for (l = 0; l < gazetteer_word_entity_tags_[word_id].size(); ++l) { + if (gazetteer_word_entity_tags_[word_id][l] == entity_type_id) { + break; + } + } + if (l == gazetteer_word_entity_tags_[word_id].size()) { + gazetteer_word_entity_tags_[word_id]. 
+ push_back(entity_type_id); + } + } + } + } + is.close(); + } + + gazetteer_word_alphabet_.StopGrowth(); + gazetteer_entity_tag_alphabet_.StopGrowth(); + LOG(INFO) << "Number of gazetteer words: " + << gazetteer_word_alphabet_.size(); + LOG(INFO) << "Number of gazetteer entity tags: " + << gazetteer_entity_tag_alphabet_.size(); +} + +void EntityTokenDictionary::Initialize(EntityReader *reader) { + SetTokenDictionaryFlagValues(); + LOG(INFO) << "Creating token dictionary..."; + + std::vector form_freqs; + std::vector form_lower_freqs; + std::vector shape_freqs; + std::vector pos_freqs; + Alphabet form_alphabet; + Alphabet form_lower_alphabet; + Alphabet shape_alphabet; + Alphabet pos_alphabet; + + std::string special_symbols[NUM_SPECIAL_TOKENS]; + special_symbols[TOKEN_UNKNOWN] = kTokenUnknown; + special_symbols[TOKEN_START] = kTokenStart; + special_symbols[TOKEN_STOP] = kTokenStop; + + for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { + prefix_alphabet_.Insert(special_symbols[i]); + suffix_alphabet_.Insert(special_symbols[i]); + form_alphabet.Insert(special_symbols[i]); + form_lower_alphabet.Insert(special_symbols[i]); + shape_alphabet.Insert(special_symbols[i]); + pos_alphabet.Insert(special_symbols[i]); + + // Counts of special symbols are set to -1: + form_freqs.push_back(-1); + form_lower_freqs.push_back(-1); + shape_freqs.push_back(-1); + pos_freqs.push_back(-1); + } + + // Go through the corpus and build the dictionaries, + // counting the frequencies. + reader->Open(pipe_->GetOptions()->GetTrainingFilePath()); + EntityInstance *instance = + static_cast(reader->GetNext()); + while (instance != NULL) { + int instance_length = instance->size(); + for (int i = 0; i < instance_length; ++i) { + int id; + + // Add form to alphabet. 
+ std::string form = instance->GetForm(i); + std::string form_lower(form); + std::transform(form_lower.begin(), form_lower.end(), + form_lower.begin(), ::tolower); + if (!form_case_sensitive) form = form_lower; + id = form_alphabet.Insert(form); + if (id >= form_freqs.size()) { + CHECK_EQ(id, form_freqs.size()); + form_freqs.push_back(0); + } + ++form_freqs[id]; + + // Add lower-case form to the alphabet. + id = form_lower_alphabet.Insert(form_lower); + if (id >= form_lower_freqs.size()) { + CHECK_EQ(id, form_lower_freqs.size()); + form_lower_freqs.push_back(0); + } + ++form_lower_freqs[id]; + + // Add prefix/suffix to alphabet. + std::string prefix = form.substr(0, prefix_length); + id = prefix_alphabet_.Insert(prefix); + int start = form.length() - suffix_length; + if (start < 0) start = 0; + std::string suffix = form.substr(start, suffix_length); + id = suffix_alphabet_.Insert(suffix); + + // Add shape to alphabet. + std::string shape; + GetWordShape(instance->GetForm(i), &shape); + id = shape_alphabet.Insert(shape); + if (id >= shape_freqs.size()) { + CHECK_EQ(id, shape_freqs.size()); + shape_freqs.push_back(0); + } + ++shape_freqs[id]; + + // Add POS to alphabet. + id = pos_alphabet.Insert(instance->GetPosTag(i)); + if (id >= pos_freqs.size()) { + CHECK_EQ(id, pos_freqs.size()); + pos_freqs.push_back(0); + } + ++pos_freqs[id]; + } + delete instance; + instance = static_cast(reader->GetNext()); + } + reader->Close(); + + // Now adjust the cutoffs if necessary. 
+ while (true) { + form_alphabet_.clear(); + for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { + form_alphabet_.Insert(special_symbols[i]); + } + for (Alphabet::iterator iter = form_alphabet.begin(); + iter != form_alphabet.end(); + ++iter) { + if (form_freqs[iter->second] > form_cutoff) { + form_alphabet_.Insert(iter->first); + } + } + if (form_alphabet_.size() < kMaxFormAlphabetSize) break; + ++form_cutoff; + LOG(INFO) << "Incrementing form cutoff to " << form_cutoff << "..."; + } + + while (true) { + form_lower_alphabet_.clear(); + for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { + form_lower_alphabet_.Insert(special_symbols[i]); + } + for (Alphabet::iterator iter = form_lower_alphabet.begin(); + iter != form_lower_alphabet.end(); + ++iter) { + if (form_lower_freqs[iter->second] > form_lower_cutoff) { + form_lower_alphabet_.Insert(iter->first); + } + } + if (form_lower_alphabet_.size() < kMaxFormAlphabetSize) break; + ++form_lower_cutoff; + LOG(INFO) << "Incrementing lower-case form cutoff to " + << form_lower_cutoff << "..."; + } + + while (true) { + shape_alphabet_.clear(); + for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { + shape_alphabet_.Insert(special_symbols[i]); + } + for (Alphabet::iterator iter = shape_alphabet.begin(); + iter != shape_alphabet.end(); + ++iter) { + if (shape_freqs[iter->second] > shape_cutoff) { + shape_alphabet_.Insert(iter->first); + } + } + if (shape_alphabet_.size() < kMaxShapeAlphabetSize) break; + ++shape_cutoff; + LOG(INFO) << "Incrementing shape cutoff to " << shape_cutoff << "..."; + } + + while (true) { + pos_alphabet_.clear(); + for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { + pos_alphabet_.Insert(special_symbols[i]); + } + for (const auto& pos_token : pos_alphabet) { + if (pos_freqs[pos_token.second] > pos_cutoff) { + pos_alphabet_.Insert(pos_token.first); + } + } + if (pos_alphabet_.size() < kMaxPosAlphabetSize) break; + ++pos_cutoff; + LOG(INFO) << "Incrementing POS cutoff to " << pos_cutoff << "..."; + } + + 
form_alphabet_.StopGrowth(); + form_lower_alphabet_.StopGrowth(); + shape_alphabet_.StopGrowth(); + lemma_alphabet_.StopGrowth(); + prefix_alphabet_.StopGrowth(); + suffix_alphabet_.StopGrowth(); + feats_alphabet_.StopGrowth(); + pos_alphabet_.StopGrowth(); + cpos_alphabet_.StopGrowth(); + + LOG(INFO) << "Number of forms: " << form_alphabet_.size() << endl + << "Number of lower-case forms: " << form_lower_alphabet_.size() << endl + << "Number of prefixes: " << prefix_alphabet_.size() << endl + << "Number of suffixes: " << suffix_alphabet_.size() << endl + << "Number of word shapes: " << shape_alphabet_.size() << endl + << "Number of pos: " << pos_alphabet_.size(); + + CHECK_LT(form_alphabet_.size(), 0xffff); + CHECK_LT(form_lower_alphabet_.size(), 0xffff); + CHECK_LT(shape_alphabet_.size(), 0xffff); + CHECK_LT(lemma_alphabet_.size(), 0xffff); + CHECK_LT(prefix_alphabet_.size(), 0xffff); + CHECK_LT(suffix_alphabet_.size(), 0xffff); + CHECK_LT(feats_alphabet_.size(), 0xffff); + CHECK_LT(pos_alphabet_.size(), 0xff); + CHECK_LT(cpos_alphabet_.size(), 0xff); + +#ifndef NDEBUG + BuildNames(); +#endif +} diff --git a/src/entity_recognizer/EntityDictionary.h b/src/entity_recognizer/EntityDictionary.h index c047473..2c88b7f 100644 --- a/src/entity_recognizer/EntityDictionary.h +++ b/src/entity_recognizer/EntityDictionary.h @@ -1,154 +1,159 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef ENTITYDICTIONARY_H_ -#define ENTITYDICTIONARY_H_ - -#include "SequenceDictionary.h" -#include "TokenDictionary.h" -#include "EntityReader.h" - -class EntityDictionary : public SequenceDictionary { -public: - EntityDictionary() {} - EntityDictionary(Pipe* pipe) : SequenceDictionary(pipe) {} - virtual ~EntityDictionary() {} - - void Clear() { - SequenceDictionary::Clear(); - - gazetteer_word_alphabet_.clear(); - gazetteer_entity_tag_alphabet_.clear(); - gazetteer_word_entity_tags_.clear(); - } - - void Save(FILE *fs) { - SequenceDictionary::Save(fs); - - if (0 > gazetteer_word_alphabet_.Save(fs)) CHECK(false); - if (0 > gazetteer_entity_tag_alphabet_.Save(fs)) CHECK(false); - - bool success; - int length = gazetteer_word_entity_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int j = 0; j < gazetteer_word_entity_tags_.size(); ++j) { - length = gazetteer_word_entity_tags_[j].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int k = 0; k < gazetteer_word_entity_tags_[j].size(); ++k) { - int id = gazetteer_word_entity_tags_[j][k]; - success = WriteInteger(fs, id); - CHECK(success); - } - } - - length = allowed_bigrams_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int j = 0; j < allowed_bigrams_.size(); ++j) { - length = allowed_bigrams_[j].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int k = 0; k < allowed_bigrams_[j].size(); ++k) { - bool allowed = allowed_bigrams_[j][k]; - success = WriteBool(fs, allowed); - CHECK(success); - } - } - } - - void Load(FILE *fs) { - SequenceDictionary::Load(fs); - - if (0 > gazetteer_word_alphabet_.Load(fs)) CHECK(false); - if (0 > gazetteer_entity_tag_alphabet_.Load(fs)) CHECK(false); - - int length; - bool success = ReadInteger(fs, 
&length); - CHECK(success); - gazetteer_word_entity_tags_.resize(length); - for (int j = 0; j < gazetteer_word_entity_tags_.size(); ++j) { - success = ReadInteger(fs, &length); - CHECK(success); - gazetteer_word_entity_tags_[j].resize(length); - for (int k = 0; k < gazetteer_word_entity_tags_[j].size(); ++k) { - int id; - success = ReadInteger(fs, &id); - CHECK(success); - gazetteer_word_entity_tags_[j][k] = id; - } - } - - gazetteer_word_alphabet_.StopGrowth(); - gazetteer_entity_tag_alphabet_.StopGrowth(); - LOG(INFO) << "Number of gazetteer words: " - << gazetteer_word_alphabet_.size(); - LOG(INFO) << "Number of gazetteer entity tags: " - << gazetteer_entity_tag_alphabet_.size(); - - success = ReadInteger(fs, &length); - CHECK(success); - allowed_bigrams_.resize(length); - for (int j = 0; j < allowed_bigrams_.size(); ++j) { - success = ReadInteger(fs, &length); - CHECK(success); - allowed_bigrams_[j].resize(length); - for (int k = 0; k < allowed_bigrams_[j].size(); ++k) { - bool allowed; - success = ReadBool(fs, &allowed); - CHECK(success); - allowed_bigrams_[j][k] = allowed; - } - } - } - - void CreateTagDictionary(SequenceReader *reader); - - void ReadGazetteerFiles(); - - void GetWordGazetteerIds(const std::string &word, - std::vector *gazetteer_ids) const { - gazetteer_ids->clear(); - int id = gazetteer_word_alphabet_.Lookup(word); - if (id >= 0) { - gazetteer_ids->assign(gazetteer_word_entity_tags_[id].begin(), - gazetteer_word_entity_tags_[id].end()); - } - } - - bool IsAllowedBigram(int left_tag, int tag) { - CHECK_GE(left_tag, -1); - CHECK_GE(tag, -1); - return allowed_bigrams_[tag + 1][left_tag + 1]; - } - -protected: - std::vector > allowed_bigrams_; - Alphabet gazetteer_word_alphabet_; - Alphabet gazetteer_entity_tag_alphabet_; - std::vector > gazetteer_word_entity_tags_; -}; - -class EntityTokenDictionary : public TokenDictionary { -public: - EntityTokenDictionary() {}; - virtual ~EntityTokenDictionary() {}; - void Initialize(EntityReader *reader); 
-}; -#endif /* ENTITYDICTIONARY_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef ENTITYDICTIONARY_H_ +#define ENTITYDICTIONARY_H_ + +#include "SequenceDictionary.h" +#include "TokenDictionary.h" +#include "EntityReader.h" + +class EntityDictionary : public SequenceDictionary { +public: + EntityDictionary() {} + EntityDictionary(Pipe* pipe); + virtual ~EntityDictionary() {} + + void Clear() { + SequenceDictionary::Clear(); + + gazetteer_word_alphabet_.clear(); + gazetteer_entity_tag_alphabet_.clear(); + gazetteer_word_entity_tags_.clear(); + } + + void Save(FILE *fs) { + SequenceDictionary::Save(fs); + + if (0 > gazetteer_word_alphabet_.Save(fs)) CHECK(false); + if (0 > gazetteer_entity_tag_alphabet_.Save(fs)) CHECK(false); + + bool success; + int length = (int)gazetteer_word_entity_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < gazetteer_word_entity_tags_.size(); ++j) { + length = (int)gazetteer_word_entity_tags_[j].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int k = 0; k < gazetteer_word_entity_tags_[j].size(); ++k) { + int id = gazetteer_word_entity_tags_[j][k]; + success = WriteInteger(fs, id); + CHECK(success); + } + } + + length = (int)allowed_bigrams_.size(); + 
success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < allowed_bigrams_.size(); ++j) { + length = (int)allowed_bigrams_[j].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int k = 0; k < allowed_bigrams_[j].size(); ++k) { + bool allowed = allowed_bigrams_[j][k]; + success = WriteBool(fs, allowed); + CHECK(success); + } + } + } + + void Load(FILE *fs) { + SequenceDictionary::Load(fs); + + if (0 > gazetteer_word_alphabet_.Load(fs)) CHECK(false); + if (0 > gazetteer_entity_tag_alphabet_.Load(fs)) CHECK(false); + + int length; + bool success = ReadInteger(fs, &length); + CHECK(success); + gazetteer_word_entity_tags_.resize(length); + for (int j = 0; j < gazetteer_word_entity_tags_.size(); ++j) { + success = ReadInteger(fs, &length); + CHECK(success); + gazetteer_word_entity_tags_[j].resize(length); + for (int k = 0; k < gazetteer_word_entity_tags_[j].size(); ++k) { + int id; + success = ReadInteger(fs, &id); + CHECK(success); + gazetteer_word_entity_tags_[j][k] = id; + } + } + + gazetteer_word_alphabet_.StopGrowth(); + gazetteer_entity_tag_alphabet_.StopGrowth(); + LOG(INFO) << "Number of gazetteer words: " + << gazetteer_word_alphabet_.size(); + LOG(INFO) << "Number of gazetteer entity tags: " + << gazetteer_entity_tag_alphabet_.size(); + + success = ReadInteger(fs, &length); + CHECK(success); + allowed_bigrams_.resize(length); + for (int j = 0; j < allowed_bigrams_.size(); ++j) { + success = ReadInteger(fs, &length); + CHECK(success); + allowed_bigrams_[j].resize(length); + for (int k = 0; k < allowed_bigrams_[j].size(); ++k) { + bool allowed; + success = ReadBool(fs, &allowed); + CHECK(success); + allowed_bigrams_[j][k] = allowed; + } + } + } + + void CreateTagDictionary(SequenceReader *reader); + + void ReadGazetteerFiles(); + + void GetWordGazetteerIds(const std::string &word, + std::vector *gazetteer_ids) const { + gazetteer_ids->clear(); + int id = gazetteer_word_alphabet_.Lookup(word); + if (id >= 0) { + 
gazetteer_ids->assign(gazetteer_word_entity_tags_[id].begin(), + gazetteer_word_entity_tags_[id].end()); + } + } + + bool IsAllowedBigram(int left_tag, int tag) { + CHECK_GE(left_tag, -1); + CHECK_GE(tag, -1); + return allowed_bigrams_[tag + 1][left_tag + 1]; + } + + const bool gazetteer_case_sensitive() const { + return gazetteer_case_sensitive_; + } + +protected: + std::vector > allowed_bigrams_; + Alphabet gazetteer_word_alphabet_; + Alphabet gazetteer_entity_tag_alphabet_; + std::vector > gazetteer_word_entity_tags_; + bool gazetteer_case_sensitive_; //stores the value of the corresponding option flag +}; + +class EntityTokenDictionary : public TokenDictionary { +public: + EntityTokenDictionary() {}; + virtual ~EntityTokenDictionary() {}; + void Initialize(EntityReader *reader); +}; +#endif /* ENTITYDICTIONARY_H_ */ diff --git a/src/entity_recognizer/EntityFeatureTemplates.h b/src/entity_recognizer/EntityFeatureTemplates.h index 08ff7c0..68302ff 100644 --- a/src/entity_recognizer/EntityFeatureTemplates.h +++ b/src/entity_recognizer/EntityFeatureTemplates.h @@ -29,56 +29,61 @@ struct EntityFeatureTemplateParts { struct EntityFeatureTemplateUnigram { enum types { - BIAS = 0, /* bias */ - W, /* word */ - pW, /* word on the left */ - nW, /* word on the right */ - ppW, /* word two positions on the left */ - nnW, /* word two positions on the right */ - G, /* gazetteer */ - pG, /* gazetteer on the left */ - nG, /* gazetteer on the right */ - ppG, /* gazetteer two positions on the left */ - nnG, /* gazetteer two positions on the right */ - P, /* POS */ - PpP, /* POS + POS on the left */ - PnP, /* POS + POS on the right */ - PpPppP, /* POS trigram on the left */ - PnPnnP, /* POS trigram on the right */ - PpPnP, /* POS trigram on the center */ - S, /* shape */ - pS, /* shape on the left */ - nS, /* shape on the right */ - ppS, /* shape two positions on the left */ - nnS, /* shape two positions on the right */ - A, /* prefix */ - Z, /* suffix */ - FLAG, /* flag indicating 
presence of special characters */ + BIAS = 0, /* bias */ //WORD + W, /* word */ //WORD + pW, /* word on the left */ //CONTEXT + nW, /* word on the right */ //CONTEXT + ppW, /* word two positions on the left */ //CONTEXT + nnW, /* word two positions on the right */ //CONTEXT + G, /* gazetteer */ //WORD + pG, /* gazetteer on the left */ //CONTEXT + nG, /* gazetteer on the right */ //CONTEXT + ppG, /* gazetteer two positions on the left */ //CONTEXT + nnG, /* gazetteer two positions on the right */ //CONTEXT + P, /* POS */ //WORD + PpP, /* POS + POS on the left */ //CONTEXT + PnP, /* POS + POS on the right */ //CONTEXT + PpPppP, /* POS trigram on the left */ //CONTEXT + PnPnnP, /* POS trigram on the right */ //CONTEXT + PpPnP, /* POS trigram on the center */ //CONTEXT + S, /* shape */ //WORD + pS, /* shape on the left */ //CONTEXT + nS, /* shape on the right */ //CONTEXT + ppS, /* shape two positions on the left */ //CONTEXT + nnS, /* shape two positions on the right */ //CONTEXT + A, /* prefix */ //WORD + Z, /* suffix */ //WORD + FLAG, /* flag indicating presence of special characters */ //WORD COUNT }; }; struct EntityFeatureTemplateBigram { enum types { - BIAS = 0, /* bias */ - W, /* word */ - pW, /* word on the left */ - nW, /* word on the right */ - ppW, /* word two positions on the left */ - nnW, /* word two positions on the right */ - P, /* POS */ - PpP, /* POS + POS on the left */ - PnP, /* POS + POS on the right */ - PpPppP, /* POS trigram on the left */ - PnPnnP, /* POS trigram on the right */ - PpPnP, /* POS trigram on the center */ + BIAS = 0, /* bias */ + W, /* word */ + pW, /* word on the left */ + nW, /* word on the right */ + ppW, /* word two positions on the left */ + nnW, /* word two positions on the right */ + P, /* POS */ + PpP, /* POS + POS on the left */ + PnP, /* POS + POS on the right */ + PpPppP, /* POS trigram on the left */ + PnPnnP, /* POS trigram on the right */ + PpPnP, /* POS trigram on the center */ + S, /* shape */ + pS, /* shape on 
the left */ + nS, /* shape on the right */ + ppS, /* shape two positions on the left */ + nnS, /* shape two positions on the right */ COUNT }; }; struct EntityFeatureTemplateTrigram { enum types { - BIAS = 0, /* bias */ + BIAS = 0, /* bias */ }; }; diff --git a/src/entity_recognizer/EntityFeatures.cpp b/src/entity_recognizer/EntityFeatures.cpp index 430ec49..43cfc0e 100644 --- a/src/entity_recognizer/EntityFeatures.cpp +++ b/src/entity_recognizer/EntityFeatures.cpp @@ -20,6 +20,7 @@ #include "EntityFeatures.h" #include "SequencePart.h" #include "EntityFeatureTemplates.h" +#include void EntityFeatures::AddUnigramFeatures(SequenceInstanceNumeric *sentence, int position) { @@ -33,6 +34,9 @@ void EntityFeatures::AddUnigramFeatures(SequenceInstanceNumeric *sentence, EntityInstanceNumeric *entity_sentence = static_cast(sentence); + EntityOptions *options = static_cast(pipe_)-> + GetEntityOptions(); + // Array of form IDs. const vector* word_ids = &entity_sentence->GetFormIds(); @@ -70,14 +74,16 @@ void EntityFeatures::AddUnigramFeatures(SequenceInstanceNumeric *sentence, entity_sentence->GetGazetteerIds(position + 2) : empty_GIDs; // POS tags. - uint8_t PID = (*pos_ids)[position]; // Current word. + uint8_t PID = (*pos_ids)[position]; // Current POS. // POS on the left. - uint8_t pPID = (position > 0) ? (*pos_ids)[position - 1] : TOKEN_START; + uint8_t pPID = (position > 0) ? + (*pos_ids)[position - 1] : TOKEN_START; // POS on the right. uint8_t nPID = (position < sentence_length - 1) ? (*pos_ids)[position + 1] : TOKEN_STOP; // POS two positions on the left. - uint8_t ppPID = (position > 1) ? (*pos_ids)[position - 2] : TOKEN_START; + uint8_t ppPID = (position > 1) ? + (*pos_ids)[position - 2] : TOKEN_START; // POS two positions on the right. uint8_t nnPID = (position < sentence_length - 2) ? 
(*pos_ids)[position + 2] : TOKEN_STOP; @@ -134,43 +140,51 @@ void EntityFeatures::AddUnigramFeatures(SequenceInstanceNumeric *sentence, AddFeature(fkey, features); // Lexical features. - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::W, flags, WID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::W, + flags, WID); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::pW, flags, pWID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::pW, + flags, pWID); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::nW, flags, nWID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::nW, + flags, nWID); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::ppW, flags, ppWID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::ppW, + flags, ppWID); AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::nnW, flags, nnWID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::nnW, + flags, nnWID); AddFeature(fkey, features); // Gazetteer features. 
for (int k = 0; k < GIDs.size(); ++k) { uint16_t GID = GIDs[k]; - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::G, flags, GID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::G, + flags, GID); AddFeature(fkey, features); } for (int k = 0; k < pGIDs.size(); ++k) { uint16_t pGID = pGIDs[k]; - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::pG, flags, pGID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::pG, + flags, pGID); AddFeature(fkey, features); } for (int k = 0; k < nGIDs.size(); ++k) { uint16_t nGID = nGIDs[k]; - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::nG, flags, nGID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::nG, + flags, nGID); AddFeature(fkey, features); } for (int k = 0; k < ppGIDs.size(); ++k) { uint16_t ppGID = ppGIDs[k]; - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::ppG, flags, - ppGID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::ppG, + flags, ppGID); AddFeature(fkey, features); } for (int k = 0; k < nnGIDs.size(); ++k) { uint16_t nnGID = nnGIDs[k]; - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::nnG, flags, - nnGID); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateUnigram::nnG, + flags, nnGID); AddFeature(fkey, features); } @@ -252,16 +266,16 @@ void EntityFeatures::AddBigramFeatures(SequenceInstanceNumeric *sentence, // of words. If position = N, we need to be careful not to access invalid // memory in arrays. - // Bias feature. - fkey = encoder_.CreateFKey_NONE(EntityFeatureTemplateBigram::BIAS, flags); - AddFeature(fkey, features); - // Add other bigram features. int sentence_length = sentence->size(); EntityInstanceNumeric *entity_sentence = static_cast(sentence); + EntityOptions *options = static_cast(pipe_)-> + GetEntityOptions(); + std::bitset<32> feature_set_bitmap(options->large_feature_set()); + // Array of form IDs. 
const vector* word_ids = &entity_sentence->GetFormIds(); @@ -269,77 +283,125 @@ void EntityFeatures::AddBigramFeatures(SequenceInstanceNumeric *sentence, const vector* pos_ids = &entity_sentence->GetPosIds(); // Words. - uint16_t WID = (position < sentence_length) ? - (*word_ids)[position] : TOKEN_STOP; // Current word. - // Word on the left. - uint16_t pWID = (position > 0) ? - (*word_ids)[position - 1] : TOKEN_START; - // Word on the right. - uint16_t nWID = (position < sentence_length - 1) ? - (*word_ids)[position + 1] : TOKEN_STOP; - // Word two positions on the left. - uint16_t ppWID = (position > 1) ? - (*word_ids)[position - 2] : TOKEN_START; - // Word two positions on the right. - uint16_t nnWID = (position < sentence_length - 2) ? - (*word_ids)[position + 2] : TOKEN_STOP; + uint16_t WID, pWID, nWID, ppWID, nnWID; + if (feature_set_bitmap.test(0)) { + WID = (position < sentence_length) ? + (*word_ids)[position] : TOKEN_STOP; // Current word. + // Word on the left. + pWID = (position > 0) ? + (*word_ids)[position - 1] : TOKEN_START; + // Word on the right. + nWID = (position < sentence_length - 1) ? + (*word_ids)[position + 1] : TOKEN_STOP; + // Word two positions on the left. + ppWID = (position > 1) ? + (*word_ids)[position - 2] : TOKEN_START; + // Word two positions on the right. + nnWID = (position < sentence_length - 2) ? + (*word_ids)[position + 2] : TOKEN_STOP; + } // POS tags. - uint8_t PID = (position < sentence_length) ? - (*pos_ids)[position] : TOKEN_STOP; // Current POS. - // POS on the left. - uint8_t pPID = (position > 0) ? - (*pos_ids)[position - 1] : TOKEN_START; - // POS on the right. - uint8_t nPID = (position < sentence_length - 1) ? - (*pos_ids)[position + 1] : TOKEN_STOP; - // POS two positions on the left. - uint8_t ppPID = (position > 1) ? - (*pos_ids)[position - 2] : TOKEN_START; - // POS two positions on the right. - uint8_t nnPID = (position < sentence_length - 2) ? 
- (*pos_ids)[position + 2] : TOKEN_STOP; + uint8_t PID, pPID, nPID, ppPID, nnPID; + if (feature_set_bitmap.test(1)) { + PID = (position < sentence_length) ? + (*pos_ids)[position] : TOKEN_STOP; // Current POS. + // POS on the left. + pPID = (position > 0) ? + (*pos_ids)[position - 1] : TOKEN_START; + // POS on the right. + nPID = (position < sentence_length - 1) ? + (*pos_ids)[position + 1] : TOKEN_STOP; + // POS two positions on the left. + ppPID = (position > 1) ? + (*pos_ids)[position - 2] : TOKEN_START; + // POS two positions on the right. + nnPID = (position < sentence_length - 2) ? + (*pos_ids)[position + 2] : TOKEN_STOP; + } + + // Word shapes. + uint16_t SID, pSID, nSID, ppSID, nnSID; + if (feature_set_bitmap.test(2)) { + SID = (position < sentence_length) ? + sentence->GetShapeId(position) : TOKEN_STOP; // Current shape. + // Shape on the left. + pSID = (position > 0) ? + sentence->GetShapeId(position - 1) : TOKEN_START; + // Shape on the right. + nSID = (position < sentence_length - 1) ? + sentence->GetShapeId(position + 1) : TOKEN_STOP; + // Shape two positions on the left. + ppSID = (position > 1) ? + sentence->GetShapeId(position - 2) : TOKEN_START; + // Shape two positions on the right. + nnSID = (position < sentence_length - 2) ? + sentence->GetShapeId(position + 2) : TOKEN_STOP; + } // Maximum is 255 feature templates. CHECK_LT(EntityFeatureTemplateBigram::COUNT, 256); // Bias feature. - //fkey = encoder_.CreateFKey_NONE(EntityFeatureTemplateBigram::BIAS, flags); - //AddFeature(fkey, features); + fkey = encoder_.CreateFKey_NONE(EntityFeatureTemplateBigram::BIAS, flags); + AddFeature(fkey, features); // Lexical features. 
- fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::W, - flags, WID); - AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::pW, - flags, pWID); - AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::nW, - flags, nWID); - AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::ppW, - flags, ppWID); - AddFeature(fkey, features); - fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::nnW, - flags, nnWID); - AddFeature(fkey, features); + if (feature_set_bitmap.test(0)) { + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::W, + flags, WID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::pW, + flags, pWID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::nW, + flags, nWID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::ppW, + flags, ppWID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::nnW, + flags, nnWID); + AddFeature(fkey, features); + } // POS features. 
- fkey = encoder_.CreateFKey_P(EntityFeatureTemplateBigram::P, - flags, PID); - AddFeature(fkey, features); - fkey = encoder_.CreateFKey_PP(EntityFeatureTemplateBigram::PpP, - flags, PID, pPID); - AddFeature(fkey, features); - fkey = encoder_.CreateFKey_PP(EntityFeatureTemplateBigram::PnP, - flags, PID, nPID); - AddFeature(fkey, features); - fkey = encoder_.CreateFKey_PPP(EntityFeatureTemplateBigram::PpPppP, - flags, PID, pPID, ppPID); - AddFeature(fkey, features); - fkey = encoder_.CreateFKey_PPP(EntityFeatureTemplateBigram::PnPnnP, - flags, PID, nPID, nnPID); - AddFeature(fkey, features); + if (feature_set_bitmap.test(1)) { + fkey = encoder_.CreateFKey_P(EntityFeatureTemplateBigram::P, + flags, PID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_PP(EntityFeatureTemplateBigram::PpP, + flags, PID, pPID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_PP(EntityFeatureTemplateBigram::PnP, + flags, PID, nPID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_PPP(EntityFeatureTemplateBigram::PpPppP, + flags, PID, pPID, ppPID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_PPP(EntityFeatureTemplateBigram::PnPnnP, + flags, PID, nPID, nnPID); + AddFeature(fkey, features); + } + + // Shape features. 
+ if (feature_set_bitmap.test(2)) { + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::S, + flags, SID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::pS, + flags, pSID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::nS, + flags, nSID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::ppS, + flags, ppSID); + AddFeature(fkey, features); + fkey = encoder_.CreateFKey_W(EntityFeatureTemplateBigram::nnS, + flags, nnSID); + AddFeature(fkey, features); + } } void EntityFeatures::AddTrigramFeatures(SequenceInstanceNumeric *sentence, diff --git a/src/entity_recognizer/EntityInstance.cpp b/src/entity_recognizer/EntityInstance.cpp index 477ef1a..0c37e7d 100644 --- a/src/entity_recognizer/EntityInstance.cpp +++ b/src/entity_recognizer/EntityInstance.cpp @@ -19,6 +19,7 @@ #include "EntityInstance.h" #include #include "EntityOptions.h" +#include void EntityInstance::Initialize(const std::vector &forms, const std::vector &pos, @@ -29,10 +30,9 @@ void EntityInstance::Initialize(const std::vector &forms, } void EntityInstance::ConvertToTaggingScheme(int tagging_scheme) { - std::vector spans; + std::vector> spans; CreateSpansFromTags(tags_, &spans); CreateTagsFromSpans(tags_.size(), spans, tagging_scheme, &tags_); - DeleteSpans(&spans); } void EntityInstance::SplitEntityTag(const std::string &tag, @@ -50,46 +50,39 @@ void EntityInstance::SplitEntityTag(const std::string &tag, void EntityInstance::CreateSpansFromTags( const std::vector &tags, - std::vector *spans) const { + std::vector> *spans) const { spans->clear(); - EntitySpan *span = NULL; + std::unique_ptr span; for (int i = 0; i < tags.size(); ++i) { std::string prefix, entity; SplitEntityTag(tags[i], &prefix, &entity); if (prefix == "B" || prefix == "U") { - if (span) spans->push_back(span); - span = new EntitySpan(i, i, entity); + if (span) spans->push_back(std::move(span)); + span = 
std::make_unique(i, i, entity); } else if (prefix == "I" || prefix == "L") { if (span && span->name() == entity) { span->set_end(i); } else { // This I is actually a B (maybe the file has IO encoding). - if (span) spans->push_back(span); - span = new EntitySpan(i, i, entity); + if (span) spans->push_back(std::move(span)); + span = std::make_unique(i, i, entity); } } else if (prefix == "O") { - if (span) spans->push_back(span); - span = NULL; + if (span) spans->push_back(std::move(span)); + span.reset(); } } - if (span) spans->push_back(span); -} - -void EntityInstance::DeleteSpans(std::vector *spans) const { - for (int k = 0; k < spans->size(); ++k) { - delete (*spans)[k]; - } - spans->clear(); + if (span) spans->push_back(std::move(span)); } void EntityInstance::CreateTagsFromSpans( int length, - const std::vector &spans, + const std::vector> &spans, int tagging_scheme, std::vector *tags) const { tags->assign(length, "O"); for (int k = 0; k < spans.size(); ++k) { - EntitySpan *span = spans[k]; + EntitySpan *span = spans[k].get(); if (tagging_scheme == EntityTaggingSchemes::BILOU) { if (span->start() == span->end()) { (*tags)[span->start()] = "U-" + span->name(); diff --git a/src/entity_recognizer/EntityInstance.h b/src/entity_recognizer/EntityInstance.h index 286bec1..a350a92 100644 --- a/src/entity_recognizer/EntityInstance.h +++ b/src/entity_recognizer/EntityInstance.h @@ -18,7 +18,7 @@ #ifndef ENTITYINSTANCE_H_ #define ENTITYINSTANCE_H_ - +#include #include "SequenceInstance.h" #include "EntitySpan.h" @@ -48,15 +48,13 @@ class EntityInstance : public SequenceInstance { // Convert a sequence of IO/BIO/BILOU tags into spans. void CreateSpansFromTags(const std::vector &tags, - std::vector *spans) const; + std::vector> *spans) const; // Convert spans into a sequence of IO/BIO/BILOU tags. void CreateTagsFromSpans(int length, - const std::vector &spans, + const std::vector> &spans, int tagging_scheme, std::vector *tags) const; - // Destroy the spans. 
- void DeleteSpans(std::vector *spans) const; protected: std::vector pos_; diff --git a/src/entity_recognizer/EntityInstanceNumeric.cpp b/src/entity_recognizer/EntityInstanceNumeric.cpp index 881c9db..d2cc9c4 100644 --- a/src/entity_recognizer/EntityInstanceNumeric.cpp +++ b/src/entity_recognizer/EntityInstanceNumeric.cpp @@ -36,8 +36,14 @@ void EntityInstanceNumeric::Initialize(const EntityDictionary &dictionary, if (id < 0) id = TOKEN_UNKNOWN; pos_ids_[i] = id; - dictionary.GetWordGazetteerIds(instance->GetForm(i), + std::string form = instance->GetForm(i); + // Uncomment next 'if's to allow different-case occurences + // to map to the same entry. + if (!dictionary.gazetteer_case_sensitive()) { + std::transform(form.begin(), form.end(), form.begin(), ::tolower); + } + dictionary.GetWordGazetteerIds(form, &gazetteer_ids_[i]); //LOG(INFO) << instance->GetForm(i) << ": " << gazetteer_ids_[i].size(); } -} +} diff --git a/src/entity_recognizer/EntityOptions.cpp b/src/entity_recognizer/EntityOptions.cpp index a908c15..c330cb3 100644 --- a/src/entity_recognizer/EntityOptions.cpp +++ b/src/entity_recognizer/EntityOptions.cpp @@ -1,76 +1,100 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#include "EntityOptions.h" -#include "SerializationUtils.h" -#include - -using namespace std; - -// TODO: Implement the reader for "text". -DEFINE_string(entity_file_format, "conll", - "Format of the input file containing the data. Use ""conll"" for " - "the format used in CONLL-X, and ""text"" for tokenized" - "sentences (one per line, with tokens separated " - "by white-spaces."); -DEFINE_string(entity_tagging_scheme, "bio", - "The encoding scheme to represent entity spans as tags. Either " - """io"", ""bio"", or ""bilou""."); -DEFINE_string(entity_file_gazetteer, "", - "Path to a gazetteer file (one entity per line with the " - "corresponding class, separated by tabs."); - -// Save current option flags to the model file. -void EntityOptions::Save(FILE* fs) { - SequenceOptions::Save(fs); - - bool success; - success = WriteString(fs, tagging_scheme_name_); - CHECK(success); -} - -// Load current option flags to the model file. -// Note: this will override the user-specified flags. -void EntityOptions::Load(FILE* fs) { - SequenceOptions::Load(fs); - - bool success; - success = ReadString(fs, &FLAGS_entity_tagging_scheme); - CHECK(success); - LOG(INFO) << "Setting --entity_tagging_scheme=" << - FLAGS_entity_tagging_scheme; - - Initialize(); -} - -void EntityOptions::Initialize() { - SequenceOptions::Initialize(); - - file_format_ = FLAGS_entity_file_format; - file_gazetteer_ = FLAGS_entity_file_gazetteer; - tagging_scheme_name_ = FLAGS_entity_tagging_scheme; - if (tagging_scheme_name_ == "io") { - tagging_scheme_ = EntityTaggingSchemes::IO; - } else if (tagging_scheme_name_ == "bio") { - tagging_scheme_ = EntityTaggingSchemes::BIO; - } else if (tagging_scheme_name_ == "bilou") { - tagging_scheme_ = EntityTaggingSchemes::BILOU; - } else { - CHECK(false) << "Unknown entity scheme: " << tagging_scheme_name_; - } -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. 
+// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "EntityOptions.h" +#include "Pipe.h" +#include "SerializationUtils.h" +#include + +using namespace std; + +// TODO: Implement the reader for "text". +DEFINE_string(entity_file_format, "conll", + "Format of the input file containing the data. Use ""conll"" for " + "the format used in CONLL-X, and ""text"" for tokenized" + "sentences (one per line, with tokens separated " + "by white-spaces."); +DEFINE_string(entity_tagging_scheme, "bio", + "The encoding scheme to represent entity spans as tags. Either " + """io"", ""bio"", or ""bilou""."); +DEFINE_string(entity_file_gazetteer, "", + "Path to a gazetteer file (one entity per line with the " + "corresponding class, separated by tabs."); +DEFINE_int32(entity_recognizer_large_feature_set, 3, + "The greater the value, the larger feature set used. Taggers are " + "usually more accurate but slower and have a larger memory footprint."); +DEFINE_bool(entity_gazetteer_case_sensitive, true, + "Distinguish upper/lower case of gazetteers words"); + +// Save current option flags to the model file. 
+void EntityOptions::Save(FILE* fs) { + SequenceOptions::Save(fs); + + bool success; + success = WriteString(fs, tagging_scheme_name_); + CHECK(success); + success = WriteInteger(fs, large_feature_set_); + CHECK(success); + success = WriteBool(fs, gazetteer_case_sensitive_); + CHECK(success); +} + +// Load current option flags to the model file. +// Note: this will override the user-specified flags. +void EntityOptions::Load(FILE* fs) { + SequenceOptions::Load(fs); + + bool success; + + success = ReadString(fs, &FLAGS_entity_tagging_scheme); + CHECK(success); + LOG(INFO) << "Setting --entity_tagging_scheme=" + << FLAGS_entity_tagging_scheme; + + if (pipe_ != nullptr && pipe_->GetModelVersion() >= 200030001) { + success = ReadInteger(fs, &FLAGS_entity_recognizer_large_feature_set); + CHECK(success); + LOG(INFO) << "Setting --entity_recognizer_large_feature_set=" + << FLAGS_entity_recognizer_large_feature_set; + + success = ReadBool(fs, &FLAGS_entity_gazetteer_case_sensitive); + CHECK(success); + LOG(INFO) << "Setting --entity_gazetteer_case_sensitive=" + << FLAGS_entity_gazetteer_case_sensitive; + } + Initialize(); +} + +void EntityOptions::Initialize() { + SequenceOptions::Initialize(); + + file_format_ = FLAGS_entity_file_format; + file_gazetteer_ = FLAGS_entity_file_gazetteer; + tagging_scheme_name_ = FLAGS_entity_tagging_scheme; + if (tagging_scheme_name_ == "io") { + tagging_scheme_ = EntityTaggingSchemes::IO; + } else if (tagging_scheme_name_ == "bio") { + tagging_scheme_ = EntityTaggingSchemes::BIO; + } else if (tagging_scheme_name_ == "bilou") { + tagging_scheme_ = EntityTaggingSchemes::BILOU; + } else { + CHECK(false) << "Unknown entity scheme: " << tagging_scheme_name_; + } + large_feature_set_ = FLAGS_entity_recognizer_large_feature_set; + gazetteer_case_sensitive_ = FLAGS_entity_gazetteer_case_sensitive; +} diff --git a/src/entity_recognizer/EntityOptions.h b/src/entity_recognizer/EntityOptions.h index 580c05e..1df6856 100644 --- 
a/src/entity_recognizer/EntityOptions.h +++ b/src/entity_recognizer/EntityOptions.h @@ -44,12 +44,16 @@ class EntityOptions : public SequenceOptions { // Get option flags. int tagging_scheme() { return tagging_scheme_; } const std::string &file_gazetteer() { return file_gazetteer_; } + const int large_feature_set() { return large_feature_set_; } + const bool gazetteer_case_sensitive() { return gazetteer_case_sensitive_; } protected: std::string file_format_; std::string tagging_scheme_name_; std::string file_gazetteer_; int tagging_scheme_; + int large_feature_set_; + bool gazetteer_case_sensitive_; }; #endif // ENTITY_OPTIONS_H_ diff --git a/src/entity_recognizer/EntityPipe.h b/src/entity_recognizer/EntityPipe.h index 0f62654..193719c 100644 --- a/src/entity_recognizer/EntityPipe.h +++ b/src/entity_recognizer/EntityPipe.h @@ -26,6 +26,7 @@ #include "EntityInstanceNumeric.h" #include "EntityWriter.h" #include "EntityFeatures.h" +#include "EntityDecoder.h" // by MLA class EntityPipe : public SequencePipe { public: @@ -59,6 +60,8 @@ class EntityPipe : public SequencePipe { static_cast(instance)); return instance_numeric; } + + void CreateDecoder() { decoder_ = new EntityDecoder(this); }; // by MLA protected: //void SaveModel(FILE* fs); diff --git a/src/entity_recognizer/EntityWriter.cpp b/src/entity_recognizer/EntityWriter.cpp index 57f4b25..87ea181 100644 --- a/src/entity_recognizer/EntityWriter.cpp +++ b/src/entity_recognizer/EntityWriter.cpp @@ -16,8 +16,10 @@ // You should have received a copy of the GNU Lesser General Public License // along with TurboParser 2.3. If not, see . +#include "EntityPipe.h" #include "EntityWriter.h" #include "EntityInstance.h" +#include "EntityInstanceNumeric.h" #include "EntityOptions.h" #include #include @@ -27,14 +29,13 @@ void EntityWriter::Write(Instance *instance) { static_cast(instance); // Always write in BIO format. 
- std::vector spans; + std::vector> spans; std::vector tags; entity_instance->CreateSpansFromTags(entity_instance->tags(), &spans); entity_instance->CreateTagsFromSpans(entity_instance->tags().size(), spans, EntityTaggingSchemes::BIO, &tags); - entity_instance->DeleteSpans(&spans); for (int i = 0; i < entity_instance->size(); ++i) { os_ << entity_instance->GetForm(i) << "\t"; @@ -43,3 +44,44 @@ void EntityWriter::Write(Instance *instance) { } os_ << endl; } + +void EntityWriter::WriteFormatted(Pipe * pipe, Instance *instance) { + if ((static_cast(pipe))->GetEntityOptions()->expose_node_edge_viterbi_scores()) { + EntityInstanceNumeric *entity_instance = + static_cast(instance); + + EntityPipe * entity_pipe = + static_cast(pipe); + + for (int i = 0; i < entity_instance->size(); ++i) { + for (int j = 0; j < entity_instance->node_scores_[i].GetNumStates(); ++j) { + if (j > 0) + os_formatted_ << "\t"; + os_formatted_ << + entity_pipe->GetEntityDictionary()->GetTagName(entity_instance->node_scores_[i].GetState(j)) << ":" << entity_instance->node_scores_[i].GetScore(j); + } + + os_formatted_ << "\t"; + if (i < entity_instance->size() - 1) { + for (int j = 0; j < entity_instance->edge_scores_[i].GetNumCurrentStates(); ++j) { + int tag_id = entity_instance->node_scores_[i + 1].GetState(j); + if (j > 0) + os_formatted_ << "\t"; + for (int k = 0; k < entity_instance->edge_scores_[i].GetNumPreviousStates(j); ++k) { + if (k > 0) + os_formatted_ << "\t"; + + int tag_left_id = entity_instance->node_scores_[i].GetState(entity_instance->edge_scores_[i].GetAllPreviousStateScores(j)[k].first); + os_formatted_ << + entity_pipe->GetEntityDictionary()->GetTagName(tag_left_id) << "->" << + entity_pipe->GetEntityDictionary()->GetTagName(tag_id) << ":" + << entity_instance->edge_scores_[i].GetAllPreviousStateScores(j)[k].second; + } + } + } + os_formatted_ << endl; + } + os_formatted_ << endl; + } +} + diff --git a/src/entity_recognizer/EntityWriter.h b/src/entity_recognizer/EntityWriter.h 
index b848ba4..beb33c1 100644 --- a/src/entity_recognizer/EntityWriter.h +++ b/src/entity_recognizer/EntityWriter.h @@ -30,6 +30,7 @@ class EntityWriter : public Writer { public: void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); }; #endif /* ENTITYWRITER_H_ */ diff --git a/src/entity_recognizer/Makefile.am b/src/entity_recognizer/Makefile.am index 121fd86..6edb80e 100644 --- a/src/entity_recognizer/Makefile.am +++ b/src/entity_recognizer/Makefile.am @@ -12,6 +12,7 @@ EntityInstance.cpp EntityInstance.h \ EntityInstanceNumeric.cpp EntityInstanceNumeric.h \ EntityReader.cpp EntityReader.h \ EntityWriter.cpp EntityWriter.h \ +EntityDecoder.cpp EntityDecoder.h \ EntitySpan.h \ EntityPipe.cpp EntityPipe.h \ $(SEQUENCE)/SequenceInstanceNumeric.cpp \ diff --git a/src/entity_recognizer/Makefile.in b/src/entity_recognizer/Makefile.in index 8d5fd2c..9e5352f 100644 --- a/src/entity_recognizer/Makefile.in +++ b/src/entity_recognizer/Makefile.in @@ -93,7 +93,7 @@ PROGRAMS = $(TurboEntityRecognizerprg_PROGRAMS) am_TurboEntityRecognizer_OBJECTS = EntityFeatures.$(OBJEXT) \ EntityOptions.$(OBJEXT) EntityDictionary.$(OBJEXT) \ EntityInstance.$(OBJEXT) EntityInstanceNumeric.$(OBJEXT) \ - EntityReader.$(OBJEXT) EntityWriter.$(OBJEXT) \ + EntityReader.$(OBJEXT) EntityWriter.$(OBJEXT) EntityDecoder.$(OBJEXT) \ EntityPipe.$(OBJEXT) SequenceInstanceNumeric.$(OBJEXT) \ SequenceWriter.$(OBJEXT) SequenceDecoder.$(OBJEXT) \ SequencePipe.$(OBJEXT) SequenceOptions.$(OBJEXT) \ @@ -284,6 +284,7 @@ EntityInstance.cpp EntityInstance.h \ EntityInstanceNumeric.cpp EntityInstanceNumeric.h \ EntityReader.cpp EntityReader.h \ EntityWriter.cpp EntityWriter.h \ +EntityDecoder.cpp EntityDecoder.h \ EntitySpan.h \ EntityPipe.cpp EntityPipe.h \ $(SEQUENCE)/SequenceInstanceNumeric.cpp \ @@ -412,6 +413,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/EntityPipe.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/EntityReader.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/EntityWriter.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/EntityDecoder.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Options.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Parameters.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Pipe.Po@am__quote@ diff --git a/src/entity_recognizer/TurboEntityRecognizer.cpp b/src/entity_recognizer/TurboEntityRecognizer.cpp index 2e7c986..6b1058d 100644 --- a/src/entity_recognizer/TurboEntityRecognizer.cpp +++ b/src/entity_recognizer/TurboEntityRecognizer.cpp @@ -1,87 +1,86 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "EntityPipe.h" - -using namespace std; - -void TrainEntityRecognizer(); -void TestEntityRecognizer(); - -int main(int argc, char** argv) { - // Initialize Google's logging library. - google::InitGoogleLogging(argv[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc, &argv, true); - -#ifdef _WIN32 - google::LogToStderr(); -#endif - if (FLAGS_train) { - LOG(INFO) << "Training entity recognizer..." << endl; - TrainEntityRecognizer(); - } else if (FLAGS_test) { - LOG(INFO) << "Running entity recognizer..." << endl; - TestEntityRecognizer(); - } - - // Destroy allocated memory regarding line flags. - google::ShutDownCommandLineFlags(); - google::ShutdownGoogleLogging(); - return 0; -} - -void TrainEntityRecognizer() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - EntityOptions *options = new EntityOptions; - options->Initialize(); - - EntityPipe *pipe = new EntityPipe(options); - pipe->Initialize(); - pipe->Train(); - pipe->SaveModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Training took " << static_cast(time) / 1000.0 - << " sec." 
<< endl; - - delete pipe; - delete options; -} - -void TestEntityRecognizer() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - EntityOptions *options = new EntityOptions; - options->Initialize(); - - EntityPipe *pipe = new EntityPipe(options); - pipe->Initialize(); - pipe->LoadModelFile(); - - pipe->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Testing took " << static_cast(time) / 1000.0 - << " sec." << endl; - - delete pipe; - delete options; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "EntityPipe.h" + +using namespace std; + +void TrainEntityRecognizer(); +void TestEntityRecognizer(); + +int main(int argc, char** argv) { + // Initialize Google's logging library. + google::InitGoogleLogging(argv[0]); + + // Parse command line flags. + google::ParseCommandLineFlags(&argc, &argv, true); + +#ifdef _WIN32 + google::LogToStderr(); +#endif + if (FLAGS_train) { + LOG(INFO) << "Training entity recognizer..." << endl; + TrainEntityRecognizer(); + } else if (FLAGS_test) { + LOG(INFO) << "Running entity recognizer..." << endl; + TestEntityRecognizer(); + } + + // Destroy allocated memory regarding line flags. + google::ShutDownCommandLineFlags(); + google::ShutdownGoogleLogging(); + return 0; +} + +void TrainEntityRecognizer() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + EntityOptions *options = new EntityOptions; + options->Initialize(); + + EntityPipe *pipe = new EntityPipe(options); + pipe->Initialize(); + pipe->Train(); + LOG(INFO) << "\n FINAL model: " << options->GetModelFilePath() << "\n"; + pipe->SaveModelFile(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Training took " << time << " sec." 
<< endl; +} + +void TestEntityRecognizer() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + EntityOptions *options = new EntityOptions; + options->Initialize(); + + EntityPipe *pipe = new EntityPipe(options); + pipe->Initialize(); + pipe->LoadModelFile(); + + pipe->Run(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Testing took " << time << " sec." << endl; +} diff --git a/src/morphological_tagger/MorphologicalDictionary.cpp b/src/morphological_tagger/MorphologicalDictionary.cpp index 9312bca..f3cdb41 100644 --- a/src/morphological_tagger/MorphologicalDictionary.cpp +++ b/src/morphological_tagger/MorphologicalDictionary.cpp @@ -133,8 +133,8 @@ void MorphologicalTokenDictionary::Initialize(MorphologicalReader *reader) { std::string form = instance->GetForm(i); int form_length = static_cast(form.length()); std::string form_lower(form); - transform(form_lower.begin(), form_lower.end(), - form_lower.begin(), ::tolower); + std::transform(form_lower.begin(), form_lower.end(), + form_lower.begin(), ::tolower); if (!form_case_sensitive) form = form_lower; id = form_alphabet.Insert(form); if (id >= form_freqs.size()) { @@ -345,4 +345,4 @@ void MorphologicalTokenDictionary::Initialize(MorphologicalReader *reader) { //CHECK_LT(feats_alphabet_.size(), 0xffff); CHECK_LT(pos_alphabet_.size(), 0xff); CHECK_LT(cpos_alphabet_.size(), 0xff); -} +} diff --git a/src/morphological_tagger/MorphologicalDictionary.h b/src/morphological_tagger/MorphologicalDictionary.h index 5d5332e..c0f9c15 100644 --- a/src/morphological_tagger/MorphologicalDictionary.h +++ b/src/morphological_tagger/MorphologicalDictionary.h @@ -1,118 +1,118 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef MORPHOLOGICALDICTIONARY_H_ -#define MORPHOLOGICALDICTIONARY_H_ - -#include "SequenceDictionary.h" -#include "TokenDictionary.h" -#include "MorphologicalReader.h" - -class MorphologicalDictionary : public SequenceDictionary { -public: - MorphologicalDictionary() {} - MorphologicalDictionary(Pipe* pipe) : SequenceDictionary(pipe) {} - virtual ~MorphologicalDictionary() {} - - void Clear() { - SequenceDictionary::Clear(); - cpostag_morphologicaltags_.clear(); - } - - void Save(FILE *fs) { - SequenceDictionary::Save(fs); - bool success; - int length = unknown_cpostag_morphologicaltags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int j = 0; j < unknown_cpostag_morphologicaltags_.size(); ++j) { - int tag = unknown_cpostag_morphologicaltags_[j]; - success = WriteInteger(fs, tag); - CHECK(success); - } - - length = cpostag_morphologicaltags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int i = 0; i < cpostag_morphologicaltags_.size(); ++i) { - length = cpostag_morphologicaltags_[i].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int j = 0; j < cpostag_morphologicaltags_[i].size(); ++j) { - int tag = cpostag_morphologicaltags_[i][j]; - success = WriteInteger(fs, tag); - CHECK(success); - } - } - } - - void Load(FILE *fs) { - 
SequenceDictionary::Load(fs); - bool success; - int length; - success = ReadInteger(fs, &length); - CHECK(success); - unknown_cpostag_morphologicaltags_.resize(length); - for (int j = 0; j < unknown_cpostag_morphologicaltags_.size(); ++j) { - int tag; - success = ReadInteger(fs, &tag); - CHECK(success); - unknown_cpostag_morphologicaltags_[j] = tag; - } - success = ReadInteger(fs, &length); - CHECK(success); - cpostag_morphologicaltags_.resize(length); - for (int i = 0; i < cpostag_morphologicaltags_.size(); ++i) { - success = ReadInteger(fs, &length); - CHECK(success); - cpostag_morphologicaltags_[i].resize(length); - for (int j = 0; j < cpostag_morphologicaltags_[i].size(); ++j) { - int tag; - success = ReadInteger(fs, &tag); - CHECK(success); - cpostag_morphologicaltags_[i][j] = tag; - } - } - } - - void CreateTagDictionary(MorphologicalReader *reader); - - const std::vector &GetAllowedMorphologicalTags(int cpostag) { - // return cpostag_morphologicaltags_[cpostag]; - // TODO: Not sure is this should be done here... - // It may be cleaner to return an empty vector here and - // fill it with the unknown tags elsewhere. - if (!cpostag_morphologicaltags_[cpostag].empty()) { - return cpostag_morphologicaltags_[cpostag]; - } else { - return unknown_cpostag_morphologicaltags_; - } - } - -protected: - std::vector > cpostag_morphologicaltags_; - std::vector unknown_cpostag_morphologicaltags_; -}; - -class MorphologicalTokenDictionary : public TokenDictionary { -public: - MorphologicalTokenDictionary() {}; - virtual ~MorphologicalTokenDictionary() {}; - void Initialize(MorphologicalReader *reader); -}; -#endif /* MORPHOLOGICALDICTIONARY_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. 
+// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef MORPHOLOGICALDICTIONARY_H_ +#define MORPHOLOGICALDICTIONARY_H_ + +#include "SequenceDictionary.h" +#include "TokenDictionary.h" +#include "MorphologicalReader.h" + +class MorphologicalDictionary : public SequenceDictionary { +public: + MorphologicalDictionary() {} + MorphologicalDictionary(Pipe* pipe) : SequenceDictionary(pipe) {} + virtual ~MorphologicalDictionary() {} + + void Clear() { + SequenceDictionary::Clear(); + cpostag_morphologicaltags_.clear(); + } + + void Save(FILE *fs) { + SequenceDictionary::Save(fs); + bool success; + int length = (int) unknown_cpostag_morphologicaltags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < unknown_cpostag_morphologicaltags_.size(); ++j) { + int tag = unknown_cpostag_morphologicaltags_[j]; + success = WriteInteger(fs, tag); + CHECK(success); + } + + length = (int)cpostag_morphologicaltags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int i = 0; i < cpostag_morphologicaltags_.size(); ++i) { + length = (int)cpostag_morphologicaltags_[i].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < cpostag_morphologicaltags_[i].size(); ++j) { + int tag = cpostag_morphologicaltags_[i][j]; + success = WriteInteger(fs, tag); + CHECK(success); + } + } + } + + void Load(FILE *fs) { + 
SequenceDictionary::Load(fs); + bool success; + int length; + success = ReadInteger(fs, &length); + CHECK(success); + unknown_cpostag_morphologicaltags_.resize(length); + for (int j = 0; j < unknown_cpostag_morphologicaltags_.size(); ++j) { + int tag; + success = ReadInteger(fs, &tag); + CHECK(success); + unknown_cpostag_morphologicaltags_[j] = tag; + } + success = ReadInteger(fs, &length); + CHECK(success); + cpostag_morphologicaltags_.resize(length); + for (int i = 0; i < cpostag_morphologicaltags_.size(); ++i) { + success = ReadInteger(fs, &length); + CHECK(success); + cpostag_morphologicaltags_[i].resize(length); + for (int j = 0; j < cpostag_morphologicaltags_[i].size(); ++j) { + int tag; + success = ReadInteger(fs, &tag); + CHECK(success); + cpostag_morphologicaltags_[i][j] = tag; + } + } + } + + void CreateTagDictionary(MorphologicalReader *reader); + + const std::vector &GetAllowedMorphologicalTags(int cpostag) { + // return cpostag_morphologicaltags_[cpostag]; + // TODO: Not sure is this should be done here... + // It may be cleaner to return an empty vector here and + // fill it with the unknown tags elsewhere. 
+ if (!cpostag_morphologicaltags_[cpostag].empty()) { + return cpostag_morphologicaltags_[cpostag]; + } else { + return unknown_cpostag_morphologicaltags_; + } + } + +protected: + std::vector > cpostag_morphologicaltags_; + std::vector unknown_cpostag_morphologicaltags_; +}; + +class MorphologicalTokenDictionary : public TokenDictionary { +public: + MorphologicalTokenDictionary() {}; + virtual ~MorphologicalTokenDictionary() {}; + void Initialize(MorphologicalReader *reader); +}; +#endif /* MORPHOLOGICALDICTIONARY_H_ */ diff --git a/src/morphological_tagger/MorphologicalFeatureTemplates.h b/src/morphological_tagger/MorphologicalFeatureTemplates.h index 1272f80..c5f1a55 100644 --- a/src/morphological_tagger/MorphologicalFeatureTemplates.h +++ b/src/morphological_tagger/MorphologicalFeatureTemplates.h @@ -52,14 +52,14 @@ struct MorphologicalFeatureTemplateUnigram { P, /* POS */ // 'cpostag' pP, /* POS on the left */ // 'cpostag_minusone' - nP, /* POS on the right */ // 'cpostag_plusone' + nP, /* POS on the right */ // 'cpostag_plusone' ppP, /* POS two positions on the left */ // 'cpostag_minustwo' - nnP, /* POS two positions on the right */ // 'cpostag_plustwo' + nnP, /* POS two positions on the right */ // 'cpostag_plustwo' PpP, /* POS + POS on the left */ // 'cpostag_bigram' - PnP, /* POS + POS on the right */ // + PnP, /* POS + POS on the right */ // PpPppP, /* POS trigram on the left */ // 'cpostag_trigram' - PnPnnP, /* POS trigram on the right */ // - PpPnP, /* POS trigram on the center */ // + PnPnnP, /* POS trigram on the right */ // + PpPnP, /* POS trigram on the center */ // S, /* shape */ pS, /* shape on the left */ diff --git a/src/morphological_tagger/MorphologicalWriter.cpp b/src/morphological_tagger/MorphologicalWriter.cpp index 9119a44..168128b 100644 --- a/src/morphological_tagger/MorphologicalWriter.cpp +++ b/src/morphological_tagger/MorphologicalWriter.cpp @@ -40,3 +40,5 @@ void MorphologicalWriter::Write(Instance *instance) { } os_ << endl; } + 
+void MorphologicalWriter::WriteFormatted(Pipe * pipe, Instance *instance) {} \ No newline at end of file diff --git a/src/morphological_tagger/MorphologicalWriter.h b/src/morphological_tagger/MorphologicalWriter.h index 6ba2fa1..a0e9e75 100644 --- a/src/morphological_tagger/MorphologicalWriter.h +++ b/src/morphological_tagger/MorphologicalWriter.h @@ -28,6 +28,7 @@ class MorphologicalWriter : public Writer { public: void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); }; #endif /* MORPHOLOGICALWRITER_H_ */ diff --git a/src/morphological_tagger/TurboMorphologicalTagger.cpp b/src/morphological_tagger/TurboMorphologicalTagger.cpp index 906c721..151316e 100644 --- a/src/morphological_tagger/TurboMorphologicalTagger.cpp +++ b/src/morphological_tagger/TurboMorphologicalTagger.cpp @@ -1,86 +1,84 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "MorphologicalPipe.h" - -using namespace std; - -void TrainMorphologicalTagger(); -void TestMorphologicalTagger(); - -int main(int argc, char** argv) { - // Initialize Google's logging library. - google::InitGoogleLogging(argv[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc, &argv, true); - -#ifdef _WIN32 - google::LogToStderr(); -#endif - if (FLAGS_train) { - LOG(INFO) << "Training morphological tagger..." << endl; - TrainMorphologicalTagger(); - } else if (FLAGS_test) { - LOG(INFO) << "Running morphological tagger..." << endl; - TestMorphologicalTagger(); - } - - // Destroy allocated memory regarding line flags. 
- google::ShutDownCommandLineFlags(); - google::ShutdownGoogleLogging(); - return 0; -} - -void TrainMorphologicalTagger() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - MorphologicalOptions *options = new MorphologicalOptions; - options->Initialize(); - - MorphologicalPipe *pipe = new MorphologicalPipe(options); - pipe->Initialize(); - pipe->Train(); - pipe->SaveModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Training took " << static_cast(time) / 1000.0 - << " sec." << endl; - - delete pipe; - delete options; -} - -void TestMorphologicalTagger() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - MorphologicalOptions *options = new MorphologicalOptions; - options->Initialize(); - - MorphologicalPipe *pipe = new MorphologicalPipe(options); - pipe->Initialize(); - pipe->LoadModelFile(); - pipe->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Testing took " << static_cast(time) / 1000.0 - << " sec." << endl; - - delete pipe; - delete options; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "MorphologicalPipe.h" + +using namespace std; + +void TrainMorphologicalTagger(); +void TestMorphologicalTagger(); + +int main(int argc, char** argv) { + // Initialize Google's logging library. + google::InitGoogleLogging(argv[0]); + + // Parse command line flags. + google::ParseCommandLineFlags(&argc, &argv, true); + +#ifdef _WIN32 + google::LogToStderr(); +#endif + if (FLAGS_train) { + LOG(INFO) << "Training morphological tagger..." << endl; + TrainMorphologicalTagger(); + } else if (FLAGS_test) { + LOG(INFO) << "Running morphological tagger..." << endl; + TestMorphologicalTagger(); + } + + // Destroy allocated memory regarding line flags. 
+ google::ShutDownCommandLineFlags(); + google::ShutdownGoogleLogging(); + return 0; +} + +void TrainMorphologicalTagger() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + MorphologicalOptions *options = new MorphologicalOptions; + options->Initialize(); + + MorphologicalPipe *pipe = new MorphologicalPipe(options); + pipe->Initialize(); + pipe->Train(); + pipe->SaveModelFile(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Training took " << time << " sec." << endl; +} + +void TestMorphologicalTagger() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + MorphologicalOptions *options = new MorphologicalOptions; + options->Initialize(); + + MorphologicalPipe *pipe = new MorphologicalPipe(options); + pipe->Initialize(); + pipe->LoadModelFile(); + pipe->Run(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Testing took " << time << " sec." 
<< endl; +} diff --git a/src/parser/DependencyDecoder.cpp b/src/parser/DependencyDecoder.cpp index d2978bb..54e134a 100644 --- a/src/parser/DependencyDecoder.cpp +++ b/src/parser/DependencyDecoder.cpp @@ -23,7 +23,8 @@ #include "FactorHeadAutomaton.h" #include "FactorGrandparentHeadAutomaton.h" #include "FactorTrigramHeadAutomaton.h" -#include "FactorSequence.h" +#include "FactorSequence.h" +#include "TimeUtils.h" #include "AlgUtils.h" #include #include @@ -1087,7 +1088,7 @@ void DependencyDecoder::DecodeInsideOutside(Instance *instance, Parts *parts, } for (int m = 1; m < sentence_length; ++m) { LOG(INFO) << "Sum " << m << " = " << sum[m]; - } +} #endif } @@ -1292,14 +1293,14 @@ void DependencyDecoder::DecodeFactorGraph(Instance *instance, Parts *parts, int m = arc->modifier(); graph_paths[h][m] = true; } - timeval start, end; - gettimeofday(&start, NULL); + chronowrap::Chronometer chrono; + chrono.GetTime(); ComputeTransitiveClosure(&graph_paths); for (int i = 0; i < sentence->size(); ++i) { graph_paths[i][i] = true; } - gettimeofday(&end, NULL); - double elapsed_time_paths = diff_ms(end, start); + chrono.StopTime(); + double elapsed_time_paths = chrono.GetElapsedTime(); int num_possible_paths = 0; for (int i = 0; i < sentence->size(); ++i) { for (int j = 0; j < sentence->size(); ++j) { @@ -1309,7 +1310,7 @@ void DependencyDecoder::DecodeFactorGraph(Instance *instance, Parts *parts, VLOG(2) << num_arcs << " possible arcs and " << num_possible_paths << " possible paths in " << sentence->size() * sentence->size() - << " (took " << elapsed_time_paths << " ms.)"; + << " (took " << elapsed_time_paths << " sec.)"; } } @@ -2511,12 +2512,12 @@ void DependencyDecoder::DecodeFactorGraph(Instance *instance, Parts *parts, bool solved = false; if (add_evidence) { VLOG(2) << "Adding evidence..."; - timeval start, end; - gettimeofday(&start, NULL); + chronowrap::Chronometer chrono; + chrono.GetTime(); int status = factor_graph->AddEvidence(&evidence, &recomputed_indices); - 
gettimeofday(&end, NULL); - double elapsed_time = diff_ms(end, start); - VLOG(2) << "Graph simplification took " << elapsed_time << "ms."; + chrono.StopTime(); + double elapsed_time = chrono.GetElapsedTime(); + VLOG(2) << "Graph simplification took " << elapsed_time << "sec."; CHECK_NE(status, AD3::STATUS_INFEASIBLE); if (status == AD3::STATUS_OPTIMAL_INTEGER) solved = true; VLOG(2) << "Number of factors: " << factor_graph->GetNumFactors(); @@ -2552,14 +2553,14 @@ void DependencyDecoder::DecodeFactorGraph(Instance *instance, Parts *parts, //factor_graph->SetResidualThresholdAD3(1e-6); // Run AD3. - timeval start, end; - gettimeofday(&start, NULL); + chronowrap::Chronometer chrono; + chrono.GetTime(); if (!solved) { factor_graph->SolveLPMAPWithAD3(&posteriors, &additional_posteriors, value); } - gettimeofday(&end, NULL); - double elapsed_time = diff_ms(end, start); - VLOG(2) << "Elapsed time (AD3) = " << elapsed_time + chrono.StopTime(); + double elapsed_time = chrono.GetElapsedTime(); + VLOG(2) << "Elapsed time (AD3) = " << elapsed_time << " sec." << " (" << sentence->size() << ") "; delete factor_graph; @@ -3328,4 +3329,4 @@ void DependencyDecoder::DecodeCPLEX(Instance *instance, Parts *parts, } } -#endif +#endif diff --git a/src/parser/DependencyDictionary.cpp b/src/parser/DependencyDictionary.cpp index 584b2b8..390a70e 100644 --- a/src/parser/DependencyDictionary.cpp +++ b/src/parser/DependencyDictionary.cpp @@ -175,8 +175,8 @@ void DependencyTokenDictionary::Initialize(DependencyReader *reader) { // Add form to alphabet. 
std::string form = instance->GetForm(i); std::string form_lower(form); - transform(form_lower.begin(), form_lower.end(), - form_lower.begin(), ::tolower); + std::transform(form_lower.begin(), form_lower.end(), + form_lower.begin(), ::tolower); if (!form_case_sensitive) form = form_lower; id = form_alphabet.Insert(form); if (id >= form_freqs.size()) { @@ -372,6 +372,7 @@ void DependencyTokenDictionary::Initialize(DependencyReader *reader) { CHECK_LT(pos_alphabet_.size(), 0xff); CHECK_LT(cpos_alphabet_.size(), 0xff); - // TODO: Remove this (only for debugging purposes). +#ifndef NDEBUG BuildNames(); -} +#endif +} diff --git a/src/parser/DependencyDictionary.h b/src/parser/DependencyDictionary.h index 073a790..20a07d2 100644 --- a/src/parser/DependencyDictionary.h +++ b/src/parser/DependencyDictionary.h @@ -1,160 +1,160 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef DEPENDENCYDICTIONARY_H_ -#define DEPENDENCYDICTIONARY_H_ - -#include "Dictionary.h" -#include "TokenDictionary.h" -#include "DependencyReader.h" -#include "SerializationUtils.h" - -class Pipe; - -class DependencyDictionary : public Dictionary { -public: - DependencyDictionary() { token_dictionary_ = NULL; } - DependencyDictionary(Pipe* pipe) : pipe_(pipe) {} - virtual ~DependencyDictionary() { - Clear(); - } - - void CreateLabelDictionary(DependencyReader *reader); - - void Clear() { - // Don't clear token_dictionary, since this class does not own it. - label_alphabet_.clear(); - existing_labels_.clear(); - maximum_left_distances_.clear(); - maximum_right_distances_.clear(); - } - - void BuildLabelNames() { - label_alphabet_.BuildNames(); - } - - const string &GetLabelName(int label) const { - return label_alphabet_.GetName(label); - } - - void AllowGrowth() { token_dictionary_->AllowGrowth(); } - void StopGrowth() { token_dictionary_->StopGrowth(); } - - void Save(FILE *fs) { - if (0 > label_alphabet_.Save(fs)) CHECK(false); - bool success; - int length = existing_labels_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int i = 0; i < existing_labels_.size(); ++i) { - length = existing_labels_[i].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int j = 0; j < existing_labels_[i].size(); ++j) { - length = existing_labels_[i][j].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int k = 0; k < existing_labels_[i][j].size(); ++k) { - int label = existing_labels_[i][j][k]; - success = WriteInteger(fs, label); - CHECK(success); - } - int distance; - distance = maximum_left_distances_[i][j]; - success = WriteInteger(fs, distance); - CHECK(success); - distance = maximum_right_distances_[i][j]; - success = WriteInteger(fs, distance); - CHECK(success); - } - } - } - - void Load(FILE *fs) { - if (0 > label_alphabet_.Load(fs)) CHECK(false); - bool success; - int length; - success = ReadInteger(fs, 
&length); - CHECK(success); - existing_labels_.resize(length); - maximum_left_distances_.resize(length); - maximum_right_distances_.resize(length); - for (int i = 0; i < existing_labels_.size(); ++i) { - success = ReadInteger(fs, &length); - CHECK(success); - existing_labels_[i].resize(length); - maximum_left_distances_[i].resize(length); - maximum_right_distances_[i].resize(length); - for (int j = 0; j < existing_labels_[i].size(); ++j) { - success = ReadInteger(fs, &length); - CHECK(success); - existing_labels_[i][j].resize(length); - for (int k = 0; k < existing_labels_[i][j].size(); ++k) { - int label; - success = ReadInteger(fs, &label); - CHECK(success); - existing_labels_[i][j][k] = label; - } - int distance; - success = ReadInteger(fs, &distance); - CHECK(success); - maximum_left_distances_[i][j] = distance; - success = ReadInteger(fs, &distance); - CHECK(success); - maximum_right_distances_[i][j] = distance; - } - } - BuildLabelNames(); - } - - TokenDictionary *GetTokenDictionary() const { return token_dictionary_; } - void SetTokenDictionary(TokenDictionary *token_dictionary) { - token_dictionary_ = token_dictionary; - //CHECK(token_dictionary_ == NULL); - } - - const vector &GetExistingLabels(int modifier_pos_id, int head_pos_id) { - return existing_labels_[modifier_pos_id][head_pos_id]; - } - - int GetMaximumLeftDistance(int modifier_pos_id, int head_pos_id) { - return maximum_left_distances_[modifier_pos_id][head_pos_id]; - } - - int GetMaximumRightDistance(int modifier_pos_id, int head_pos_id) { - return maximum_right_distances_[modifier_pos_id][head_pos_id]; - } - - const Alphabet &GetLabelAlphabet() const { return label_alphabet_; }; - -protected: - Pipe *pipe_; - TokenDictionary *token_dictionary_; - Alphabet label_alphabet_; - vector > > existing_labels_; - vector > maximum_left_distances_; - vector > maximum_right_distances_; -}; - -class DependencyTokenDictionary : public TokenDictionary { -public: - DependencyTokenDictionary() {}; - virtual 
~DependencyTokenDictionary() {}; - void Initialize(DependencyReader *reader); -}; -#endif /* DEPENDENCYDICTIONARY_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef DEPENDENCYDICTIONARY_H_ +#define DEPENDENCYDICTIONARY_H_ + +#include "Dictionary.h" +#include "TokenDictionary.h" +#include "DependencyReader.h" +#include "SerializationUtils.h" + +class Pipe; + +class DependencyDictionary : public Dictionary { +public: + DependencyDictionary() { token_dictionary_ = NULL; } + DependencyDictionary(Pipe* pipe) : pipe_(pipe) {} + virtual ~DependencyDictionary() { + Clear(); + } + + void CreateLabelDictionary(DependencyReader *reader); + + void Clear() { + // Don't clear token_dictionary, since this class does not own it. 
+ label_alphabet_.clear(); + existing_labels_.clear(); + maximum_left_distances_.clear(); + maximum_right_distances_.clear(); + } + + void BuildLabelNames() { + label_alphabet_.BuildNames(); + } + + const string &GetLabelName(int label) const { + return label_alphabet_.GetName(label); + } + + void AllowGrowth() { token_dictionary_->AllowGrowth(); } + void StopGrowth() { token_dictionary_->StopGrowth(); } + + void Save(FILE *fs) { + if (0 > label_alphabet_.Save(fs)) CHECK(false); + bool success; + int length = (int)existing_labels_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int i = 0; i < existing_labels_.size(); ++i) { + length = (int)existing_labels_[i].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < existing_labels_[i].size(); ++j) { + length = (int)existing_labels_[i][j].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int k = 0; k < existing_labels_[i][j].size(); ++k) { + int label = existing_labels_[i][j][k]; + success = WriteInteger(fs, label); + CHECK(success); + } + int distance; + distance = maximum_left_distances_[i][j]; + success = WriteInteger(fs, distance); + CHECK(success); + distance = maximum_right_distances_[i][j]; + success = WriteInteger(fs, distance); + CHECK(success); + } + } + } + + void Load(FILE *fs) { + if (0 > label_alphabet_.Load(fs)) CHECK(false); + bool success; + int length; + success = ReadInteger(fs, &length); + CHECK(success); + existing_labels_.resize(length); + maximum_left_distances_.resize(length); + maximum_right_distances_.resize(length); + for (int i = 0; i < existing_labels_.size(); ++i) { + success = ReadInteger(fs, &length); + CHECK(success); + existing_labels_[i].resize(length); + maximum_left_distances_[i].resize(length); + maximum_right_distances_[i].resize(length); + for (int j = 0; j < existing_labels_[i].size(); ++j) { + success = ReadInteger(fs, &length); + CHECK(success); + existing_labels_[i][j].resize(length); + for (int k = 
0; k < existing_labels_[i][j].size(); ++k) { + int label; + success = ReadInteger(fs, &label); + CHECK(success); + existing_labels_[i][j][k] = label; + } + int distance; + success = ReadInteger(fs, &distance); + CHECK(success); + maximum_left_distances_[i][j] = distance; + success = ReadInteger(fs, &distance); + CHECK(success); + maximum_right_distances_[i][j] = distance; + } + } + BuildLabelNames(); + } + + TokenDictionary *GetTokenDictionary() const { return token_dictionary_; } + void SetTokenDictionary(TokenDictionary *token_dictionary) { + token_dictionary_ = token_dictionary; + //CHECK(token_dictionary_ == NULL); + } + + const vector &GetExistingLabels(int modifier_pos_id, int head_pos_id) { + return existing_labels_[modifier_pos_id][head_pos_id]; + } + + int GetMaximumLeftDistance(int modifier_pos_id, int head_pos_id) { + return maximum_left_distances_[modifier_pos_id][head_pos_id]; + } + + int GetMaximumRightDistance(int modifier_pos_id, int head_pos_id) { + return maximum_right_distances_[modifier_pos_id][head_pos_id]; + } + + const Alphabet &GetLabelAlphabet() const { return label_alphabet_; }; + +protected: + Pipe *pipe_; + TokenDictionary *token_dictionary_; + Alphabet label_alphabet_; + vector > > existing_labels_; + vector > maximum_left_distances_; + vector > maximum_right_distances_; +}; + +class DependencyTokenDictionary : public TokenDictionary { +public: + DependencyTokenDictionary() {}; + virtual ~DependencyTokenDictionary() {}; + void Initialize(DependencyReader *reader); +}; +#endif /* DEPENDENCYDICTIONARY_H_ */ diff --git a/src/parser/DependencyFeatures.h b/src/parser/DependencyFeatures.h index b575458..20bf226 100644 --- a/src/parser/DependencyFeatures.h +++ b/src/parser/DependencyFeatures.h @@ -1,172 +1,172 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef DEPENDENCYFEATURES_H_ -#define DEPENDENCYFEATURES_H_ - -#include "Features.h" -#include "DependencyInstanceNumeric.h" -#include "FeatureEncoder.h" - -class DependencyOptions; - -// This class implements the features for dependency parsing. -// The feature templates are largely inspired by the ones used in MSTParser -// (http://sourceforge.net/projects/mstparser/) and egstra -// (http://groups.csail.mit.edu/nlp/egstra/), which are described in -// the following papers: -// -// Non-Projective Dependency Parsing using Spanning Tree Algorithms. -// R. McDonald, F. Pereira, K. Ribarov and J. Hajič. -// Human Language Technologies and Empirical Methods in Natural Language -// Processing (HLT-EMNLP), 2005. -// -// M. Collins, A. Glober­son, T. Koo, X. Car­reras, and P. Bartlett. -// Ex­po­nen­ti­at­ed Gra­di­ent Al­go­rithms for Con­di­tion­al Ran­dom Fields and -// Max-​Mar­gin Markov Net­works. -// Jour­nal of Ma­chine Learn­ing Re­search 9(Aug):​1775–1822, 2008. 
- -class DependencyFeatures : public Features { -public: - DependencyFeatures() {}; - DependencyFeatures(Pipe* pipe) { pipe_ = pipe; } - virtual ~DependencyFeatures() { Clear(); } - -public: - void Clear() { - for (int r = 0; r < input_features_.size(); ++r) { - if (!input_features_[r]) continue; - input_features_[r]->clear(); - delete input_features_[r]; - input_features_[r] = NULL; - } - input_features_.clear(); - } - - void Initialize(Instance *instance, Parts *parts) { - Clear(); - input_features_.resize(parts->size(), static_cast(NULL)); - } - - int GetNumPartFeatures(int r) const { - return (NULL == input_features_[r]) ? 0 : input_features_[r]->size(); - }; - - int GetPartFeature(int r, int j) const { - return (*input_features_[r])[j]; - } - - const BinaryFeatures &GetPartFeatures(int r) const { - return *(input_features_[r]); - }; - - BinaryFeatures *GetMutablePartFeatures(int r) const { - return input_features_[r]; - }; - -public: - void AddArcFeaturesLight(DependencyInstanceNumeric *sentence, - int r, - int head, - int modifier); - - void AddArcFeatures(DependencyInstanceNumeric *sentence, - int r, - int head, - int modifier); - - void AddArbitrarySiblingFeatures(DependencyInstanceNumeric* sentence, - int r, - int head, - int modifier, - int sibling); - - void AddConsecutiveSiblingFeatures(DependencyInstanceNumeric* sentence, - int r, - int head, - int modifier, - int sibling); - - void AddSiblingFeatures(DependencyInstanceNumeric* sentence, - int r, - int head, - int modifier, - int sibling, - bool consecutive); - - void AddGrandparentFeatures(DependencyInstanceNumeric* sentence, - int r, - int grandparent, - int head, - int modifier); - - void AddGrandSiblingFeatures(DependencyInstanceNumeric* sentence, - int r, - int grandparent, - int head, - int modifier, - int sibling); - - void AddTriSiblingFeatures(DependencyInstanceNumeric* sentence, - int r, - int head, - int modifier, - int sibling, - int other_sibling); - - void 
AddNonprojectiveArcFeatures(DependencyInstanceNumeric* sentence, - int r, - int head, - int modifier); - - void AddDirectedPathFeatures(DependencyInstanceNumeric* sentence, - int r, - int ancestor, - int descendant); - - void AddHeadBigramFeatures(DependencyInstanceNumeric* sentence, - int r, - int head, - int modifier, - int previous_head); - -protected: - void AddWordPairFeatures(DependencyInstanceNumeric* sentence, - int pair_type, - int head, - int modifier, - bool use_lemma_features, - bool use_morphological_features, - BinaryFeatures *features); - - void AddWordPairFeaturesMST(DependencyInstanceNumeric* sentence, - int pair_type, - int head, - int modifier, - BinaryFeatures *features); - - void AddFeature(uint64_t fkey, BinaryFeatures* features) { - features->push_back(fkey); - } - -protected: - vector input_features_; // Vector of input features. - FeatureEncoder encoder_; // Encoder that converts features into a codeword. -}; - -#endif /* DEPENDENCYFEATURES_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef DEPENDENCYFEATURES_H_ +#define DEPENDENCYFEATURES_H_ + +#include "Features.h" +#include "DependencyInstanceNumeric.h" +#include "FeatureEncoder.h" + +class DependencyOptions; + +// This class implements the features for dependency parsing. +// The feature templates are largely inspired by the ones used in MSTParser +// (http://sourceforge.net/projects/mstparser/) and egstra +// (http://groups.csail.mit.edu/nlp/egstra/), which are described in +// the following papers: +// +// Non-Projective Dependency Parsing using Spanning Tree Algorithms. +// R. McDonald, F. Pereira, K. Ribarov and J. Hajič. +// Human Language Technologies and Empirical Methods in Natural Language +// Processing (HLT-EMNLP), 2005. +// +// M. Collins, A. Glober­son, T. Koo, X. Car­reras, and P. Bartlett. +// Ex­po­nen­ti­at­ed Gra­di­ent Al­go­rithms for Con­di­tion­al Ran­dom Fields and +// Max-​Mar­gin Markov Net­works. +// Jour­nal of Ma­chine Learn­ing Re­search 9(Aug):​1775–1822, 2008. + +class DependencyFeatures : public Features { +public: + DependencyFeatures() {}; + DependencyFeatures(Pipe* pipe) { pipe_ = pipe; } + virtual ~DependencyFeatures() { Clear(); } + +public: + void Clear() { + for (int r = 0; r < input_features_.size(); ++r) { + if (!input_features_[r]) continue; + input_features_[r]->clear(); + delete input_features_[r]; + input_features_[r] = NULL; + } + input_features_.clear(); + } + + void Initialize(Instance *instance, Parts *parts) { + Clear(); + input_features_.resize(parts->size(), static_cast(NULL)); + } + + int GetNumPartFeatures(int r) const { + return (NULL == input_features_[r]) ? 
0 : (int)(input_features_[r]->size()); + }; + + int GetPartFeature(int r, int j) const { + return (*input_features_[r])[j]; + } + + const BinaryFeatures &GetPartFeatures(int r) const { + return *(input_features_[r]); + }; + + BinaryFeatures *GetMutablePartFeatures(int r) const { + return input_features_[r]; + }; + +public: + void AddArcFeaturesLight(DependencyInstanceNumeric *sentence, + int r, + int head, + int modifier); + + void AddArcFeatures(DependencyInstanceNumeric *sentence, + int r, + int head, + int modifier); + + void AddArbitrarySiblingFeatures(DependencyInstanceNumeric* sentence, + int r, + int head, + int modifier, + int sibling); + + void AddConsecutiveSiblingFeatures(DependencyInstanceNumeric* sentence, + int r, + int head, + int modifier, + int sibling); + + void AddSiblingFeatures(DependencyInstanceNumeric* sentence, + int r, + int head, + int modifier, + int sibling, + bool consecutive); + + void AddGrandparentFeatures(DependencyInstanceNumeric* sentence, + int r, + int grandparent, + int head, + int modifier); + + void AddGrandSiblingFeatures(DependencyInstanceNumeric* sentence, + int r, + int grandparent, + int head, + int modifier, + int sibling); + + void AddTriSiblingFeatures(DependencyInstanceNumeric* sentence, + int r, + int head, + int modifier, + int sibling, + int other_sibling); + + void AddNonprojectiveArcFeatures(DependencyInstanceNumeric* sentence, + int r, + int head, + int modifier); + + void AddDirectedPathFeatures(DependencyInstanceNumeric* sentence, + int r, + int ancestor, + int descendant); + + void AddHeadBigramFeatures(DependencyInstanceNumeric* sentence, + int r, + int head, + int modifier, + int previous_head); + +protected: + void AddWordPairFeatures(DependencyInstanceNumeric* sentence, + int pair_type, + int head, + int modifier, + bool use_lemma_features, + bool use_morphological_features, + BinaryFeatures *features); + + void AddWordPairFeaturesMST(DependencyInstanceNumeric* sentence, + int pair_type, + int head, + 
int modifier, + BinaryFeatures *features); + + void AddFeature(uint64_t fkey, BinaryFeatures* features) { + features->push_back(fkey); + } + +protected: + vector input_features_; // Vector of input features. + FeatureEncoder encoder_; // Encoder that converts features into a codeword. +}; + +#endif /* DEPENDENCYFEATURES_H_ */ diff --git a/src/parser/DependencyInstance.h b/src/parser/DependencyInstance.h index c467d4d..3bd7f45 100644 --- a/src/parser/DependencyInstance.h +++ b/src/parser/DependencyInstance.h @@ -1,82 +1,82 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef DEPENDENCYINSTANCE_H_ -#define DEPENDENCYINSTANCE_H_ - -#include -#include -#include "Instance.h" - -class DependencyInstance : public Instance { -public: - DependencyInstance() {}; - virtual ~DependencyInstance() {}; - - Instance* Copy() { - DependencyInstance* instance = new DependencyInstance(); - instance->Initialize(forms_, lemmas_, cpostags_, postags_, - feats_, deprels_, heads_); - return static_cast(instance); - } - - void Initialize(const vector &forms, - const vector &lemmas, - const vector &cpos, - const vector &pos, - const vector > &feats, - const vector &deprels, - const vector &heads); - - int size() { return forms_.size(); }; - - const vector &GetHeads() { return heads_; } - const vector &GetDependencyRelations() { return deprels_; } - - const string &GetForm(int i) { return forms_[i]; }; - const string &GetLemma(int i) { return lemmas_[i]; }; - const string &GetCoarsePosTag(int i) { return cpostags_[i]; }; - const string &GetPosTag(int i) { return postags_[i]; }; - int GetNumMorphFeatures(int i) { return feats_[i].size(); }; - const string &GetMorphFeature(int i, int j) { return feats_[i][j]; }; - int GetHead(int i) { return heads_[i]; }; - const string &GetDependencyRelation(int i) { return deprels_[i]; }; - - void SetHead(int i, int head) { heads_[i] = head; } - void SetDependencyRelation(int i, const string &dependency_relation) { - deprels_[i] = dependency_relation; - } - -protected: - // FORM: the forms - usually words, like "thought" - vector forms_; - // LEMMA: the lemmas, or stems, e.g. "think" - vector lemmas_; - // COURSE-POS: the course part-of-speech tags, e.g."V" - vector cpostags_; - // FINE-POS: the fine-grained part-of-speech tags, e.g."VBD" - vector postags_; - // FEATURES: some features associated with the elements separated by "|", e.g. "PAST|3P" - vector > feats_; - // HEAD: the IDs of the heads for each element - vector heads_; - // DEPREL: the dependency relations, e.g. 
"SUBJ" - vector deprels_; -}; - -#endif /* DEPENDENCYINSTANCE_H_*/ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef DEPENDENCYINSTANCE_H_ +#define DEPENDENCYINSTANCE_H_ + +#include +#include +#include "Instance.h" + +class DependencyInstance : public Instance { +public: + DependencyInstance() {}; + virtual ~DependencyInstance() {}; + + Instance* Copy() { + DependencyInstance* instance = new DependencyInstance(); + instance->Initialize(forms_, lemmas_, cpostags_, postags_, + feats_, deprels_, heads_); + return static_cast(instance); + } + + void Initialize(const vector &forms, + const vector &lemmas, + const vector &cpos, + const vector &pos, + const vector > &feats, + const vector &deprels, + const vector &heads); + + int size() { return (int) forms_.size(); }; + + const vector &GetHeads() { return heads_; } + const vector &GetDependencyRelations() { return deprels_; } + + const string &GetForm(int i) { return forms_[i]; }; + const string &GetLemma(int i) { return lemmas_[i]; }; + const string &GetCoarsePosTag(int i) { return cpostags_[i]; }; + const string &GetPosTag(int i) { return postags_[i]; }; + int GetNumMorphFeatures(int i) { return (int)(feats_[i].size()); }; + const string &GetMorphFeature(int i, int j) { return feats_[i][j]; }; + int 
GetHead(int i) { return heads_[i]; }; + const string &GetDependencyRelation(int i) { return deprels_[i]; }; + + void SetHead(int i, int head) { heads_[i] = head; } + void SetDependencyRelation(int i, const string &dependency_relation) { + deprels_[i] = dependency_relation; + } + +protected: + // FORM: the forms - usually words, like "thought" + vector forms_; + // LEMMA: the lemmas, or stems, e.g. "think" + vector lemmas_; + // COURSE-POS: the course part-of-speech tags, e.g."V" + vector cpostags_; + // FINE-POS: the fine-grained part-of-speech tags, e.g."VBD" + vector postags_; + // FEATURES: some features associated with the elements separated by "|", e.g. "PAST|3P" + vector > feats_; + // HEAD: the IDs of the heads for each element + vector heads_; + // DEPREL: the dependency relations, e.g. "SUBJ" + vector deprels_; +}; + +#endif /* DEPENDENCYINSTANCE_H_*/ diff --git a/src/parser/DependencyInstanceNumeric.cpp b/src/parser/DependencyInstanceNumeric.cpp index 770a4f6..cdab4e8 100644 --- a/src/parser/DependencyInstanceNumeric.cpp +++ b/src/parser/DependencyInstanceNumeric.cpp @@ -53,8 +53,8 @@ void DependencyInstanceNumeric::Initialize( for (i = 0; i < length; i++) { std::string form = instance->GetForm(i); std::string form_lower(form); - transform(form_lower.begin(), form_lower.end(), form_lower.begin(), - ::tolower); + std::transform(form_lower.begin(), form_lower.end(), form_lower.begin(), + ::tolower); if (!form_case_sensitive) form = form_lower; id = token_dictionary->GetFormId(form); CHECK_LT(id, 0xffff); @@ -143,4 +143,4 @@ void DependencyInstanceNumeric::Initialize( relations_[i] = dictionary.GetLabelAlphabet().Lookup( instance->GetDependencyRelation(i)); } -} +} diff --git a/src/parser/DependencyInstanceNumeric.h b/src/parser/DependencyInstanceNumeric.h index 252f0b3..04d5805 100644 --- a/src/parser/DependencyInstanceNumeric.h +++ b/src/parser/DependencyInstanceNumeric.h @@ -1,155 +1,155 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights 
Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef DEPENDENCYINSTANCENUMERIC_H_ -#define DEPENDENCYINSTANCENUMERIC_H_ - -#include -#include -#include "Dictionary.h" -#include "Instance.h" -#include "DependencyInstance.h" -#include "DependencyDictionary.h" - -using namespace std; - -class DependencyInstanceNumeric : public Instance { -public: - DependencyInstanceNumeric() {}; - virtual ~DependencyInstanceNumeric() { Clear(); }; - - Instance* Copy() { - CHECK(false) << "Not implemented."; - return NULL; - } - - int size() { return form_ids_.size(); }; - - void Clear() { - form_ids_.clear(); - form_lower_ids_.clear(); - lemma_ids_.clear(); - prefix_ids_.clear(); - suffix_ids_.clear(); - pos_ids_.clear(); - cpos_ids_.clear(); - for (int i = 0; i < feats_ids_.size(); i++) { - feats_ids_[i].clear(); - } - //shapes_.clear(); - is_noun_.clear(); - is_verb_.clear(); - is_punc_.clear(); - is_coord_.clear(); - heads_.clear(); - } - - void Initialize(const DependencyDictionary &dictionary, - DependencyInstance *instance); - - void GetAllAncestors(const std::vector &heads, - int descend, - std::vector* ancestors) const { - ancestors->clear(); - int h = heads[descend]; - while (h >= 0) { - ancestors->push_back(h); - h = heads[h]; - } - } - -#if 0 - // TODO(atm): this is repeated in other tasks. 
Should move some of these - // functions to a common class and use inheritance. - void GetWordShape(const string &word, string *shape) { - string type = ""; - char last = '\0'; - for (int i = 0; i < word.size(); ++i) { - if (word[i] >= 'A' && word[i] <= 'Z') { - if (last != 'A') { - type += 'A'; - last = 'A'; - } else if (type[type.size() - 1] != '+') { - type += '+'; - } - } else if (word[i] >= 'a' && word[i] <= 'z') { - if (last != 'a') { - type += 'a'; - last = 'a'; - } else if (type[type.size() - 1] != '+') { - type += '+'; - } - } else if (word[i] >= '0' && word[i] <= '9') { - if (last != '0') { - type += '0'; - last = '0'; - } else if (type[type.size() - 1] != '+') { - type += '+'; - last = '0'; - } - } else { - type += word[i]; - } - } - *shape = type; - } -#endif - - const vector &GetFormIds() const { return form_ids_; } - const vector &GetFormLowerIds() const { return form_lower_ids_; } - const vector &GetLemmaIds() const { return lemma_ids_; } - const vector &GetPosIds() const { return pos_ids_; } - const vector &GetCoarsePosIds() const { return cpos_ids_; } - const vector &GetHeads() const { return heads_; } - const vector &GetRelations() const { return relations_; } - - int GetFormId(int i) { return form_ids_[i]; }; - int GetFormLowerId(int i) { return form_lower_ids_[i]; }; - int GetLemmaId(int i) { return lemma_ids_[i]; }; - int GetPrefixId(int i) { return prefix_ids_[i]; }; - int GetSuffixId(int i) { return suffix_ids_[i]; }; - int GetPosId(int i) { return pos_ids_[i]; }; - int GetCoarsePosId(int i) { return cpos_ids_[i]; }; - int GetNumMorphFeatures(int i) { return feats_ids_[i].size(); }; - int GetMorphFeature(int i, int j) { return feats_ids_[i][j]; }; - bool IsNoun(int i) { return is_noun_[i]; }; - bool IsVerb(int i) { return is_verb_[i]; }; - bool IsPunctuation(int i) { return is_punc_[i]; }; - bool IsCoordination(int i) { return is_coord_[i]; }; - int GetHead(int i) { return heads_[i]; }; - int GetRelationId(int i) { return relations_[i]; }; - 
-protected: - vector form_ids_; - vector form_lower_ids_; - vector lemma_ids_; - vector prefix_ids_; - vector suffix_ids_; - vector pos_ids_; - vector cpos_ids_; - vector > feats_ids_; - //vector shapes_; - vector is_noun_; - vector is_verb_; - vector is_punc_; - vector is_coord_; - vector heads_; - vector relations_; -}; - -#endif /* DEPENDENCYINSTANCENUMERIC_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef DEPENDENCYINSTANCENUMERIC_H_ +#define DEPENDENCYINSTANCENUMERIC_H_ + +#include +#include +#include "Dictionary.h" +#include "Instance.h" +#include "DependencyInstance.h" +#include "DependencyDictionary.h" + +using namespace std; + +class DependencyInstanceNumeric : public Instance { +public: + DependencyInstanceNumeric() {}; + virtual ~DependencyInstanceNumeric() { Clear(); }; + + Instance* Copy() { + CHECK(false) << "Not implemented."; + return NULL; + } + + int size() { return (int)form_ids_.size(); }; + + void Clear() { + form_ids_.clear(); + form_lower_ids_.clear(); + lemma_ids_.clear(); + prefix_ids_.clear(); + suffix_ids_.clear(); + pos_ids_.clear(); + cpos_ids_.clear(); + for (int i = 0; i < feats_ids_.size(); i++) { + feats_ids_[i].clear(); + } + //shapes_.clear(); + is_noun_.clear(); + is_verb_.clear(); + is_punc_.clear(); + is_coord_.clear(); + heads_.clear(); + } + + void Initialize(const DependencyDictionary &dictionary, + DependencyInstance *instance); + + void GetAllAncestors(const std::vector &heads, + int descend, + std::vector* ancestors) const { + ancestors->clear(); + int h = heads[descend]; + while (h >= 0) { + ancestors->push_back(h); + h = heads[h]; + } + } + +#if 0 + // TODO(atm): this is repeated in other tasks. Should move some of these + // functions to a common class and use inheritance. 
+ void GetWordShape(const string &word, string *shape) { + string type = ""; + char last = '\0'; + for (int i = 0; i < word.size(); ++i) { + if (word[i] >= 'A' && word[i] <= 'Z') { + if (last != 'A') { + type += 'A'; + last = 'A'; + } else if (type[type.size() - 1] != '+') { + type += '+'; + } + } else if (word[i] >= 'a' && word[i] <= 'z') { + if (last != 'a') { + type += 'a'; + last = 'a'; + } else if (type[type.size() - 1] != '+') { + type += '+'; + } + } else if (word[i] >= '0' && word[i] <= '9') { + if (last != '0') { + type += '0'; + last = '0'; + } else if (type[type.size() - 1] != '+') { + type += '+'; + last = '0'; + } + } else { + type += word[i]; + } + } + *shape = type; + } +#endif + + const vector &GetFormIds() const { return form_ids_; } + const vector &GetFormLowerIds() const { return form_lower_ids_; } + const vector &GetLemmaIds() const { return lemma_ids_; } + const vector &GetPosIds() const { return pos_ids_; } + const vector &GetCoarsePosIds() const { return cpos_ids_; } + const vector &GetHeads() const { return heads_; } + const vector &GetRelations() const { return relations_; } + + int GetFormId(int i) { return form_ids_[i]; }; + int GetFormLowerId(int i) { return form_lower_ids_[i]; }; + int GetLemmaId(int i) { return lemma_ids_[i]; }; + int GetPrefixId(int i) { return prefix_ids_[i]; }; + int GetSuffixId(int i) { return suffix_ids_[i]; }; + int GetPosId(int i) { return pos_ids_[i]; }; + int GetCoarsePosId(int i) { return cpos_ids_[i]; }; + int GetNumMorphFeatures(int i) { return (int) feats_ids_[i].size(); }; + int GetMorphFeature(int i, int j) { return feats_ids_[i][j]; }; + bool IsNoun(int i) { return is_noun_[i]; }; + bool IsVerb(int i) { return is_verb_[i]; }; + bool IsPunctuation(int i) { return is_punc_[i]; }; + bool IsCoordination(int i) { return is_coord_[i]; }; + int GetHead(int i) { return heads_[i]; }; + int GetRelationId(int i) { return relations_[i]; }; + +protected: + vector form_ids_; + vector form_lower_ids_; + vector 
lemma_ids_; + vector prefix_ids_; + vector suffix_ids_; + vector pos_ids_; + vector cpos_ids_; + vector > feats_ids_; + //vector shapes_; + vector is_noun_; + vector is_verb_; + vector is_punc_; + vector is_coord_; + vector heads_; + vector relations_; +}; + +#endif /* DEPENDENCYINSTANCENUMERIC_H_ */ diff --git a/src/parser/DependencyPart.h b/src/parser/DependencyPart.h index 6e82d88..e799b63 100644 --- a/src/parser/DependencyPart.h +++ b/src/parser/DependencyPart.h @@ -482,7 +482,7 @@ class DependencyParts : public Parts { void BuildOffsets() { for (int i = NUM_DEPENDENCYPARTS - 1; i >= 0; --i) { if (offsets_[i] < 0) { - offsets_[i] = (i == NUM_DEPENDENCYPARTS - 1) ? size() : offsets_[i + 1]; + offsets_[i] = (i == NUM_DEPENDENCYPARTS - 1) ? (int)size() : offsets_[i + 1]; } } }; @@ -554,7 +554,7 @@ class DependencyParts : public Parts { void GetOffset(int i, int *offset, int *size) const { *offset = offsets_[i]; *size = (i < NUM_DEPENDENCYPARTS - 1) ? offsets_[i + 1] - (*offset) : - DependencyParts::size() - (*offset); + (int)DependencyParts::size() - (*offset); } // Set offset from part index. 
diff --git a/src/parser/DependencyPipe.cpp b/src/parser/DependencyPipe.cpp index 204b160..18e3e4c 100644 --- a/src/parser/DependencyPipe.cpp +++ b/src/parser/DependencyPipe.cpp @@ -47,15 +47,13 @@ void DependencyPipe::SaveModel(FILE* fs) { void DependencyPipe::LoadModel(FILE* fs) { bool success; - uint64_t model_check; - uint64_t model_version; - success = ReadUINT64(fs, &model_check); + success = ReadUINT64(fs, &model_check_); CHECK(success); - CHECK_EQ(model_check, kParserModelCheck) + CHECK_EQ(model_check_, kParserModelCheck) << "The model file is too old and not supported anymore."; - success = ReadUINT64(fs, &model_version); + success = ReadUINT64(fs, &model_version_); CHECK(success); - CHECK_GE(model_version, kOldestCompatibleParserModelVersion) + CHECK_GE(model_version_, kOldestCompatibleParserModelVersion) << "The model file is too old and not supported anymore."; delete token_dictionary_; CreateTokenDictionary(); @@ -1531,4 +1529,4 @@ void DependencyPipe::LabelInstance(Parts *parts, const vector &output, } } } -} +} diff --git a/src/parser/DependencyPipe.h b/src/parser/DependencyPipe.h index bd35804..0417d63 100644 --- a/src/parser/DependencyPipe.h +++ b/src/parser/DependencyPipe.h @@ -223,7 +223,7 @@ class DependencyPipe : public Pipe { num_head_pruned_mistakes_ = 0; num_heads_after_pruning_ = 0; num_tokens_ = 0; - gettimeofday(&start_clock_, NULL); + chrono.GetTime(); } virtual void EvaluateInstance(Instance *instance, Instance *output_instance, @@ -269,10 +269,8 @@ class DependencyPipe : public Pipe { static_cast(num_heads_after_pruning_) / static_cast(num_tokens_) << " possible heads per token."; - timeval end_clock; - gettimeofday(&end_clock, NULL); - double num_seconds = - static_cast(diff_ms(end_clock, start_clock_)) / 1000.0; + chrono.StopTime(); + double num_seconds = chrono.GetElapsedTime(); double tokens_per_second = static_cast(num_tokens_) / num_seconds; LOG(INFO) << "Parsing speed: " << tokens_per_second << " tokens per second."; @@ -296,7 
+294,7 @@ class DependencyPipe : public Pipe { int num_head_pruned_mistakes_; int num_heads_after_pruning_; int num_tokens_; - timeval start_clock_; + chronowrap::Chronometer chrono; }; #endif /* DEPENDENCYPIPE_H_ */ diff --git a/src/parser/DependencyWriter.cpp b/src/parser/DependencyWriter.cpp index 2f53abd..a6bcf9b 100644 --- a/src/parser/DependencyWriter.cpp +++ b/src/parser/DependencyWriter.cpp @@ -37,3 +37,5 @@ void DependencyWriter::Write(Instance *instance) { } os_ << endl; } + +void DependencyWriter::WriteFormatted(Pipe * pipe, Instance *instance) {} diff --git a/src/parser/DependencyWriter.h b/src/parser/DependencyWriter.h index 2138bc1..84c98c0 100644 --- a/src/parser/DependencyWriter.h +++ b/src/parser/DependencyWriter.h @@ -31,6 +31,7 @@ class DependencyWriter : public Writer { public: void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); }; #endif /* DEPENDENCYWRITER_H_ */ diff --git a/src/parser/TurboParser.cpp b/src/parser/TurboParser.cpp index 3d4173a..10125c2 100644 --- a/src/parser/TurboParser.cpp +++ b/src/parser/TurboParser.cpp @@ -1,129 +1,127 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "DependencyPipe.h" - -using namespace std; - -void TrainParser(); -void TestParser(); - -int main(int argc, char** argv) { - // Initialize Google's logging library. - google::InitGoogleLogging(argv[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_train) { - LOG(INFO) << "Training parser..." << endl; - TrainParser(); - } else if (FLAGS_test) { - LOG(INFO) << "Running parser..." << endl; - TestParser(); - } - - // Destroy allocated memory regarding line flags. - google::ShutDownCommandLineFlags(); - google::ShutdownGoogleLogging(); - return 0; -} - -void TrainParser() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - DependencyOptions *options = new DependencyOptions; - options->Initialize(); - - DependencyPipe *pipe = new DependencyPipe(options); - pipe->Initialize(); - - if (options->prune_basic()) { - if (options->use_pretrained_pruner()) { - pipe->LoadPrunerModelFile(); - } else { - // Train the pruner. - LOG(INFO) << "Training the pruner..."; - DependencyOptions *pruner_options = new DependencyOptions; - *pruner_options = *options; - // Transform things such as pruner_train_algorithm - // in train_algorithm. - pruner_options->CopyPrunerFlags(); - pruner_options->Initialize(); - DependencyPipe *pruner_pipe = new DependencyPipe(pruner_options); - pruner_pipe->Initialize(); - - pruner_pipe->Train(); - pipe->SetPrunerParameters(pruner_pipe->GetParameters()); - // This is necessary so that the pruner parameters are not - // destroyed when deleting the pruner pipe. 
- pruner_pipe->SetParameters(NULL); - - delete pruner_pipe; - delete pruner_options; - } - } - - LOG(INFO) << "Training the parser..."; - pipe->Train(); - pipe->SaveModelFile(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Training took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TestParser() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - DependencyOptions *options = new DependencyOptions; - options->Initialize(); - - DependencyPipe *pipe = new DependencyPipe(options); - pipe->Initialize(); - pipe->LoadModelFile(); - pipe->Run(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Testing took " << static_cast(time) / 1000.0 - << " sec." << endl; -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "DependencyPipe.h" + +using namespace std; + +void TrainParser(); +void TestParser(); + +int main(int argc, char** argv) { + // Initialize Google's logging library. + google::InitGoogleLogging(argv[0]); + + // Parse command line flags. 
+ google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_train) { + LOG(INFO) << "Training parser..." << endl; + TrainParser(); + } else if (FLAGS_test) { + LOG(INFO) << "Running parser..." << endl; + TestParser(); + } + + // Destroy allocated memory regarding line flags. + google::ShutDownCommandLineFlags(); + google::ShutdownGoogleLogging(); + return 0; +} + +void TrainParser() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + DependencyOptions *options = new DependencyOptions; + options->Initialize(); + + DependencyPipe *pipe = new DependencyPipe(options); + pipe->Initialize(); + + if (options->prune_basic()) { + if (options->use_pretrained_pruner()) { + pipe->LoadPrunerModelFile(); + } else { + // Train the pruner. + LOG(INFO) << "Training the pruner..."; + DependencyOptions *pruner_options = new DependencyOptions; + *pruner_options = *options; + // Transform things such as pruner_train_algorithm + // in train_algorithm. + pruner_options->CopyPrunerFlags(); + pruner_options->Initialize(); + DependencyPipe *pruner_pipe = new DependencyPipe(pruner_options); + pruner_pipe->Initialize(); + + pruner_pipe->Train(); + pipe->SetPrunerParameters(pruner_pipe->GetParameters()); + // This is necessary so that the pruner parameters are not + // destroyed when deleting the pruner pipe. + pruner_pipe->SetParameters(NULL); + + delete pruner_pipe; + delete pruner_options; + } + } + + LOG(INFO) << "Training the parser..."; + pipe->Train(); + pipe->SaveModelFile(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Training took " << time << " sec." 
<< endl; +} + +void TestParser() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + DependencyOptions *options = new DependencyOptions; + options->Initialize(); + + DependencyPipe *pipe = new DependencyPipe(options); + pipe->Initialize(); + pipe->LoadModelFile(); + pipe->Run(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Testing took " << time << " sec." << endl; +} diff --git a/src/semantic_parser/SemanticDecoder.cpp b/src/semantic_parser/SemanticDecoder.cpp index 75f93f1..8618996 100644 --- a/src/semantic_parser/SemanticDecoder.cpp +++ b/src/semantic_parser/SemanticDecoder.cpp @@ -1,1660 +1,1660 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "SemanticDecoder.h" -#include "SemanticPart.h" -#include "SemanticPipe.h" -#include "AlgUtils.h" -#include -#include -#include "logval.h" -#include "ad3/FactorGraph.h" -#include "FactorSemanticGraph.h" -#include "FactorPredicateAutomaton.h" -#include "FactorArgumentAutomaton.h" - -// Define a matrix of doubles using Eigen. 
-typedef LogVal LogValD; -namespace Eigen { -typedef Eigen::Matrix MatrixXlogd; -} - -using namespace std; - -DEFINE_double(srl_train_cost_false_positives, 1.0, - "Cost for predicting false positives."); -DEFINE_double(srl_train_cost_false_negatives, 1.0, - "Cost for predicting false negatives."); - -void SemanticDecoder::DecodeCostAugmented(Instance *instance, Parts *parts, - const vector &scores, - const vector &gold_output, - vector *predicted_output, - double *cost, - double *loss) { - SemanticParts *semantic_parts = static_cast(parts); - int offset_arcs, num_arcs; - - // TODO(atm): make it possible to penalize wrong predicate parts as well? - // Or unlabeled arcs in addition to labeled arcs? - if (pipe_->GetSemanticOptions()->labeled()) { - semantic_parts->GetOffsetLabeledArc(&offset_arcs, &num_arcs); - } else { - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - } - - //////////////////////////////////////////////////// - // F1: a = 0.5, b = 0.5. - // Recall: a = 0, b = 1. - // In general: - // p = a - (a+b)*z0 - // q = b*sum(z0) - // p'*z + q = a*sum(z) - (a+b)*z0'*z + b*sum(z0) - // = a*(1-z0)'*z + b*(1-z)'*z0. - //////////////////////////////////////////////////// - - // Penalty for predicting 1 when it is 0 (FP). - double a = FLAGS_srl_train_cost_false_positives; - // Penalty for predicting 0 when it is 1 (FN). 
- double b = FLAGS_srl_train_cost_false_negatives; - - // p = 0.5-z0, q = 0.5'*z0, loss = p'*z + q - double q = 0.0; - vector p(num_arcs, 0.0); - - vector scores_cost = scores; - for (int r = 0; r < num_arcs; ++r) { - p[r] = a - (a + b) * gold_output[offset_arcs + r]; - scores_cost[offset_arcs + r] += p[r]; - q += b*gold_output[offset_arcs + r]; - } - - Decode(instance, parts, scores_cost, predicted_output); - - *cost = q; - for (int r = 0; r < num_arcs; ++r) { - *cost += p[r] * (*predicted_output)[offset_arcs + r]; - } - - *loss = *cost; - for (int r = 0; r < parts->size(); ++r) { - *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); - } - -#if 0 - for (int k = 0; k < 2; ++k) { - const vector *output; - string type; - if (k == 0) { - type = "gold"; - output = &gold_output; - } else { - type = "predicted"; - output = predicted_output; - } - for (int r = 0; r < parts->size(); ++r) { - int offset_pred_, offset_arcs_, offset_labeled_arcs_, offset_siblings_, - offset_consecutive_siblings_, num_; - semantic_parts->GetOffsetPredicate(&offset_pred_, &num_); - semantic_parts->GetOffsetArc(&offset_arcs_, &num_); - semantic_parts->GetOffsetLabeledArc(&offset_labeled_arcs_, &num_); - semantic_parts->GetOffsetSibling(&offset_siblings_, &num_); - semantic_parts->GetOffsetConsecutiveSibling(&offset_consecutive_siblings_, &num_); - if (r >= offset_consecutive_siblings_) { - SemanticPartConsecutiveSibling *sibling = - static_cast((*parts)[r]); - if ((*output)[r] > 0) { - LOG(INFO) << type << " consec sibling: " << "[" << r << "]" << " " - << sibling->predicate() << " " - << sibling->sense() << " " - << sibling->first_argument() << " " - << sibling->second_argument() << " " - << (*output)[r]; - } - } else if (r >= offset_siblings_) { - SemanticPartSibling *sibling = - static_cast((*parts)[r]); - if ((*output)[r] > 0) { - LOG(INFO) << type << " sibling: " << "[" << r << "]" << " " - << sibling->predicate() << " " - << sibling->sense() << " " - << 
sibling->first_argument() << " " - << sibling->second_argument() << " " - << (*output)[r]; - } - } else if (r >= offset_labeled_arcs_) { - SemanticPartLabeledArc *labeled_arc = - static_cast((*parts)[r]); - if ((*output)[r] > 0) { - LOG(INFO) << type << " labeled_arc: " << "[" << r << "]" << " " - << labeled_arc->predicate() << " " - << labeled_arc->sense() << " " - << labeled_arc->argument() << " " - << labeled_arc->role() << " " - << (*output)[r]; - } - } else if (r >= offset_arcs_) { - SemanticPartArc *arc = - static_cast((*parts)[r]); - if ((*output)[r] > 0) { - LOG(INFO) << type << " arc: " << "[" << r << "]" << " " - << arc->predicate() << " " - << arc->sense() << " " - << arc->argument() << " " - << (*output)[r]; - } - } else if (r >= offset_pred_) { - SemanticPartPredicate *predicate = - static_cast((*parts)[r]); - if ((*output)[r] > 0) { - LOG(INFO) << type << " predicate: " << "[" << r << "]" << " " - << predicate->predicate() << " " - << predicate->sense() << " " - << (*output)[r]; - } - } else { - CHECK(false); - } - } - } -#endif -} - -void SemanticDecoder::DecodeMarginals(Instance *instance, Parts *parts, - const vector &scores, - const vector &gold_output, - vector *predicted_output, - double *entropy, - double *loss) { - SemanticParts *semantic_parts = static_cast(parts); - - // Right now, only allow marginal inference for arc-factored models. - CHECK(semantic_parts->IsArcFactored()); - - // Create copy of the scores. - vector copied_scores(scores); - vector total_scores; - vector label_marginals; - int offset_predicate_parts, num_predicate_parts; - int offset_arcs, num_arcs; - int offset_labeled_arcs, num_labeled_arcs; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - semantic_parts->GetOffsetLabeledArc(&offset_labeled_arcs, - &num_labeled_arcs); - - // If labeled parsing, decode the labels and update the scores. 
- if (pipe_->GetSemanticOptions()->labeled()) { - DecodeLabelMarginals(instance, parts, copied_scores, &total_scores, - &label_marginals); - for (int r = 0; r < total_scores.size(); ++r) { - // Sum the "labeled" scores to the (eventually) already existing - // "unlabeled" scores. - copied_scores[offset_arcs + r] += total_scores[r]; - } - } - - predicted_output->clear(); - predicted_output->resize(parts->size(), 0.0); - - double log_partition_function; - DecodeBasicMarginals(instance, parts, copied_scores, predicted_output, - &log_partition_function, entropy); - - // If labeled parsing, write the components of the predicted output that - // correspond to the labeled parts. - if (pipe_->GetSemanticOptions()->labeled()) { - for (int r = 0; r < num_labeled_arcs; ++r) { - SemanticPartLabeledArc *labeled_arc = - static_cast( - (*parts)[offset_labeled_arcs + r]); - int index_arc = semantic_parts->FindArc(labeled_arc->predicate(), - labeled_arc->argument(), - labeled_arc->sense()); - CHECK_GE(index_arc, 0); - (*predicted_output)[offset_labeled_arcs + r] = - label_marginals[r] * (*predicted_output)[index_arc]; - } - - // Recompute the entropy. 
- *entropy = log_partition_function; - for (int r = 0; r < num_predicate_parts; ++r) { - *entropy -= (*predicted_output)[offset_predicate_parts + r] * - scores[offset_predicate_parts + r]; - } - for (int r = 0; r < num_arcs; ++r) { - *entropy -= (*predicted_output)[offset_arcs + r] * - scores[offset_arcs + r]; - } - for (int r = 0; r < num_labeled_arcs; ++r) { - *entropy -= (*predicted_output)[offset_labeled_arcs + r] * - scores[offset_labeled_arcs + r]; - } - if (*entropy < 0.0) { - LOG(INFO) << "Entropy truncated to zero (" << *entropy << ")"; - *entropy = 0.0; - } - } - - *loss = *entropy; - for (int r = 0; r < parts->size(); ++r) { - *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); - } - if (*loss < 0.0) { - LOG(INFO) << "Loss truncated to zero (" << *loss << ")"; - *loss = 0.0; - } -} - -// Decode the best label for each candidate arc. The output vector -// best_labeled_parts, indexed by the unlabeled arcs, contains the indices -// of the best labeled part for each arc. -void SemanticDecoder::DecodeLabels(Instance *instance, Parts *parts, - const vector &scores, - vector *best_labeled_parts) { - SemanticParts *semantic_parts = static_cast(parts); - - int offset, num_arcs; - semantic_parts->GetOffsetArc(&offset, &num_arcs); - best_labeled_parts->resize(num_arcs); - for (int r = 0; r < num_arcs; ++r) { - SemanticPartArc *arc = - static_cast((*parts)[offset + r]); - const vector &index_labeled_parts = - semantic_parts->FindLabeledArcs(arc->predicate(), - arc->argument(), - arc->sense()); - // Find the best label for each candidate arc. - int best_label = -1; - double best_score; - for (int k = 0; k < index_labeled_parts.size(); ++k) { - if (best_label < 0 || - scores[index_labeled_parts[k]] > best_score) { - best_label = index_labeled_parts[k]; - best_score = scores[best_label]; - } - } - (*best_labeled_parts)[r] = best_label; - } -} - -// Decode the label marginals for each candidate arc. 
The output vector -// total_scores contains the sum of exp-scores (over the labels) for each arc; -// label_marginals contains those marginals ignoring the tree constraint. -void SemanticDecoder::DecodeLabelMarginals(Instance *instance, Parts *parts, - const vector &scores, - vector *total_scores, - vector *label_marginals) { - SemanticParts *semantic_parts = static_cast(parts); - - int offset, num_arcs; - int offset_labeled, num_labeled_arcs; - semantic_parts->GetOffsetArc(&offset, &num_arcs); - semantic_parts->GetOffsetLabeledArc(&offset_labeled, &num_labeled_arcs); - total_scores->clear(); - total_scores->resize(num_arcs, 0.0); - label_marginals->clear(); - label_marginals->resize(num_labeled_arcs, 0.0); - - for (int r = 0; r < num_arcs; ++r) { - SemanticPartArc *arc = - static_cast((*parts)[offset + r]); - const vector &index_labeled_parts = - semantic_parts->FindLabeledArcs(arc->predicate(), - arc->argument(), - arc->sense()); - // Find the best label for each candidate arc. - LogValD total_score = LogValD::Zero(); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - total_score += LogValD(scores[index_labeled_parts[k]], false); - } - (*total_scores)[r] = total_score.logabs(); - double sum = 0.0; - for (int k = 0; k < index_labeled_parts.size(); ++k) { - LogValD marginal = - LogValD(scores[index_labeled_parts[k]], false) / total_score; - (*label_marginals)[index_labeled_parts[k] - offset_labeled] = - marginal.as_float(); - sum += marginal.as_float(); - } - if (!NEARLY_EQ_TOL(sum, 1.0, 1e-9)) { - LOG(INFO) << "Label marginals don't sum to one: sum = " << sum; - } - } -} - -void SemanticDecoder::Decode(Instance *instance, Parts *parts, - const vector &scores, - vector *predicted_output) { - SemanticParts *semantic_parts = static_cast(parts); - - // Create copy of the scores. 
- vector copied_scores(scores); - vector best_labeled_parts; - int offset_labeled_arcs, num_labeled_arcs; - semantic_parts->GetOffsetLabeledArc(&offset_labeled_arcs, - &num_labeled_arcs); - int offset_arcs, num_arcs; - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - int offset_predicate_parts, num_predicate_parts; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - - bool labeled_decoding = false; - // TODO: change this test. - int offset_labeled_siblings, num_labeled_siblings; - semantic_parts->GetOffsetLabeledSibling(&offset_labeled_siblings, - &num_labeled_siblings); - if (num_labeled_siblings > 0) labeled_decoding = true; - if (pipe_->GetSemanticOptions()->deterministic_labels()) { - labeled_decoding = true; - } - - if (labeled_decoding) { - predicted_output->clear(); - predicted_output->resize(parts->size(), 0.0); - - DecodeFactorGraph(instance, parts, copied_scores, labeled_decoding, true, - predicted_output); - - // At test time, run a basic decoder on top of the outcome of AD3 - // as a rounding heuristic to make sure we get a valid graph. - // TODO: maybe change the interface to AD3 to let us implement - // a primal rounding heuristic. - if (pipe_->GetSemanticOptions()->test()) { - double threshold = 0.5; - for (int r = 0; r < num_arcs; ++r) { - copied_scores[offset_arcs + r] = 0.0; - } - for (int r = 0; r < num_predicate_parts; ++r) { - copied_scores[offset_predicate_parts + r] = 0.0; - } - for (int r = 0; r < num_labeled_arcs; ++r) { - copied_scores[offset_labeled_arcs + r] = - (*predicted_output)[offset_labeled_arcs + r] - threshold; - } - - DecodeLabels(instance, parts, copied_scores, &best_labeled_parts); - for (int r = 0; r < best_labeled_parts.size(); ++r) { - // Sum the "labeled" scores to the (eventually) already existing - // "unlabeled" scores. 
- copied_scores[offset_arcs + r] += copied_scores[best_labeled_parts[r]]; - } - - double value; - predicted_output->assign(parts->size(), 0.0); - DecodeBasic(instance, parts, copied_scores, predicted_output, &value); - - // Write the components of the predicted output that - // correspond to the labeled parts. - for (int r = 0; r < num_arcs; ++r) { - CHECK_GE(best_labeled_parts[r], offset_arcs + num_arcs); - (*predicted_output)[best_labeled_parts[r]] = - (*predicted_output)[offset_arcs + r]; - } - } - } else { - // If labeled parsing, decode the labels and update the scores. - if (pipe_->GetSemanticOptions()->labeled()) { - DecodeLabels(instance, parts, copied_scores, &best_labeled_parts); - for (int r = 0; r < best_labeled_parts.size(); ++r) { - // Sum the "labeled" scores to the (eventually) already existing - // "unlabeled" scores. - copied_scores[offset_arcs + r] += copied_scores[best_labeled_parts[r]]; - } - } - - predicted_output->clear(); - predicted_output->resize(parts->size(), 0.0); - - if (semantic_parts->IsArcFactored() || - semantic_parts->IsLabeledArcFactored()) { - double value; - DecodeBasic(instance, parts, copied_scores, predicted_output, &value); - } else { - DecodeFactorGraph(instance, parts, copied_scores, labeled_decoding, true, - predicted_output); - - // At test time, run a basic decoder on top of the outcome of AD3 - // as a rounding heuristic to make sure we get a valid graph. - // TODO: maybe change the interface to AD3 to let us implement - // a primal rounding heuristic. - if (pipe_->GetSemanticOptions()->test()) { - double threshold = 0.5; - for (int r = 0; r < num_arcs; ++r) { - copied_scores[offset_arcs + r] = - (*predicted_output)[offset_arcs + r] - threshold; - } - for (int r = 0; r < num_predicate_parts; ++r) { - copied_scores[offset_predicate_parts + r] = 0.0; - } - // This is not strictly necessary (since labeled arcs are not used by - // DecodeBasic), but should not harm. 
- for (int r = 0; r < num_labeled_arcs; ++r) { - copied_scores[offset_labeled_arcs + r] = 0.0; - } - predicted_output->assign(parts->size(), 0.0); - double value; - DecodeBasic(instance, parts, copied_scores, predicted_output, &value); - } - } - - // If labeled parsing, write the components of the predicted output that - // correspond to the labeled parts. - if (pipe_->GetSemanticOptions()->labeled()) { - for (int r = 0; r < num_arcs; ++r) { - CHECK_GE(best_labeled_parts[r], offset_arcs + num_arcs); - (*predicted_output)[best_labeled_parts[r]] = - (*predicted_output)[offset_arcs + r]; - } - } - } -} - -// Build predicate and arc indices. -void SemanticDecoder::BuildBasicIndices( - int sentence_length, - const vector &predicate_parts, - const vector &arcs, - vector > *index_predicates, - vector > > *arcs_by_predicate) { - int num_arcs = arcs.size(); - int num_predicate_parts = predicate_parts.size(); - - arcs_by_predicate->assign(sentence_length, vector >()); - for (int r = 0; r < num_arcs; ++r) { - int p = arcs[r]->predicate(); - int s = arcs[r]->sense(); - if (s >= (*arcs_by_predicate)[p].size()) { - (*arcs_by_predicate)[p].resize(s + 1); - } - (*arcs_by_predicate)[p][s].push_back(r); - } - - index_predicates->assign(sentence_length, vector()); - for (int r = 0; r < num_predicate_parts; ++r) { - CHECK_LT(r, predicate_parts.size()); - int p = predicate_parts[r]->predicate(); - int s = predicate_parts[r]->sense(); - if (s >= (*index_predicates)[p].size()) { - (*index_predicates)[p].resize(s + 1, -1); - } - (*index_predicates)[p][s] = r; - } -} - -void SemanticDecoder::DecodePruner(Instance *instance, Parts *parts, - const vector &scores, - vector *predicted_output) { - //DecodePrunerNaive(instance, parts, scores, predicted_output); - //return; - - int sentence_length = - static_cast(instance)->size(); - SemanticParts *semantic_parts = static_cast(parts); - double posterior_threshold = - pipe_->GetSemanticOptions()->GetPrunerPosteriorThreshold(); - int max_arguments 
= pipe_->GetSemanticOptions()->GetPrunerMaxArguments(); - if (max_arguments < 0) max_arguments = sentence_length; - - int offset_predicate_parts, num_predicate_parts; - int offset_arcs, num_arcs; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - - vector arcs(num_arcs); - vector scores_arcs(num_arcs); - for (int r = 0; r < num_arcs; ++r) { - arcs[r] = static_cast((*parts)[offset_arcs + r]); - scores_arcs[r] = scores[offset_arcs + r]; - } - - predicted_output->clear(); - predicted_output->resize(parts->size(), 0.0); - - CHECK(semantic_parts->IsArcFactored()); - - vector > > arcs_by_predicate; - arcs_by_predicate.resize(sentence_length); - for (int r = 0; r < num_arcs; ++r) { - int p = arcs[r]->predicate(); - int s = arcs[r]->sense(); - if (s >= arcs_by_predicate[p].size()) { - arcs_by_predicate[p].resize(s + 1); - } - arcs_by_predicate[p][s].push_back(r); - } - - double entropy; - double log_partition_function; - vector posteriors; - DecodeBasicMarginals(instance, parts, scores, &posteriors, - &log_partition_function, &entropy); - - // Get max_arguments argumens per predicate. 
- int num_used_parts = 0; - for (int p = 0; p < sentence_length; ++p) { - for (int s = 0; s < arcs_by_predicate[p].size(); ++s) { - vector > scores_arguments; - for (int a = 1; a < sentence_length; ++a) { - int r = semantic_parts->FindArc(p, a, s); - if (r < 0) continue; - scores_arguments.push_back(pair(-posteriors[r], r)); - } - if (scores_arguments.size() == 0) continue; - sort(scores_arguments.begin(), scores_arguments.end()); - double max_posterior = 1.0; // -scores_arguments[0].first; - for (int k = 0; k < max_arguments && k < scores_arguments.size(); ++k) { - int r = scores_arguments[k].second; - if (-scores_arguments[k].first >= posterior_threshold * max_posterior) { - ++num_used_parts; - (*predicted_output)[r] = 1.0; - } else { - break; - } - } - } - } - - VLOG(2) << "Pruning reduced to " - << static_cast(num_used_parts) / - static_cast(sentence_length) - << " candidate heads per word."; -} - -void SemanticDecoder::DecodePrunerNaive(Instance *instance, Parts *parts, - const vector &scores, - vector *predicted_output) { - int sentence_length = - static_cast(instance)->size(); - SemanticParts *semantic_parts = static_cast(parts); - int max_arguments = pipe_->GetSemanticOptions()->GetPrunerMaxArguments(); - int offset_predicate_parts, num_predicate_parts; - int offset_arcs, num_arcs; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - - vector arcs(num_arcs); - vector scores_arcs(num_arcs); - for (int r = 0; r < num_arcs; ++r) { - arcs[r] = static_cast((*parts)[offset_arcs + r]); - scores_arcs[r] = scores[offset_arcs + r]; - } - - predicted_output->clear(); - predicted_output->resize(parts->size(), 0.0); - for (int r = 0; r < num_predicate_parts; ++r) { - // Don't prune any of the predicate parts. 
- (*predicted_output)[offset_predicate_parts + r] = 1.0; - } - - CHECK(semantic_parts->IsArcFactored()); - - vector > > arcs_by_predicate; - arcs_by_predicate.resize(sentence_length); - for (int r = 0; r < num_arcs; ++r) { - int p = arcs[r]->predicate(); - int s = arcs[r]->sense(); - if (s >= arcs_by_predicate[p].size()) { - arcs_by_predicate[p].resize(s + 1); - } - arcs_by_predicate[p][s].push_back(r); - } - - // Get max_arguments argumens per predicate. - for (int p = 0; p < sentence_length; ++p) { - for (int s = 0; s < arcs_by_predicate[p].size(); ++s) { - vector > scores_arguments; - for (int a = 1; a < sentence_length; ++a) { - int r = semantic_parts->FindArc(p, a, s); - if (r < 0) continue; - scores_arguments.push_back(pair(-scores[r], r)); - } - sort(scores_arguments.begin(), scores_arguments.end()); - for (int k = 0; k < max_arguments && k < scores_arguments.size(); ++k) { - int r = scores_arguments[k].second; - (*predicted_output)[r] = 1.0; - //LOG(INFO) << "Keeping arc (" << p << ", " - // << s << ", " - // << static_cast((*parts)[r])->argument() - // << ")."; - } - } - } -} - -// Decoder for the basic model. For each predicate, choose the best -// sense and the best set of arcs independently. 
-void SemanticDecoder::DecodeBasic(Instance *instance, Parts *parts, - const vector &scores, - vector *predicted_output, - double *value) { - int sentence_length = - static_cast(instance)->size(); - SemanticParts *semantic_parts = static_cast(parts); - int offset_predicate_parts, num_predicate_parts; - int offset_arcs, num_arcs; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - - vector arcs(num_arcs); - vector scores_arcs(num_arcs); - for (int r = 0; r < num_arcs; ++r) { - arcs[r] = static_cast((*parts)[offset_arcs + r]); - scores_arcs[r] = scores[offset_arcs + r]; - } - - vector predicate_parts(num_predicate_parts); - vector scores_predicates(num_predicate_parts); - for (int r = 0; r < num_predicate_parts; ++r) { - predicate_parts[r] = - static_cast((*parts)[offset_predicate_parts + r]); - scores_predicates[r] = scores[offset_predicate_parts + r]; - } - - vector > > arcs_by_predicate; - vector > index_predicates; - vector selected_predicates; - vector selected_arcs; - BuildBasicIndices(sentence_length, predicate_parts, arcs, &index_predicates, - &arcs_by_predicate); - - DecodeSemanticGraph(sentence_length, predicate_parts, arcs, index_predicates, - arcs_by_predicate, scores_predicates, scores_arcs, - &selected_predicates, &selected_arcs, value); - - predicted_output->resize(parts->size()); - for (int r = 0; r < num_predicate_parts; ++r) { - if (selected_predicates[r]) { - (*predicted_output)[offset_predicate_parts + r] = 1.0; - } else { - (*predicted_output)[offset_predicate_parts + r] = 0.0; - } - } - for (int r = 0; r < num_arcs; ++r) { - if (selected_arcs[r]) { - (*predicted_output)[offset_arcs + r] = 1.0; - } else { - (*predicted_output)[offset_arcs + r] = 0.0; - } - } - -#if 0 - arcs_by_predicate.resize(sentence_length); - for (int r = 0; r < num_arcs; ++r) { - int p = arcs[r]->predicate(); - int s = arcs[r]->sense(); - if (s >= arcs_by_predicate[p].size()) { - 
arcs_by_predicate[p].resize(s + 1); - } - arcs_by_predicate[p][s].push_back(r); - } - - vector scores_predicates(num_predicate_parts); - //vector > index_predicates(sentence_length); - for (int r = 0; r < num_predicate_parts; ++r) { - scores_predicates[r] = scores[offset_predicate_parts + r]; - SemanticPartPredicate *predicate_part = - static_cast((*parts)[offset_predicate_parts + r]); - int p = predicate_part->predicate(); - int s = predicate_part->sense(); - if (s >= index_predicates[p].size()) { - index_predicates[p].resize(s + 1, -1); - } - index_predicates[p][s] = r; - } - - predicted_output->resize(parts->size()); - for (int r = 0; r < num_predicate_parts; ++r) { - (*predicted_output)[offset_predicate_parts + r] = 0.0; - } - for (int r = 0; r < num_arcs; ++r) { - (*predicted_output)[offset_arcs + r] = 0.0; - } - - double total_score = 0.0; - for (int p = 0; p < sentence_length; ++p) { - int best_sense = -1; - double best_score = 0.0; - vector > selected_arcs; - selected_arcs.resize(arcs_by_predicate[p].size()); - for (int s = 0; s < arcs_by_predicate[p].size(); ++s) { - // Compute the best assignment of arcs departing from this - // predicate word. 
- selected_arcs[s].resize(arcs_by_predicate[p][s].size()); - int r = index_predicates[p][s]; - double score = scores_predicates[r]; - for (int k = 0; k < arcs_by_predicate[p][s].size(); ++k) { - int r = arcs_by_predicate[p][s][k]; - if (scores_arcs[r] > 0.0) { - selected_arcs[s][k] = true; - score += scores_arcs[r]; - } else { - selected_arcs[s][k] = false; - } - } - if (score > best_score) { - best_sense = s; - best_score = score; - } - } - if (best_sense >= 0) { - total_score += best_score; - int s = best_sense; - int r = index_predicates[p][s]; - CHECK_GE(r, 0); - (*predicted_output)[offset_predicate_parts + r] = 1.0; - for (int k = 0; k < arcs_by_predicate[p][s].size(); ++k) { - if (!selected_arcs[s][k]) continue; - int r = arcs_by_predicate[p][s][k]; - (*predicted_output)[offset_arcs + r] = 1.0; - //LOG(INFO) << "Selected arc " - // << arcs[r]->predicate() << " " - // << arcs[r]->argument() << " " - // << arcs[r]->sense(); - } - } - } -#endif -} - -// Decoder for the basic model. For each predicate, choose the best -// sense and the best set of arcs independently. 
-void SemanticDecoder::DecodeBasicMarginals(Instance *instance, Parts *parts, - const vector &scores, - vector *predicted_output, - double *log_partition_function, - double *entropy) { - int sentence_length = - static_cast(instance)->size(); - SemanticParts *semantic_parts = static_cast(parts); - int offset_predicate_parts, num_predicate_parts; - int offset_arcs, num_arcs; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - - vector arcs(num_arcs); - vector scores_arcs(num_arcs); - for (int r = 0; r < num_arcs; ++r) { - arcs[r] = static_cast((*parts)[offset_arcs + r]); - scores_arcs[r] = scores[offset_arcs + r]; - } - - vector > > arcs_by_predicate; - arcs_by_predicate.resize(sentence_length); - for (int r = 0; r < num_arcs; ++r) { - int p = arcs[r]->predicate(); - int s = arcs[r]->sense(); - if (s >= arcs_by_predicate[p].size()) { - arcs_by_predicate[p].resize(s + 1); - } - arcs_by_predicate[p][s].push_back(r); - } - - vector scores_predicates(num_predicate_parts); - vector > index_predicates(sentence_length); - for (int r = 0; r < num_predicate_parts; ++r) { - scores_predicates[r] = scores[offset_predicate_parts + r]; - SemanticPartPredicate *predicate_part = - static_cast((*parts)[offset_predicate_parts + r]); - int p = predicate_part->predicate(); - int s = predicate_part->sense(); - if (s >= index_predicates[p].size()) { - index_predicates[p].resize(s + 1, -1); - } - index_predicates[p][s] = r; - } - - predicted_output->resize(parts->size()); - for (int r = 0; r < num_predicate_parts; ++r) { - (*predicted_output)[offset_predicate_parts + r] = 0.0; - } - for (int r = 0; r < num_arcs; ++r) { - (*predicted_output)[offset_arcs + r] = 0.0; - } - - *log_partition_function = 0.0; - *entropy = 0.0; - for (int p = 0; p < sentence_length; ++p) { - // Initiliaze log partition all senses to exp(0.0) to account for - // the null sense which implies there are no outgoing arcs. 
- LogValD log_partition_all_senses = LogValD::One(); - //LogValD log_partition_all_senses = LogValD::Zero(); - vector log_partition_senses(arcs_by_predicate[p].size(), - LogValD::Zero()); - vector > log_partition_arcs(arcs_by_predicate[p].size()); - for (int s = 0; s < arcs_by_predicate[p].size(); ++s) { - int r = index_predicates[p][s]; - double score = scores_predicates[r]; - //CHECK_EQ(score, 0.0); - // Initialize log partition arcs to exp(0.0) to account for the - // event that the arc does not exist. - log_partition_arcs[s].assign(arcs_by_predicate[p][s].size(), - LogValD::One()); - for (int k = 0; k < arcs_by_predicate[p][s].size(); ++k) { - int r = arcs_by_predicate[p][s][k]; - log_partition_arcs[s][k] += LogValD(scores_arcs[r], false); - //score += log_partition_arcs[s][k].as_float(); - score += log_partition_arcs[s][k].logabs(); - } - //LOG(INFO) << s << " " << score; - log_partition_senses[s] = LogValD(score, false); - //log_partition_senses[s] = LogValD(score); - //LOG(INFO) << s << " " << log_partition_senses[s].logabs(); - log_partition_all_senses += log_partition_senses[s]; - } - - // This makes sure the log partition function does not become -infty for - // predicates that do not have any sense. 
- if (arcs_by_predicate[p].size() > 0) { - if (false && sentence_length < 5) { - LOG(INFO) << "Log partition[" << p << "] = " - << log_partition_all_senses.logabs(); - } - *log_partition_function += log_partition_all_senses.logabs(); - } - - for (int s = 0; s < arcs_by_predicate[p].size(); ++s) { - int r = index_predicates[p][s]; - double predicate_marginal = LogValD(log_partition_senses[s].logabs() - - log_partition_all_senses.logabs(), - false).as_float(); - //LOG(INFO) << "Predicate marginal[" << p << "][" << s << "] = " - // << predicate_marginal; - (*predicted_output)[offset_predicate_parts + r] = predicate_marginal; - //CHECK_EQ(scores_predicates[r], 0.0); - *entropy -= scores_predicates[r] * predicate_marginal; - for (int k = 0; k < arcs_by_predicate[p][s].size(); ++k) { - int r = arcs_by_predicate[p][s][k]; - double marginal = LogValD(scores_arcs[r] - - log_partition_arcs[s][k].logabs(), - false).as_float(); - marginal *= predicate_marginal; - if (false && sentence_length < 5) { - LOG(INFO) << "marginal[" << p << "][" << s << "][" << k << "] = " - << marginal << "\t" - << "scores_arcs[" << p << "][" << s << "][" << k << "] = " - << scores_arcs[r]; - } - (*predicted_output)[offset_arcs + r] = marginal; - *entropy -= scores_arcs[r] * marginal; - } - } - } - - *entropy += *log_partition_function; - if (false && sentence_length < 5) { - LOG(INFO) << "Log-partition function:" << *log_partition_function; - LOG(INFO) << "Entropy:" << *entropy; - } -} - -// Decoder for the basic model. For each predicate, choose the best -// sense and the best set of arcs independently. 
-void SemanticDecoder::DecodeSemanticGraph( - int sentence_length, - const vector &predicate_parts, - const vector &arcs, - const vector > &index_predicates, - const vector > > &arcs_by_predicate, - const vector &predicate_scores, - const vector &arc_scores, - vector *selected_predicates, - vector *selected_arcs, - double *value) { - int num_predicate_parts = predicate_parts.size(); - int num_arcs = arcs.size(); - - selected_predicates->assign(num_predicate_parts, false); - selected_arcs->assign(num_arcs, false); - - double total_score = 0.0; - for (int p = 0; p < sentence_length; ++p) { - int best_sense = -1; - double best_score = 0.0; - vector > selected; - selected.resize(arcs_by_predicate[p].size()); - for (int s = 0; s < arcs_by_predicate[p].size(); ++s) { - // Compute the best assignment of arcs departing from this - // predicate word. - selected[s].resize(arcs_by_predicate[p][s].size()); - int r = index_predicates[p][s]; - double score = predicate_scores[r]; - for (int k = 0; k < arcs_by_predicate[p][s].size(); ++k) { - int r = arcs_by_predicate[p][s][k]; - if (arc_scores[r] > 0.0) { - selected[s][k] = true; - score += arc_scores[r]; - } else { - selected[s][k] = false; - } - } - // Note: we're allowing a non-null sense (!= -1) without outgoing arcs. - if (score > best_score) { - best_sense = s; - best_score = score; - } - } - if (best_sense >= 0) { - total_score += best_score; - int s = best_sense; - int r = index_predicates[p][s]; - CHECK_GE(r, 0); - (*selected_predicates)[r] = true; - for (int k = 0; k < arcs_by_predicate[p][s].size(); ++k) { - if (!selected[s][k]) continue; - int r = arcs_by_predicate[p][s][k]; - (*selected_arcs)[r] = true; - } - } - } - *value = total_score; -} - -// Decode building a factor graph and calling the AD3 algorithm. 
-void SemanticDecoder::DecodeFactorGraph(Instance *instance, Parts *parts, - const vector &scores, - bool labeled_decoding, - bool relax, - vector *predicted_output) { - SemanticParts *semantic_parts = static_cast(parts); - SemanticInstanceNumeric* sentence = - static_cast(instance); - CHECK(relax); - - // Get the offsets for the different parts. - int offset_predicate_parts, num_predicate_parts; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - int offset_arcs, num_arcs; - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - int offset_labeled_arcs, num_labeled_arcs; - semantic_parts->GetOffsetLabeledArc(&offset_labeled_arcs, - &num_labeled_arcs); - int offset_siblings, num_siblings; - semantic_parts->GetOffsetSibling(&offset_siblings, &num_siblings); - int offset_labeled_siblings, num_labeled_siblings; - semantic_parts->GetOffsetLabeledSibling(&offset_labeled_siblings, - &num_labeled_siblings); - int offset_consecutive_siblings, num_consecutive_siblings; - semantic_parts->GetOffsetConsecutiveSibling(&offset_consecutive_siblings, - &num_consecutive_siblings); - int offset_grandparents, num_grandparents; - semantic_parts->GetOffsetGrandparent(&offset_grandparents, &num_grandparents); - int offset_coparents, num_coparents; - semantic_parts->GetOffsetCoparent(&offset_coparents, &num_coparents); - int offset_consecutive_coparents, num_consecutive_coparents; - semantic_parts->GetOffsetConsecutiveCoparent(&offset_consecutive_coparents, - &num_consecutive_coparents); - -#if 0 - int offset_grandsiblings, num_grandsiblings; - semantic_parts->GetOffsetGrandSibling(&offset_grandsiblings, &num_grandsiblings); - int offset_trisiblings, num_trisiblings; - dependency_parts->GetOffsetTriSibling(&offset_trisiblings, &num_trisiblings); -#endif - - // Define what parts are used. 
- bool use_arbitrary_sibling_parts = (num_siblings > 0); - bool use_labeled_arbitrary_sibling_parts = (num_labeled_siblings > 0); - bool use_consecutive_sibling_parts = (num_consecutive_siblings > 0); - bool use_grandparent_parts = (num_grandparents > 0); - bool use_coparent_parts = (num_coparents > 0); - bool use_consecutive_coparent_parts = (num_consecutive_coparents > 0); -#if 0 - bool use_grandsibling_parts = (num_grandsiblings > 0); - bool use_trisibling_parts = (num_trisiblings > 0); -#endif - - if (!labeled_decoding) { - CHECK_EQ(num_labeled_siblings, 0); - CHECK(!pipe_->GetSemanticOptions()->deterministic_labels()); - } - - // Variables of the factor graph. - vector variables; - - // Indices that allow to identify the part corresponding to each variable. - vector part_indices_; - vector additional_part_indices; - vector factor_part_indices_; - - // Create factor graph. - AD3::FactorGraph *factor_graph = new AD3::FactorGraph; - int verbosity = 1; //1; - if (VLOG_IS_ON(2)) { - verbosity = 2; - } - factor_graph->SetVerbosity(verbosity); - - // Build predicate part variables. - int offset_predicate_variables = variables.size(); - for (int r = 0; r < num_predicate_parts; ++r) { - AD3::BinaryVariable* variable = factor_graph->CreateBinaryVariable(); - variable->SetLogPotential(scores[offset_predicate_parts + r]); - variables.push_back(variable); - part_indices_.push_back(offset_predicate_parts + r); - } - - // Build arc variables. - int offset_arc_variables = variables.size(); - for (int r = 0; r < num_arcs; ++r) { - AD3::BinaryVariable* variable = factor_graph->CreateBinaryVariable(); - variable->SetLogPotential(scores[offset_arcs + r]); - variables.push_back(variable); - part_indices_.push_back(offset_arcs + r); - } - - int offset_labeled_arc_variables = variables.size(); - if (labeled_decoding) { - // Build labeled arc variables. 
- for (int r = 0; r < num_labeled_arcs; ++r) { - AD3::BinaryVariable* variable = factor_graph->CreateBinaryVariable(); - variable->SetLogPotential(scores[offset_labeled_arcs + r]); - variables.push_back(variable); - part_indices_.push_back(offset_labeled_arcs + r); - } - } - - // Build basic semantic graph factor. - vector local_variables(num_predicate_parts + num_arcs); - vector predicate_parts(num_predicate_parts); - for (int r = 0; r < num_predicate_parts; ++r) { - local_variables[r] = variables[offset_predicate_variables + r]; - predicate_parts[r] = - static_cast((*parts)[offset_predicate_parts + r]); - } - vector arcs(num_arcs); - for (int r = 0; r < num_arcs; ++r) { - local_variables[num_predicate_parts + r] = - variables[offset_arc_variables + r]; - arcs[r] = static_cast((*parts)[offset_arcs + r]); - } - AD3::FactorSemanticGraph *factor = new AD3::FactorSemanticGraph; - factor->Initialize(sentence->size(), predicate_parts, arcs, this); - factor_graph->DeclareFactor(factor, local_variables, true); - factor_part_indices_.push_back(-1); - - if (labeled_decoding) { - // Build XOR-OUT factors to impose that each arc has a unique label. 
- for (int r = 0; r < num_arcs; ++r) { - const vector &index_labeled_parts = - semantic_parts->GetLabeledParts(offset_arcs + r); - vector - local_variables(index_labeled_parts.size() + 1); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - int index_part = index_labeled_parts[k]; - CHECK_GE(index_part, 0); - CHECK_LT(offset_labeled_arc_variables + index_part - offset_labeled_arcs, - variables.size()); - local_variables[k] = variables[offset_labeled_arc_variables + - index_part - offset_labeled_arcs]; - } - CHECK_GE(offset_arc_variables + r, 0); - CHECK_LT(offset_arc_variables + r, variables.size()); - local_variables[index_labeled_parts.size()] = - variables[offset_arc_variables + r]; - factor_graph->CreateFactorXOROUT(local_variables); - factor_part_indices_.push_back(-1); - } - - // If some labels are deterministic, make sure the same role is not filled - // more than once for each predicate. - if (pipe_->GetSemanticOptions()->deterministic_labels()) { - int sentence_length = - static_cast(instance)->size(); - vector > > - labeled_arcs_by_predicate_role(sentence_length); - for (int r = 0; r < num_labeled_arcs; ++r) { - SemanticPartLabeledArc *labeled_arc = - static_cast( - (*semantic_parts)[offset_labeled_arcs + r]); - int p = labeled_arc->predicate(); - int l = labeled_arc->role(); - // Skip if role l is not deterministic. 
- if (!pipe_->GetSemanticDictionary()->IsRoleDeterministic(l)) continue; - if (labeled_arcs_by_predicate_role[p].size() <= l) { - labeled_arcs_by_predicate_role[p].resize(l + 1); - } - labeled_arcs_by_predicate_role[p][l].push_back(r); - } - for (int p = 0; p < labeled_arcs_by_predicate_role.size(); ++p) { - for (int l = 0; l < labeled_arcs_by_predicate_role[p].size(); ++l) { - if (labeled_arcs_by_predicate_role[p][l].size() <= 1) continue; - vector - local_variables(labeled_arcs_by_predicate_role[p][l].size()); - for (int k = 0; k < labeled_arcs_by_predicate_role[p][l].size(); - ++k) { - int r = labeled_arcs_by_predicate_role[p][l][k]; - local_variables[k] = variables[offset_labeled_arc_variables + r]; - } - factor_graph->CreateFactorAtMostOne(local_variables); - factor_part_indices_.push_back(-1); - } - } - } - } - - ////////////////////////////////////////////////////////////////////// - // Build sibling factors. - ////////////////////////////////////////////////////////////////////// - if (use_arbitrary_sibling_parts) { - for (int r = 0; r < num_siblings; ++r) { - SemanticPartSibling *part = static_cast( - (*semantic_parts)[offset_siblings + r]); - int r1 = semantic_parts->FindArc(part->predicate(), - part->first_argument(), - part->sense()); - int r2 = semantic_parts->FindArc(part->predicate(), - part->second_argument(), - part->sense()); - CHECK_GE(r1, 0); - CHECK_GE(r2, 0); - vector local_variables; - local_variables.push_back(variables[r1 - offset_arcs + - offset_arc_variables]); - local_variables.push_back(variables[r2 - offset_arcs + - offset_arc_variables]); - - factor_graph->CreateFactorPAIR(local_variables, - scores[offset_siblings + r]); - // TODO: set these global indices at the end after all variables/factors - // are created. 
- //factor->SetGlobalIndex(...); - additional_part_indices.push_back(offset_siblings + r); - factor_part_indices_.push_back(offset_siblings + r); - } - } - - ////////////////////////////////////////////////////////////////////// - // Build labeled sibling factors. - ////////////////////////////////////////////////////////////////////// - if (use_labeled_arbitrary_sibling_parts) { - CHECK(labeled_decoding); - for (int r = 0; r < num_labeled_siblings; ++r) { - SemanticPartLabeledSibling *part = - static_cast( - (*semantic_parts)[offset_labeled_siblings + r]); - int r1 = semantic_parts->FindLabeledArc(part->predicate(), - part->first_argument(), - part->sense(), - part->first_role()); - int r2 = semantic_parts->FindLabeledArc(part->predicate(), - part->second_argument(), - part->sense(), - part->second_role()); - CHECK_GE(r1, 0); - CHECK_GE(r2, 0); - vector local_variables; - local_variables.push_back(variables[r1 - offset_labeled_arcs + - offset_labeled_arc_variables]); - local_variables.push_back(variables[r2 - offset_labeled_arcs + - offset_labeled_arc_variables]); - - factor_graph->CreateFactorPAIR(local_variables, - scores[offset_labeled_siblings + r]); - // TODO: set these global indices at the end after all variables/factors - // are created. - //factor->SetGlobalIndex(...); - additional_part_indices.push_back(offset_labeled_siblings + r); - factor_part_indices_.push_back(offset_labeled_siblings + r); - } - } - - ////////////////////////////////////////////////////////////////////// - // Build consecutive sibling factors. - ////////////////////////////////////////////////////////////////////// - if (use_consecutive_sibling_parts) { - // Get all the consecutive siblings, indices, etc. - vector > predicate_part_indices(sentence->size()); - for (int r = 0; r < num_predicate_parts; ++r) { - SemanticPartPredicate* predicate_part = - static_cast( - (*parts)[offset_predicate_parts + r]); - predicate_part_indices[predicate_part->predicate()]. 
- push_back(offset_predicate_parts + r); - } - vector > left_arc_indices(sentence->size()); - vector > right_arc_indices(sentence->size()); - for (int r = 0; r < num_arcs; ++r) { - SemanticPartArc* arc = - static_cast((*parts)[offset_arcs + r]); - // Handle self-loops (p=a) in the right side automaton. - if (arc->predicate() > arc->argument()) { - left_arc_indices[arc->predicate()].push_back(offset_arcs + r); - } else { - right_arc_indices[arc->predicate()].push_back(offset_arcs + r); - } - } - vector > - left_siblings(sentence->size()); - vector > - right_siblings(sentence->size()); - vector > left_scores(sentence->size()); - vector > right_scores(sentence->size()); - vector > left_indices(sentence->size()); - vector > right_indices(sentence->size()); - for (int r = 0; r < num_consecutive_siblings; ++r) { - SemanticPartConsecutiveSibling *sibling = - static_cast( - (*parts)[offset_consecutive_siblings + r]); - // TODO: Try to disable self loops on the left side? - // Make sure no non-basic sibling part ends up in two - // factors. - if (sibling->predicate() > sibling->second_argument()) { - // Left sibling. - left_siblings[sibling->predicate()].push_back(sibling); - left_scores[sibling->predicate()].push_back( - scores[offset_consecutive_siblings + r]); - // Save the part index to get the posterior later. - left_indices[sibling->predicate()]. - push_back(offset_consecutive_siblings + r); - } else { - CHECK(sibling->predicate() < sibling->second_argument() || - (sibling->predicate() == sibling->second_argument() && - sibling->first_argument() < 0)) - << sibling->predicate() << " " - << sibling->first_argument() << " " - << sibling->second_argument(); - // Right sibling. - right_siblings[sibling->predicate()].push_back(sibling); - right_scores[sibling->predicate()].push_back( - scores[offset_consecutive_siblings + r]); - // Save the part index to get the posterior later. - right_indices[sibling->predicate()]. 
- push_back(offset_consecutive_siblings + r); - } - } - - // Now, go through each predicate and create left and right automata. - for (int p = 0; p < sentence->size(); ++p) { - // Build left head automaton. - if (predicate_part_indices[p].size() == 0) { - CHECK_EQ(left_arc_indices[p].size(), 0); - CHECK_EQ(right_arc_indices[p].size(), 0); - CHECK_EQ(left_siblings[p].size(), 0); - CHECK_EQ(right_siblings[p].size(), 0); - continue; - } - vector local_variables; - vector predicate_senses; - vector left_arcs; - for (int s = 0; s < predicate_part_indices[p].size(); ++s) { - int r = predicate_part_indices[p][s]; - int index = offset_predicate_variables + r - offset_predicate_parts; - local_variables.push_back(variables[index]); - SemanticPartPredicate *predicate = - static_cast((*parts)[r]); - predicate_senses.push_back(predicate); - } - for (int k = 0; k < left_arc_indices[p].size(); ++k) { - int r = left_arc_indices[p][k]; - int index = offset_arc_variables + r - offset_arcs; - local_variables.push_back(variables[index]); - SemanticPartArc *arc = - static_cast((*parts)[r]); - left_arcs.push_back(arc); - } - - AD3::FactorPredicateAutomaton *factor = new AD3::FactorPredicateAutomaton; - factor->Initialize(false, predicate_senses, left_arcs, left_siblings[p]); - factor->SetAdditionalLogPotentials(left_scores[p]); - factor_graph->DeclareFactor(factor, local_variables, true); - factor_part_indices_.push_back(-1); - additional_part_indices.insert(additional_part_indices.end(), - left_indices[p].begin(), - left_indices[p].end()); - - // Build right head automaton. 
- local_variables.clear(); - predicate_senses.clear(); - vector right_arcs; - for (int s = 0; s < predicate_part_indices[p].size(); ++s) { - int r = predicate_part_indices[p][s]; - CHECK_GE(r, 0); - int index = offset_predicate_variables + r - offset_predicate_parts; - local_variables.push_back(variables[index]); - SemanticPartPredicate *predicate = - static_cast((*parts)[r]); - predicate_senses.push_back(predicate); - } - for (int k = 0; k < right_arc_indices[p].size(); ++k) { - int r = right_arc_indices[p][k]; - CHECK_GE(r, 0); - int index = offset_arc_variables + r - offset_arcs; - local_variables.push_back(variables[index]); - SemanticPartArc *arc = - static_cast((*parts)[r]); - right_arcs.push_back(arc); - } - - factor = new AD3::FactorPredicateAutomaton; - factor->Initialize(true, predicate_senses, right_arcs, right_siblings[p]); - factor->SetAdditionalLogPotentials(right_scores[p]); - factor_graph->DeclareFactor(factor, local_variables, true); - factor_part_indices_.push_back(-1); - additional_part_indices.insert(additional_part_indices.end(), - right_indices[p].begin(), - right_indices[p].end()); - } - } - - ////////////////////////////////////////////////////////////////////// - // Build consecutive co-parent factors. - ////////////////////////////////////////////////////////////////////// - if (use_consecutive_coparent_parts) { - // Get all the consecutive co-parents, indices, etc. - // Note: contrarily to the consecutive siblings, here a left arc is one - // whose predicate is on the left of the argument. 
- vector > left_arc_indices(sentence->size()); - vector > right_arc_indices(sentence->size()); - for (int r = 0; r < num_arcs; ++r) { - SemanticPartArc* arc = - static_cast((*parts)[offset_arcs + r]); - if (arc->predicate() < arc->argument()) { - left_arc_indices[arc->argument()].push_back(offset_arcs + r); - } else { - right_arc_indices[arc->argument()].push_back(offset_arcs + r); - } - } - vector > - left_coparents(sentence->size()); - vector > - right_coparents(sentence->size()); - vector > left_scores(sentence->size()); - vector > right_scores(sentence->size()); - vector > left_indices(sentence->size()); - vector > right_indices(sentence->size()); - for (int r = 0; r < num_consecutive_coparents; ++r) { - SemanticPartConsecutiveCoparent *coparent = - static_cast( - (*parts)[offset_consecutive_coparents + r]); - if (coparent->argument() > coparent->second_predicate()) { - // Left co-parent. - left_coparents[coparent->argument()].push_back(coparent); - left_scores[coparent->argument()].push_back( - scores[offset_consecutive_coparents + r]); - // Save the part index to get the posterior later. - left_indices[coparent->argument()]. - push_back(offset_consecutive_coparents + r); - } else { - CHECK(coparent->argument() < coparent->second_predicate() || - (coparent->argument() == coparent->second_predicate() && - coparent->first_predicate() < 0)); - // Right co-parent. - right_coparents[coparent->argument()].push_back(coparent); - right_scores[coparent->argument()].push_back( - scores[offset_consecutive_coparents + r]); - // Save the part index to get the posterior later. - right_indices[coparent->argument()]. - push_back(offset_consecutive_coparents + r); - } - } - - // Now, go through each argument and create left and right automata. - for (int a = 1; a < sentence->size(); ++a) { - // Build left argument automaton. 
- vector local_variables; - vector left_arcs; - for (int k = 0; k < left_arc_indices[a].size(); ++k) { - int r = left_arc_indices[a][k]; - int index = offset_arc_variables + r - offset_arcs; - local_variables.push_back(variables[index]); - SemanticPartArc *arc = - static_cast((*parts)[r]); - left_arcs.push_back(arc); - } - - AD3::FactorArgumentAutomaton *factor = new AD3::FactorArgumentAutomaton; - factor->Initialize(a, false, left_arcs, left_coparents[a]); - factor->SetAdditionalLogPotentials(left_scores[a]); - factor_graph->DeclareFactor(factor, local_variables, true); - factor_part_indices_.push_back(-1); - additional_part_indices.insert(additional_part_indices.end(), - left_indices[a].begin(), - left_indices[a].end()); - - // Build right head automaton. - local_variables.clear(); - vector right_arcs; - for (int k = 0; k < right_arc_indices[a].size(); ++k) { - int r = right_arc_indices[a][k]; - int index = offset_arc_variables + r - offset_arcs; - local_variables.push_back(variables[index]); - SemanticPartArc *arc = - static_cast((*parts)[r]); - right_arcs.push_back(arc); - } - - factor = new AD3::FactorArgumentAutomaton; - factor->Initialize(a, true, right_arcs, right_coparents[a]); - factor->SetAdditionalLogPotentials(right_scores[a]); - factor_graph->DeclareFactor(factor, local_variables, true); - factor_part_indices_.push_back(-1); - additional_part_indices.insert(additional_part_indices.end(), - right_indices[a].begin(), - right_indices[a].end()); - } - } - - ////////////////////////////////////////////////////////////////////// - // Build grandparent factors. 
- ////////////////////////////////////////////////////////////////////// - if (use_grandparent_parts) { - for (int r = 0; r < num_grandparents; ++r) { - SemanticPartGrandparent *part = static_cast( - (*semantic_parts)[offset_grandparents + r]); - int r1 = semantic_parts->FindArc(part->grandparent_predicate(), - part->predicate(), - part->grandparent_sense()); - int r2 = semantic_parts->FindArc(part->predicate(), - part->argument(), - part->sense()); - CHECK_GE(r1, 0); - CHECK_GE(r2, 0); - vector local_variables; - local_variables.push_back(variables[r1 - offset_arcs + - offset_arc_variables]); - local_variables.push_back(variables[r2 - offset_arcs + - offset_arc_variables]); - - factor_graph->CreateFactorPAIR(local_variables, - scores[offset_grandparents + r]); - // TODO: set these global indices at the end after all variables/factors - // are created. - //factor->SetGlobalIndex(...); - additional_part_indices.push_back(offset_grandparents + r); - factor_part_indices_.push_back(offset_grandparents + r); - } - } - - ////////////////////////////////////////////////////////////////////// - // Build co-parent factors. - ////////////////////////////////////////////////////////////////////// - if (use_coparent_parts) { - for (int r = 0; r < num_coparents; ++r) { - SemanticPartCoparent *part = static_cast( - (*semantic_parts)[offset_coparents + r]); - int r1 = semantic_parts->FindArc(part->first_predicate(), - part->argument(), - part->first_sense()); - int r2 = semantic_parts->FindArc(part->second_predicate(), - part->argument(), - part->second_sense()); - CHECK_GE(r1, 0); - CHECK_GE(r2, 0); - vector local_variables; - local_variables.push_back(variables[r1 - offset_arcs + - offset_arc_variables]); - local_variables.push_back(variables[r2 - offset_arcs + - offset_arc_variables]); - - factor_graph->CreateFactorPAIR(local_variables, - scores[offset_coparents + r]); - // TODO: set these global indices at the end after all variables/factors - // are created. 
- //factor->SetGlobalIndex(...); - additional_part_indices.push_back(offset_coparents + r); - factor_part_indices_.push_back(offset_coparents + r); - } - } - - ////////////////////////////////////////////////////////////////////////////// - - CHECK_EQ(variables.size(), part_indices_.size()); - CHECK_EQ(factor_graph->GetNumFactors(), factor_part_indices_.size()); - - // Compute additional_part_indices_. - int offset = factor_graph->GetNumVariables(); - for (int i = 0; i < factor_graph->GetNumFactors(); ++i) { - offset += factor_graph->GetFactor(i)->GetAdditionalLogPotentials().size(); - } - CHECK_EQ(additional_part_indices.size(), - offset - factor_graph->GetNumVariables()); - // Concatenate part_indices and additional_part_indices. - part_indices_.insert(part_indices_.end(), - additional_part_indices.begin(), - additional_part_indices.end()); - - VLOG(2) << "Number of factors: " << factor_graph->GetNumFactors(); - VLOG(2) << "Number of variables: " << factor_graph->GetNumVariables(); - -#if 0 - LOG(INFO) << "Number of factors: " << factor_graph->GetNumFactors(); - LOG(INFO) << "Number of variables: " << factor_graph->GetNumVariables(); - LOG(INFO) << "Number of siblings: " << num_siblings; - LOG(INFO) << "part_indices_.size() = " << part_indices_.size(); - LOG(INFO) << "additional_part_indices.size() = " << additional_part_indices.size(); - LOG(INFO) << "factor_part_indices_.size() = " << factor_part_indices_.size(); -#endif - - vector recomputed_indices(part_indices_.size(), -1); - bool solved = false; - - //#define PRINT_GRAPH -#ifdef PRINT_GRAPH - ofstream stream; - stream.open("tmp.fg", ofstream::out | ofstream::app); - CHECK(stream.good()); - factor_graph->Print(stream); - stream << endl; - stream.flush(); - stream.clear(); - stream.close(); -#endif - - vector posteriors; - vector additional_posteriors; - double value_ref; - double *value = &value_ref; - - //factor_graph->SetMaxIterationsAD3(2000); - factor_graph->SetMaxIterationsAD3(500); - 
factor_graph->SetEtaAD3(0.05); - factor_graph->AdaptEtaAD3(true); - factor_graph->SetResidualThresholdAD3(1e-3); - //factor_graph->SetResidualThresholdAD3(1e-6); - +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "SemanticDecoder.h" +#include "SemanticPart.h" +#include "SemanticPipe.h" +#include "AlgUtils.h" +#include +#include +#include "logval.h" +#include "ad3/FactorGraph.h" +#include "FactorSemanticGraph.h" +#include "FactorPredicateAutomaton.h" +#include "FactorArgumentAutomaton.h" + +// Define a matrix of doubles using Eigen. +typedef LogVal LogValD; +namespace Eigen { +typedef Eigen::Matrix MatrixXlogd; +} + +using namespace std; + +DEFINE_double(srl_train_cost_false_positives, 1.0, + "Cost for predicting false positives."); +DEFINE_double(srl_train_cost_false_negatives, 1.0, + "Cost for predicting false negatives."); + +void SemanticDecoder::DecodeCostAugmented(Instance *instance, Parts *parts, + const vector &scores, + const vector &gold_output, + vector *predicted_output, + double *cost, + double *loss) { + SemanticParts *semantic_parts = static_cast(parts); + int offset_arcs, num_arcs; + + // TODO(atm): make it possible to penalize wrong predicate parts as well? + // Or unlabeled arcs in addition to labeled arcs? 
+ if (pipe_->GetSemanticOptions()->labeled()) { + semantic_parts->GetOffsetLabeledArc(&offset_arcs, &num_arcs); + } else { + semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); + } + + //////////////////////////////////////////////////// + // F1: a = 0.5, b = 0.5. + // Recall: a = 0, b = 1. + // In general: + // p = a - (a+b)*z0 + // q = b*sum(z0) + // p'*z + q = a*sum(z) - (a+b)*z0'*z + b*sum(z0) + // = a*(1-z0)'*z + b*(1-z)'*z0. + //////////////////////////////////////////////////// + + // Penalty for predicting 1 when it is 0 (FP). + double a = FLAGS_srl_train_cost_false_positives; + // Penalty for predicting 0 when it is 1 (FN). + double b = FLAGS_srl_train_cost_false_negatives; + + // p = 0.5-z0, q = 0.5'*z0, loss = p'*z + q + double q = 0.0; + vector p(num_arcs, 0.0); + + vector scores_cost = scores; + for (int r = 0; r < num_arcs; ++r) { + p[r] = a - (a + b) * gold_output[offset_arcs + r]; + scores_cost[offset_arcs + r] += p[r]; + q += b*gold_output[offset_arcs + r]; + } + + Decode(instance, parts, scores_cost, predicted_output); + + *cost = q; + for (int r = 0; r < num_arcs; ++r) { + *cost += p[r] * (*predicted_output)[offset_arcs + r]; + } + + *loss = *cost; + for (int r = 0; r < parts->size(); ++r) { + *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); + } + +#if 0 + for (int k = 0; k < 2; ++k) { + const vector *output; + string type; + if (k == 0) { + type = "gold"; + output = &gold_output; + } else { + type = "predicted"; + output = predicted_output; + } + for (int r = 0; r < parts->size(); ++r) { + int offset_pred_, offset_arcs_, offset_labeled_arcs_, offset_siblings_, + offset_consecutive_siblings_, num_; + semantic_parts->GetOffsetPredicate(&offset_pred_, &num_); + semantic_parts->GetOffsetArc(&offset_arcs_, &num_); + semantic_parts->GetOffsetLabeledArc(&offset_labeled_arcs_, &num_); + semantic_parts->GetOffsetSibling(&offset_siblings_, &num_); + semantic_parts->GetOffsetConsecutiveSibling(&offset_consecutive_siblings_, &num_); + if 
(r >= offset_consecutive_siblings_) { + SemanticPartConsecutiveSibling *sibling = + static_cast((*parts)[r]); + if ((*output)[r] > 0) { + LOG(INFO) << type << " consec sibling: " << "[" << r << "]" << " " + << sibling->predicate() << " " + << sibling->sense() << " " + << sibling->first_argument() << " " + << sibling->second_argument() << " " + << (*output)[r]; + } + } else if (r >= offset_siblings_) { + SemanticPartSibling *sibling = + static_cast((*parts)[r]); + if ((*output)[r] > 0) { + LOG(INFO) << type << " sibling: " << "[" << r << "]" << " " + << sibling->predicate() << " " + << sibling->sense() << " " + << sibling->first_argument() << " " + << sibling->second_argument() << " " + << (*output)[r]; + } + } else if (r >= offset_labeled_arcs_) { + SemanticPartLabeledArc *labeled_arc = + static_cast((*parts)[r]); + if ((*output)[r] > 0) { + LOG(INFO) << type << " labeled_arc: " << "[" << r << "]" << " " + << labeled_arc->predicate() << " " + << labeled_arc->sense() << " " + << labeled_arc->argument() << " " + << labeled_arc->role() << " " + << (*output)[r]; + } + } else if (r >= offset_arcs_) { + SemanticPartArc *arc = + static_cast((*parts)[r]); + if ((*output)[r] > 0) { + LOG(INFO) << type << " arc: " << "[" << r << "]" << " " + << arc->predicate() << " " + << arc->sense() << " " + << arc->argument() << " " + << (*output)[r]; + } + } else if (r >= offset_pred_) { + SemanticPartPredicate *predicate = + static_cast((*parts)[r]); + if ((*output)[r] > 0) { + LOG(INFO) << type << " predicate: " << "[" << r << "]" << " " + << predicate->predicate() << " " + << predicate->sense() << " " + << (*output)[r]; + } + } else { + CHECK(false); + } + } + } +#endif +} + +void SemanticDecoder::DecodeMarginals(Instance *instance, Parts *parts, + const vector &scores, + const vector &gold_output, + vector *predicted_output, + double *entropy, + double *loss) { + SemanticParts *semantic_parts = static_cast(parts); + + // Right now, only allow marginal inference for arc-factored 
models. + CHECK(semantic_parts->IsArcFactored()); + + // Create copy of the scores. + vector copied_scores(scores); + vector total_scores; + vector label_marginals; + int offset_predicate_parts, num_predicate_parts; + int offset_arcs, num_arcs; + int offset_labeled_arcs, num_labeled_arcs; + semantic_parts->GetOffsetPredicate(&offset_predicate_parts, + &num_predicate_parts); + semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); + semantic_parts->GetOffsetLabeledArc(&offset_labeled_arcs, + &num_labeled_arcs); + + // If labeled parsing, decode the labels and update the scores. + if (pipe_->GetSemanticOptions()->labeled()) { + DecodeLabelMarginals(instance, parts, copied_scores, &total_scores, + &label_marginals); + for (int r = 0; r < total_scores.size(); ++r) { + // Sum the "labeled" scores to the (eventually) already existing + // "unlabeled" scores. + copied_scores[offset_arcs + r] += total_scores[r]; + } + } + + predicted_output->clear(); + predicted_output->resize(parts->size(), 0.0); + + double log_partition_function; + DecodeBasicMarginals(instance, parts, copied_scores, predicted_output, + &log_partition_function, entropy); + + // If labeled parsing, write the components of the predicted output that + // correspond to the labeled parts. + if (pipe_->GetSemanticOptions()->labeled()) { + for (int r = 0; r < num_labeled_arcs; ++r) { + SemanticPartLabeledArc *labeled_arc = + static_cast( + (*parts)[offset_labeled_arcs + r]); + int index_arc = semantic_parts->FindArc(labeled_arc->predicate(), + labeled_arc->argument(), + labeled_arc->sense()); + CHECK_GE(index_arc, 0); + (*predicted_output)[offset_labeled_arcs + r] = + label_marginals[r] * (*predicted_output)[index_arc]; + } + + // Recompute the entropy. 
+ *entropy = log_partition_function; + for (int r = 0; r < num_predicate_parts; ++r) { + *entropy -= (*predicted_output)[offset_predicate_parts + r] * + scores[offset_predicate_parts + r]; + } + for (int r = 0; r < num_arcs; ++r) { + *entropy -= (*predicted_output)[offset_arcs + r] * + scores[offset_arcs + r]; + } + for (int r = 0; r < num_labeled_arcs; ++r) { + *entropy -= (*predicted_output)[offset_labeled_arcs + r] * + scores[offset_labeled_arcs + r]; + } + if (*entropy < 0.0) { + LOG(INFO) << "Entropy truncated to zero (" << *entropy << ")"; + *entropy = 0.0; + } + } + + *loss = *entropy; + for (int r = 0; r < parts->size(); ++r) { + *loss += scores[r] * ((*predicted_output)[r] - gold_output[r]); + } + if (*loss < 0.0) { + LOG(INFO) << "Loss truncated to zero (" << *loss << ")"; + *loss = 0.0; + } +} + +// Decode the best label for each candidate arc. The output vector +// best_labeled_parts, indexed by the unlabeled arcs, contains the indices +// of the best labeled part for each arc. +void SemanticDecoder::DecodeLabels(Instance *instance, Parts *parts, + const vector &scores, + vector *best_labeled_parts) { + SemanticParts *semantic_parts = static_cast(parts); + + int offset, num_arcs; + semantic_parts->GetOffsetArc(&offset, &num_arcs); + best_labeled_parts->resize(num_arcs); + for (int r = 0; r < num_arcs; ++r) { + SemanticPartArc *arc = + static_cast((*parts)[offset + r]); + const vector &index_labeled_parts = + semantic_parts->FindLabeledArcs(arc->predicate(), + arc->argument(), + arc->sense()); + // Find the best label for each candidate arc. + int best_label = -1; + double best_score; + for (int k = 0; k < index_labeled_parts.size(); ++k) { + if (best_label < 0 || + scores[index_labeled_parts[k]] > best_score) { + best_label = index_labeled_parts[k]; + best_score = scores[best_label]; + } + } + (*best_labeled_parts)[r] = best_label; + } +} + +// Decode the label marginals for each candidate arc. 
The output vector +// total_scores contains the sum of exp-scores (over the labels) for each arc; +// label_marginals contains those marginals ignoring the tree constraint. +void SemanticDecoder::DecodeLabelMarginals(Instance *instance, Parts *parts, + const vector &scores, + vector *total_scores, + vector *label_marginals) { + SemanticParts *semantic_parts = static_cast(parts); + + int offset, num_arcs; + int offset_labeled, num_labeled_arcs; + semantic_parts->GetOffsetArc(&offset, &num_arcs); + semantic_parts->GetOffsetLabeledArc(&offset_labeled, &num_labeled_arcs); + total_scores->clear(); + total_scores->resize(num_arcs, 0.0); + label_marginals->clear(); + label_marginals->resize(num_labeled_arcs, 0.0); + + for (int r = 0; r < num_arcs; ++r) { + SemanticPartArc *arc = + static_cast((*parts)[offset + r]); + const vector &index_labeled_parts = + semantic_parts->FindLabeledArcs(arc->predicate(), + arc->argument(), + arc->sense()); + // Find the best label for each candidate arc. + LogValD total_score = LogValD::Zero(); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + total_score += LogValD(scores[index_labeled_parts[k]], false); + } + (*total_scores)[r] = total_score.logabs(); + double sum = 0.0; + for (int k = 0; k < index_labeled_parts.size(); ++k) { + LogValD marginal = + LogValD(scores[index_labeled_parts[k]], false) / total_score; + (*label_marginals)[index_labeled_parts[k] - offset_labeled] = + marginal.as_float(); + sum += marginal.as_float(); + } + if (!NEARLY_EQ_TOL(sum, 1.0, 1e-9)) { + LOG(INFO) << "Label marginals don't sum to one: sum = " << sum; + } + } +} + +void SemanticDecoder::Decode(Instance *instance, Parts *parts, + const vector &scores, + vector *predicted_output) { + SemanticParts *semantic_parts = static_cast(parts); + + // Create copy of the scores. 
+ vector copied_scores(scores); + vector best_labeled_parts; + int offset_labeled_arcs, num_labeled_arcs; + semantic_parts->GetOffsetLabeledArc(&offset_labeled_arcs, + &num_labeled_arcs); + int offset_arcs, num_arcs; + semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); + int offset_predicate_parts, num_predicate_parts; + semantic_parts->GetOffsetPredicate(&offset_predicate_parts, + &num_predicate_parts); + + bool labeled_decoding = false; + // TODO: change this test. + int offset_labeled_siblings, num_labeled_siblings; + semantic_parts->GetOffsetLabeledSibling(&offset_labeled_siblings, + &num_labeled_siblings); + if (num_labeled_siblings > 0) labeled_decoding = true; + if (pipe_->GetSemanticOptions()->deterministic_labels()) { + labeled_decoding = true; + } + + if (labeled_decoding) { + predicted_output->clear(); + predicted_output->resize(parts->size(), 0.0); + + DecodeFactorGraph(instance, parts, copied_scores, labeled_decoding, true, + predicted_output); + + // At test time, run a basic decoder on top of the outcome of AD3 + // as a rounding heuristic to make sure we get a valid graph. + // TODO: maybe change the interface to AD3 to let us implement + // a primal rounding heuristic. + if (pipe_->GetSemanticOptions()->test()) { + double threshold = 0.5; + for (int r = 0; r < num_arcs; ++r) { + copied_scores[offset_arcs + r] = 0.0; + } + for (int r = 0; r < num_predicate_parts; ++r) { + copied_scores[offset_predicate_parts + r] = 0.0; + } + for (int r = 0; r < num_labeled_arcs; ++r) { + copied_scores[offset_labeled_arcs + r] = + (*predicted_output)[offset_labeled_arcs + r] - threshold; + } + + DecodeLabels(instance, parts, copied_scores, &best_labeled_parts); + for (int r = 0; r < best_labeled_parts.size(); ++r) { + // Sum the "labeled" scores to the (eventually) already existing + // "unlabeled" scores. 
+ copied_scores[offset_arcs + r] += copied_scores[best_labeled_parts[r]]; + } + + double value; + predicted_output->assign(parts->size(), 0.0); + DecodeBasic(instance, parts, copied_scores, predicted_output, &value); + + // Write the components of the predicted output that + // correspond to the labeled parts. + for (int r = 0; r < num_arcs; ++r) { + CHECK_GE(best_labeled_parts[r], offset_arcs + num_arcs); + (*predicted_output)[best_labeled_parts[r]] = + (*predicted_output)[offset_arcs + r]; + } + } + } else { + // If labeled parsing, decode the labels and update the scores. + if (pipe_->GetSemanticOptions()->labeled()) { + DecodeLabels(instance, parts, copied_scores, &best_labeled_parts); + for (int r = 0; r < best_labeled_parts.size(); ++r) { + // Sum the "labeled" scores to the (eventually) already existing + // "unlabeled" scores. + copied_scores[offset_arcs + r] += copied_scores[best_labeled_parts[r]]; + } + } + + predicted_output->clear(); + predicted_output->resize(parts->size(), 0.0); + + if (semantic_parts->IsArcFactored() || + semantic_parts->IsLabeledArcFactored()) { + double value; + DecodeBasic(instance, parts, copied_scores, predicted_output, &value); + } else { + DecodeFactorGraph(instance, parts, copied_scores, labeled_decoding, true, + predicted_output); + + // At test time, run a basic decoder on top of the outcome of AD3 + // as a rounding heuristic to make sure we get a valid graph. + // TODO: maybe change the interface to AD3 to let us implement + // a primal rounding heuristic. + if (pipe_->GetSemanticOptions()->test()) { + double threshold = 0.5; + for (int r = 0; r < num_arcs; ++r) { + copied_scores[offset_arcs + r] = + (*predicted_output)[offset_arcs + r] - threshold; + } + for (int r = 0; r < num_predicate_parts; ++r) { + copied_scores[offset_predicate_parts + r] = 0.0; + } + // This is not strictly necessary (since labeled arcs are not used by + // DecodeBasic), but should not harm. 
+ for (int r = 0; r < num_labeled_arcs; ++r) { + copied_scores[offset_labeled_arcs + r] = 0.0; + } + predicted_output->assign(parts->size(), 0.0); + double value; + DecodeBasic(instance, parts, copied_scores, predicted_output, &value); + } + } + + // If labeled parsing, write the components of the predicted output that + // correspond to the labeled parts. + if (pipe_->GetSemanticOptions()->labeled()) { + for (int r = 0; r < num_arcs; ++r) { + CHECK_GE(best_labeled_parts[r], offset_arcs + num_arcs); + (*predicted_output)[best_labeled_parts[r]] = + (*predicted_output)[offset_arcs + r]; + } + } + } +} + +// Build predicate and arc indices. +void SemanticDecoder::BuildBasicIndices( + int sentence_length, + const vector &predicate_parts, + const vector &arcs, + vector > *index_predicates, + vector > > *arcs_by_predicate) { + int num_arcs = arcs.size(); + int num_predicate_parts = predicate_parts.size(); + + arcs_by_predicate->assign(sentence_length, vector >()); + for (int r = 0; r < num_arcs; ++r) { + int p = arcs[r]->predicate(); + int s = arcs[r]->sense(); + if (s >= (*arcs_by_predicate)[p].size()) { + (*arcs_by_predicate)[p].resize(s + 1); + } + (*arcs_by_predicate)[p][s].push_back(r); + } + + index_predicates->assign(sentence_length, vector()); + for (int r = 0; r < num_predicate_parts; ++r) { + CHECK_LT(r, predicate_parts.size()); + int p = predicate_parts[r]->predicate(); + int s = predicate_parts[r]->sense(); + if (s >= (*index_predicates)[p].size()) { + (*index_predicates)[p].resize(s + 1, -1); + } + (*index_predicates)[p][s] = r; + } +} + +void SemanticDecoder::DecodePruner(Instance *instance, Parts *parts, + const vector &scores, + vector *predicted_output) { + //DecodePrunerNaive(instance, parts, scores, predicted_output); + //return; + + int sentence_length = + static_cast(instance)->size(); + SemanticParts *semantic_parts = static_cast(parts); + double posterior_threshold = + pipe_->GetSemanticOptions()->GetPrunerPosteriorThreshold(); + int max_arguments 
= pipe_->GetSemanticOptions()->GetPrunerMaxArguments(); + if (max_arguments < 0) max_arguments = sentence_length; + + int offset_predicate_parts, num_predicate_parts; + int offset_arcs, num_arcs; + semantic_parts->GetOffsetPredicate(&offset_predicate_parts, + &num_predicate_parts); + semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); + + vector arcs(num_arcs); + vector scores_arcs(num_arcs); + for (int r = 0; r < num_arcs; ++r) { + arcs[r] = static_cast((*parts)[offset_arcs + r]); + scores_arcs[r] = scores[offset_arcs + r]; + } + + predicted_output->clear(); + predicted_output->resize(parts->size(), 0.0); + + CHECK(semantic_parts->IsArcFactored()); + + vector > > arcs_by_predicate; + arcs_by_predicate.resize(sentence_length); + for (int r = 0; r < num_arcs; ++r) { + int p = arcs[r]->predicate(); + int s = arcs[r]->sense(); + if (s >= arcs_by_predicate[p].size()) { + arcs_by_predicate[p].resize(s + 1); + } + arcs_by_predicate[p][s].push_back(r); + } + + double entropy; + double log_partition_function; + vector posteriors; + DecodeBasicMarginals(instance, parts, scores, &posteriors, + &log_partition_function, &entropy); + + // Get max_arguments argumens per predicate. 
+ int num_used_parts = 0; + for (int p = 0; p < sentence_length; ++p) { + for (int s = 0; s < arcs_by_predicate[p].size(); ++s) { + vector > scores_arguments; + for (int a = 1; a < sentence_length; ++a) { + int r = semantic_parts->FindArc(p, a, s); + if (r < 0) continue; + scores_arguments.push_back(pair(-posteriors[r], r)); + } + if (scores_arguments.size() == 0) continue; + sort(scores_arguments.begin(), scores_arguments.end()); + double max_posterior = 1.0; // -scores_arguments[0].first; + for (int k = 0; k < max_arguments && k < scores_arguments.size(); ++k) { + int r = scores_arguments[k].second; + if (-scores_arguments[k].first >= posterior_threshold * max_posterior) { + ++num_used_parts; + (*predicted_output)[r] = 1.0; + } else { + break; + } + } + } + } + + VLOG(2) << "Pruning reduced to " + << static_cast(num_used_parts) / + static_cast(sentence_length) + << " candidate heads per word."; +} + +void SemanticDecoder::DecodePrunerNaive(Instance *instance, Parts *parts, + const vector &scores, + vector *predicted_output) { + int sentence_length = + static_cast(instance)->size(); + SemanticParts *semantic_parts = static_cast(parts); + int max_arguments = pipe_->GetSemanticOptions()->GetPrunerMaxArguments(); + int offset_predicate_parts, num_predicate_parts; + int offset_arcs, num_arcs; + semantic_parts->GetOffsetPredicate(&offset_predicate_parts, + &num_predicate_parts); + semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); + + vector arcs(num_arcs); + vector scores_arcs(num_arcs); + for (int r = 0; r < num_arcs; ++r) { + arcs[r] = static_cast((*parts)[offset_arcs + r]); + scores_arcs[r] = scores[offset_arcs + r]; + } + + predicted_output->clear(); + predicted_output->resize(parts->size(), 0.0); + for (int r = 0; r < num_predicate_parts; ++r) { + // Don't prune any of the predicate parts. 
+ (*predicted_output)[offset_predicate_parts + r] = 1.0; + } + + CHECK(semantic_parts->IsArcFactored()); + + vector > > arcs_by_predicate; + arcs_by_predicate.resize(sentence_length); + for (int r = 0; r < num_arcs; ++r) { + int p = arcs[r]->predicate(); + int s = arcs[r]->sense(); + if (s >= arcs_by_predicate[p].size()) { + arcs_by_predicate[p].resize(s + 1); + } + arcs_by_predicate[p][s].push_back(r); + } + + // Get max_arguments argumens per predicate. + for (int p = 0; p < sentence_length; ++p) { + for (int s = 0; s < arcs_by_predicate[p].size(); ++s) { + vector > scores_arguments; + for (int a = 1; a < sentence_length; ++a) { + int r = semantic_parts->FindArc(p, a, s); + if (r < 0) continue; + scores_arguments.push_back(pair(-scores[r], r)); + } + sort(scores_arguments.begin(), scores_arguments.end()); + for (int k = 0; k < max_arguments && k < scores_arguments.size(); ++k) { + int r = scores_arguments[k].second; + (*predicted_output)[r] = 1.0; + //LOG(INFO) << "Keeping arc (" << p << ", " + // << s << ", " + // << static_cast((*parts)[r])->argument() + // << ")."; + } + } + } +} + +// Decoder for the basic model. For each predicate, choose the best +// sense and the best set of arcs independently. 
+void SemanticDecoder::DecodeBasic(Instance *instance, Parts *parts,
+                                  const vector<double> &scores,
+                                  vector<double> *predicted_output,
+                                  double *value) {
+  // MAP decoding for the basic model. Splits the score vector into its
+  // predicate and arc segments, delegates the search to DecodeSemanticGraph(),
+  // and writes a 0/1 indicator for every predicate and arc part into
+  // *predicted_output. *value is filled in by DecodeSemanticGraph().
+  int sentence_length =
+    static_cast<SemanticInstanceNumeric*>(instance)->size();
+  SemanticParts *semantic_parts = static_cast<SemanticParts*>(parts);
+  int offset_predicate_parts, num_predicate_parts;
+  int offset_arcs, num_arcs;
+  semantic_parts->GetOffsetPredicate(&offset_predicate_parts,
+                                     &num_predicate_parts);
+  semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs);
+
+  // Collect the arc parts together with their scores.
+  vector<SemanticPartArc*> arcs(num_arcs);
+  vector<double> scores_arcs(num_arcs);
+  for (int r = 0; r < num_arcs; ++r) {
+    arcs[r] = static_cast<SemanticPartArc*>((*parts)[offset_arcs + r]);
+    scores_arcs[r] = scores[offset_arcs + r];
+  }
+
+  // Collect the predicate parts together with their scores.
+  vector<SemanticPartPredicate*> predicate_parts(num_predicate_parts);
+  vector<double> scores_predicates(num_predicate_parts);
+  for (int r = 0; r < num_predicate_parts; ++r) {
+    predicate_parts[r] =
+      static_cast<SemanticPartPredicate*>((*parts)[offset_predicate_parts + r]);
+    scores_predicates[r] = scores[offset_predicate_parts + r];
+  }
+
+  // Index the parts by predicate position and sense, then decode.
+  vector<vector<vector<int> > > arcs_by_predicate;
+  vector<vector<int> > index_predicates;
+  vector<bool> selected_predicates;
+  vector<bool> selected_arcs;
+  BuildBasicIndices(sentence_length, predicate_parts, arcs, &index_predicates,
+                    &arcs_by_predicate);
+
+  DecodeSemanticGraph(sentence_length, predicate_parts, arcs, index_predicates,
+                      arcs_by_predicate, scores_predicates, scores_arcs,
+                      &selected_predicates, &selected_arcs, value);
+
+  // Only the predicate and arc segments are written below; resize() keeps
+  // whatever values any other entries of *predicted_output already hold.
+  predicted_output->resize(parts->size());
+  for (int r = 0; r < num_predicate_parts; ++r) {
+    (*predicted_output)[offset_predicate_parts + r] =
+      selected_predicates[r] ? 1.0 : 0.0;
+  }
+  for (int r = 0; r < num_arcs; ++r) {
+    (*predicted_output)[offset_arcs + r] = selected_arcs[r] ? 1.0 : 0.0;
+  }
+}
+
+// Marginal inference for the basic model. For each predicate, compute the
+// marginals of its senses and of its arcs independently.
+void SemanticDecoder::DecodeBasicMarginals(Instance *instance, Parts *parts,
+                                           const vector<double> &scores,
+                                           vector<double> *predicted_output,
+                                           double *log_partition_function,
+                                           double *entropy) {
+  // Every predicate word is treated independently: it takes either the null
+  // sense (no outgoing arcs) or one of its candidate senses, and under a given
+  // sense each candidate arc is independently present or absent. The marginal
+  // of every predicate and arc part is written into *predicted_output, while
+  // *log_partition_function and *entropy are accumulated over all predicates.
+  int sentence_length =
+    static_cast<SemanticInstanceNumeric*>(instance)->size();
+  SemanticParts *semantic_parts = static_cast<SemanticParts*>(parts);
+  int offset_predicate_parts, num_predicate_parts;
+  int offset_arcs, num_arcs;
+  semantic_parts->GetOffsetPredicate(&offset_predicate_parts,
+                                     &num_predicate_parts);
+  semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs);
+
+  vector<SemanticPartArc*> arcs(num_arcs);
+  vector<double> scores_arcs(num_arcs);
+  for (int r = 0; r < num_arcs; ++r) {
+    arcs[r] = static_cast<SemanticPartArc*>((*parts)[offset_arcs + r]);
+    scores_arcs[r] = scores[offset_arcs + r];
+  }
+
+  // arcs_by_predicate[p][s] lists the arcs (indices relative to offset_arcs)
+  // leaving predicate position p under sense s.
+  vector<vector<vector<int> > > arcs_by_predicate;
+  arcs_by_predicate.resize(sentence_length);
+  for (int r = 0; r < num_arcs; ++r) {
+    int p = arcs[r]->predicate();
+    int s = arcs[r]->sense();
+    if (s >= arcs_by_predicate[p].size()) {
+      arcs_by_predicate[p].resize(s + 1);
+    }
+    arcs_by_predicate[p][s].push_back(r);
+  }
+
+  // index_predicates[p][s] is the predicate part (index relative to
+  // offset_predicate_parts) for position p and sense s, or -1 if none exists.
+  vector<double> scores_predicates(num_predicate_parts);
+  vector<vector<int> > index_predicates(sentence_length);
+  for (int r = 0; r < num_predicate_parts; ++r) {
+    scores_predicates[r] = scores[offset_predicate_parts + r];
+    SemanticPartPredicate *predicate_part =
+      static_cast<SemanticPartPredicate*>((*parts)[offset_predicate_parts + r]);
+    int p = predicate_part->predicate();
+    int s = predicate_part->sense();
+    if (s >= index_predicates[p].size()) {
+      index_predicates[p].resize(s + 1, -1);
+    }
+    index_predicates[p][s] = r;
+  }
+
+  predicted_output->resize(parts->size());
+  for (int r = 0; r < num_predicate_parts; ++r) {
+    (*predicted_output)[offset_predicate_parts + r] = 0.0;
+  }
+  for (int r = 0; r < num_arcs; ++r) {
+    (*predicted_output)[offset_arcs + r] = 0.0;
+  }
+
+  *log_partition_function = 0.0;
+  *entropy = 0.0;
+  for (int p = 0; p < sentence_length; ++p) {
+    const int num_senses = arcs_by_predicate[p].size();
+    const int num_indexed_senses = index_predicates[p].size();
+
+    // Initialize log partition all senses to exp(0.0) to account for
+    // the null sense which implies there are no outgoing arcs.
+    LogValD log_partition_all_senses = LogValD::One();
+    vector<LogValD> log_partition_senses(num_senses, LogValD::Zero());
+    vector<vector<LogValD> > log_partition_arcs(num_senses);
+    for (int s = 0; s < num_senses; ++s) {
+      // A sense slot without a predicate part (a gap left by resize, or a
+      // sense that only appears on arcs) has no score to read: it keeps zero
+      // mass instead of indexing scores_predicates out of bounds.
+      const int r = (s < num_indexed_senses) ? index_predicates[p][s] : -1;
+      if (r < 0) continue;
+      double score = scores_predicates[r];
+      // Initialize log partition arcs to exp(0.0) to account for the
+      // event that the arc does not exist.
+      const int num_sense_arcs = arcs_by_predicate[p][s].size();
+      log_partition_arcs[s].assign(num_sense_arcs, LogValD::One());
+      for (int k = 0; k < num_sense_arcs; ++k) {
+        const int arc = arcs_by_predicate[p][s][k];
+        // NOTE(review): LogValD(x, false) is presumably built from a value
+        // that is already in the log domain -- confirm against logval.h.
+        log_partition_arcs[s][k] += LogValD(scores_arcs[arc], false);
+        score += log_partition_arcs[s][k].logabs();
+      }
+      log_partition_senses[s] = LogValD(score, false);
+      log_partition_all_senses += log_partition_senses[s];
+    }
+
+    // This makes sure the log partition function does not become -infty for
+    // predicates that do not have any sense.
+    if (num_senses > 0) {
+      if (false && sentence_length < 5) {
+        LOG(INFO) << "Log partition[" << p << "] = "
+                  << log_partition_all_senses.logabs();
+      }
+      *log_partition_function += log_partition_all_senses.logabs();
+    }
+
+    for (int s = 0; s < num_senses; ++s) {
+      // Senses skipped above have zero mass; their outputs stay at 0.0.
+      const int r = (s < num_indexed_senses) ? index_predicates[p][s] : -1;
+      if (r < 0) continue;
+      double predicate_marginal = LogValD(log_partition_senses[s].logabs() -
+                                          log_partition_all_senses.logabs(),
+                                          false).as_float();
+      (*predicted_output)[offset_predicate_parts + r] = predicate_marginal;
+      *entropy -= scores_predicates[r] * predicate_marginal;
+      const int num_sense_arcs = arcs_by_predicate[p][s].size();
+      for (int k = 0; k < num_sense_arcs; ++k) {
+        int r = arcs_by_predicate[p][s][k];
+        // Arc marginal given the sense, times the marginal of the sense.
+        double marginal = LogValD(scores_arcs[r] -
+                                  log_partition_arcs[s][k].logabs(),
+                                  false).as_float();
+        marginal *= predicate_marginal;
+        if (false && sentence_length < 5) {
+          LOG(INFO) << "marginal[" << p << "][" << s << "][" << k << "] = "
+                    << marginal << "\t"
+                    << "scores_arcs[" << p << "][" << s << "][" << k << "] = "
+                    << scores_arcs[r];
+        }
+        (*predicted_output)[offset_arcs + r] = marginal;
+        *entropy -= scores_arcs[r] * marginal;
+      }
+    }
+  }
+
+  // The loops above subtracted the expected score; adding the log-partition
+  // function turns the accumulator into the entropy.
+  *entropy += *log_partition_function;
+  if (false && sentence_length < 5) {
+    LOG(INFO) << "Log-partition function:" << *log_partition_function;
+    LOG(INFO) << "Entropy:" << *entropy;
+  }
+}
+
+// Decoder for the basic model. For each predicate, choose the best
+// sense and the best set of arcs independently.
+void SemanticDecoder::DecodeSemanticGraph(
+    int sentence_length,
+    const vector<SemanticPartPredicate*> &predicate_parts,
+    const vector<SemanticPartArc*> &arcs,
+    const vector<vector<int> > &index_predicates,
+    const vector<vector<vector<int> > > &arcs_by_predicate,
+    const vector<double> &predicate_scores,
+    const vector<double> &arc_scores,
+    vector<bool> *selected_predicates,
+    vector<bool> *selected_arcs,
+    double *value) {
+  // index_predicates[p][s] is the predicate part for position p and sense s
+  // (negative when there is none); arcs_by_predicate[p][s] lists the arcs
+  // leaving p under sense s. On return, *selected_predicates and
+  // *selected_arcs flag the chosen parts and *value holds their total score.
+  int num_predicate_parts = predicate_parts.size();
+  int num_arcs = arcs.size();
+
+  selected_predicates->assign(num_predicate_parts, false);
+  selected_arcs->assign(num_arcs, false);
+
+  double total_score = 0.0;
+  for (int p = 0; p < sentence_length; ++p) {
+    const vector<vector<int> > &arcs_by_sense = arcs_by_predicate[p];
+    const vector<int> &part_of_sense = index_predicates[p];
+    const int num_senses = arcs_by_sense.size();
+    const int num_indexed_senses = part_of_sense.size();
+
+    // The null sense (-1, nothing selected) scores 0.0; a real sense has to
+    // beat that strictly.
+    int best_sense = -1;
+    double best_score = 0.0;
+    for (int s = 0; s < num_senses; ++s) {
+      // Validate the part index before it is used: a sense slot with no
+      // predicate part cannot be selected, so it is skipped.
+      const int r = (s < num_indexed_senses) ? part_of_sense[s] : -1;
+      if (r < 0) continue;
+      // Compute the best assignment of arcs departing from this
+      // predicate word: keep exactly the arcs with a positive score.
+      double score = predicate_scores[r];
+      const int num_sense_arcs = arcs_by_sense[s].size();
+      for (int k = 0; k < num_sense_arcs; ++k) {
+        const int arc = arcs_by_sense[s][k];
+        if (arc_scores[arc] > 0.0) score += arc_scores[arc];
+      }
+      // Note: we're allowing a non-null sense (!= -1) without outgoing arcs.
+      if (score > best_score) {
+        best_sense = s;
+        best_score = score;
+      }
+    }
+    if (best_sense < 0) continue;
+
+    total_score += best_score;
+    const int r = part_of_sense[best_sense];
+    CHECK_GE(r, 0);
+    (*selected_predicates)[r] = true;
+    // Re-apply the positive-score test used while scoring the winning sense.
+    const int num_best_arcs = arcs_by_sense[best_sense].size();
+    for (int k = 0; k < num_best_arcs; ++k) {
+      const int arc = arcs_by_sense[best_sense][k];
+      if (arc_scores[arc] > 0.0) (*selected_arcs)[arc] = true;
+    }
+  }
+  *value = total_score;
+}
+
+// Decode building a factor graph and calling the AD3 algorithm.
+void SemanticDecoder::DecodeFactorGraph(Instance *instance, Parts *parts, + const vector &scores, + bool labeled_decoding, + bool relax, + vector *predicted_output) { + SemanticParts *semantic_parts = static_cast(parts); + SemanticInstanceNumeric* sentence = + static_cast(instance); + CHECK(relax); + + // Get the offsets for the different parts. + int offset_predicate_parts, num_predicate_parts; + semantic_parts->GetOffsetPredicate(&offset_predicate_parts, + &num_predicate_parts); + int offset_arcs, num_arcs; + semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); + int offset_labeled_arcs, num_labeled_arcs; + semantic_parts->GetOffsetLabeledArc(&offset_labeled_arcs, + &num_labeled_arcs); + int offset_siblings, num_siblings; + semantic_parts->GetOffsetSibling(&offset_siblings, &num_siblings); + int offset_labeled_siblings, num_labeled_siblings; + semantic_parts->GetOffsetLabeledSibling(&offset_labeled_siblings, + &num_labeled_siblings); + int offset_consecutive_siblings, num_consecutive_siblings; + semantic_parts->GetOffsetConsecutiveSibling(&offset_consecutive_siblings, + &num_consecutive_siblings); + int offset_grandparents, num_grandparents; + semantic_parts->GetOffsetGrandparent(&offset_grandparents, &num_grandparents); + int offset_coparents, num_coparents; + semantic_parts->GetOffsetCoparent(&offset_coparents, &num_coparents); + int offset_consecutive_coparents, num_consecutive_coparents; + semantic_parts->GetOffsetConsecutiveCoparent(&offset_consecutive_coparents, + &num_consecutive_coparents); + +#if 0 + int offset_grandsiblings, num_grandsiblings; + semantic_parts->GetOffsetGrandSibling(&offset_grandsiblings, &num_grandsiblings); + int offset_trisiblings, num_trisiblings; + dependency_parts->GetOffsetTriSibling(&offset_trisiblings, &num_trisiblings); +#endif + + // Define what parts are used. 
+ bool use_arbitrary_sibling_parts = (num_siblings > 0); + bool use_labeled_arbitrary_sibling_parts = (num_labeled_siblings > 0); + bool use_consecutive_sibling_parts = (num_consecutive_siblings > 0); + bool use_grandparent_parts = (num_grandparents > 0); + bool use_coparent_parts = (num_coparents > 0); + bool use_consecutive_coparent_parts = (num_consecutive_coparents > 0); +#if 0 + bool use_grandsibling_parts = (num_grandsiblings > 0); + bool use_trisibling_parts = (num_trisiblings > 0); +#endif + + if (!labeled_decoding) { + CHECK_EQ(num_labeled_siblings, 0); + CHECK(!pipe_->GetSemanticOptions()->deterministic_labels()); + } + + // Variables of the factor graph. + vector variables; + + // Indices that allow to identify the part corresponding to each variable. + vector part_indices_; + vector additional_part_indices; + vector factor_part_indices_; + + // Create factor graph. + AD3::FactorGraph *factor_graph = new AD3::FactorGraph; + int verbosity = 1; //1; + if (VLOG_IS_ON(2)) { + verbosity = 2; + } + factor_graph->SetVerbosity(verbosity); + + // Build predicate part variables. + int offset_predicate_variables = variables.size(); + for (int r = 0; r < num_predicate_parts; ++r) { + AD3::BinaryVariable* variable = factor_graph->CreateBinaryVariable(); + variable->SetLogPotential(scores[offset_predicate_parts + r]); + variables.push_back(variable); + part_indices_.push_back(offset_predicate_parts + r); + } + + // Build arc variables. + int offset_arc_variables = variables.size(); + for (int r = 0; r < num_arcs; ++r) { + AD3::BinaryVariable* variable = factor_graph->CreateBinaryVariable(); + variable->SetLogPotential(scores[offset_arcs + r]); + variables.push_back(variable); + part_indices_.push_back(offset_arcs + r); + } + + int offset_labeled_arc_variables = variables.size(); + if (labeled_decoding) { + // Build labeled arc variables. 
+ for (int r = 0; r < num_labeled_arcs; ++r) { + AD3::BinaryVariable* variable = factor_graph->CreateBinaryVariable(); + variable->SetLogPotential(scores[offset_labeled_arcs + r]); + variables.push_back(variable); + part_indices_.push_back(offset_labeled_arcs + r); + } + } + + // Build basic semantic graph factor. + vector local_variables(num_predicate_parts + num_arcs); + vector predicate_parts(num_predicate_parts); + for (int r = 0; r < num_predicate_parts; ++r) { + local_variables[r] = variables[offset_predicate_variables + r]; + predicate_parts[r] = + static_cast((*parts)[offset_predicate_parts + r]); + } + vector arcs(num_arcs); + for (int r = 0; r < num_arcs; ++r) { + local_variables[num_predicate_parts + r] = + variables[offset_arc_variables + r]; + arcs[r] = static_cast((*parts)[offset_arcs + r]); + } + AD3::FactorSemanticGraph *factor = new AD3::FactorSemanticGraph; + factor->Initialize(sentence->size(), predicate_parts, arcs, this); + factor_graph->DeclareFactor(factor, local_variables, true); + factor_part_indices_.push_back(-1); + + if (labeled_decoding) { + // Build XOR-OUT factors to impose that each arc has a unique label. 
+ for (int r = 0; r < num_arcs; ++r) { + const vector &index_labeled_parts = + semantic_parts->GetLabeledParts(offset_arcs + r); + vector + local_variables(index_labeled_parts.size() + 1); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + int index_part = index_labeled_parts[k]; + CHECK_GE(index_part, 0); + CHECK_LT(offset_labeled_arc_variables + index_part - offset_labeled_arcs, + variables.size()); + local_variables[k] = variables[offset_labeled_arc_variables + + index_part - offset_labeled_arcs]; + } + CHECK_GE(offset_arc_variables + r, 0); + CHECK_LT(offset_arc_variables + r, variables.size()); + local_variables[index_labeled_parts.size()] = + variables[offset_arc_variables + r]; + factor_graph->CreateFactorXOROUT(local_variables); + factor_part_indices_.push_back(-1); + } + + // If some labels are deterministic, make sure the same role is not filled + // more than once for each predicate. + if (pipe_->GetSemanticOptions()->deterministic_labels()) { + int sentence_length = + static_cast(instance)->size(); + vector > > + labeled_arcs_by_predicate_role(sentence_length); + for (int r = 0; r < num_labeled_arcs; ++r) { + SemanticPartLabeledArc *labeled_arc = + static_cast( + (*semantic_parts)[offset_labeled_arcs + r]); + int p = labeled_arc->predicate(); + int l = labeled_arc->role(); + // Skip if role l is not deterministic. 
+ if (!pipe_->GetSemanticDictionary()->IsRoleDeterministic(l)) continue; + if (labeled_arcs_by_predicate_role[p].size() <= l) { + labeled_arcs_by_predicate_role[p].resize(l + 1); + } + labeled_arcs_by_predicate_role[p][l].push_back(r); + } + for (int p = 0; p < labeled_arcs_by_predicate_role.size(); ++p) { + for (int l = 0; l < labeled_arcs_by_predicate_role[p].size(); ++l) { + if (labeled_arcs_by_predicate_role[p][l].size() <= 1) continue; + vector + local_variables(labeled_arcs_by_predicate_role[p][l].size()); + for (int k = 0; k < labeled_arcs_by_predicate_role[p][l].size(); + ++k) { + int r = labeled_arcs_by_predicate_role[p][l][k]; + local_variables[k] = variables[offset_labeled_arc_variables + r]; + } + factor_graph->CreateFactorAtMostOne(local_variables); + factor_part_indices_.push_back(-1); + } + } + } + } + + ////////////////////////////////////////////////////////////////////// + // Build sibling factors. + ////////////////////////////////////////////////////////////////////// + if (use_arbitrary_sibling_parts) { + for (int r = 0; r < num_siblings; ++r) { + SemanticPartSibling *part = static_cast( + (*semantic_parts)[offset_siblings + r]); + int r1 = semantic_parts->FindArc(part->predicate(), + part->first_argument(), + part->sense()); + int r2 = semantic_parts->FindArc(part->predicate(), + part->second_argument(), + part->sense()); + CHECK_GE(r1, 0); + CHECK_GE(r2, 0); + vector local_variables; + local_variables.push_back(variables[r1 - offset_arcs + + offset_arc_variables]); + local_variables.push_back(variables[r2 - offset_arcs + + offset_arc_variables]); + + factor_graph->CreateFactorPAIR(local_variables, + scores[offset_siblings + r]); + // TODO: set these global indices at the end after all variables/factors + // are created. 
+ //factor->SetGlobalIndex(...); + additional_part_indices.push_back(offset_siblings + r); + factor_part_indices_.push_back(offset_siblings + r); + } + } + + ////////////////////////////////////////////////////////////////////// + // Build labeled sibling factors. + ////////////////////////////////////////////////////////////////////// + if (use_labeled_arbitrary_sibling_parts) { + CHECK(labeled_decoding); + for (int r = 0; r < num_labeled_siblings; ++r) { + SemanticPartLabeledSibling *part = + static_cast( + (*semantic_parts)[offset_labeled_siblings + r]); + int r1 = semantic_parts->FindLabeledArc(part->predicate(), + part->first_argument(), + part->sense(), + part->first_role()); + int r2 = semantic_parts->FindLabeledArc(part->predicate(), + part->second_argument(), + part->sense(), + part->second_role()); + CHECK_GE(r1, 0); + CHECK_GE(r2, 0); + vector local_variables; + local_variables.push_back(variables[r1 - offset_labeled_arcs + + offset_labeled_arc_variables]); + local_variables.push_back(variables[r2 - offset_labeled_arcs + + offset_labeled_arc_variables]); + + factor_graph->CreateFactorPAIR(local_variables, + scores[offset_labeled_siblings + r]); + // TODO: set these global indices at the end after all variables/factors + // are created. + //factor->SetGlobalIndex(...); + additional_part_indices.push_back(offset_labeled_siblings + r); + factor_part_indices_.push_back(offset_labeled_siblings + r); + } + } + + ////////////////////////////////////////////////////////////////////// + // Build consecutive sibling factors. + ////////////////////////////////////////////////////////////////////// + if (use_consecutive_sibling_parts) { + // Get all the consecutive siblings, indices, etc. + vector > predicate_part_indices(sentence->size()); + for (int r = 0; r < num_predicate_parts; ++r) { + SemanticPartPredicate* predicate_part = + static_cast( + (*parts)[offset_predicate_parts + r]); + predicate_part_indices[predicate_part->predicate()]. 
+ push_back(offset_predicate_parts + r); + } + vector > left_arc_indices(sentence->size()); + vector > right_arc_indices(sentence->size()); + for (int r = 0; r < num_arcs; ++r) { + SemanticPartArc* arc = + static_cast((*parts)[offset_arcs + r]); + // Handle self-loops (p=a) in the right side automaton. + if (arc->predicate() > arc->argument()) { + left_arc_indices[arc->predicate()].push_back(offset_arcs + r); + } else { + right_arc_indices[arc->predicate()].push_back(offset_arcs + r); + } + } + vector > + left_siblings(sentence->size()); + vector > + right_siblings(sentence->size()); + vector > left_scores(sentence->size()); + vector > right_scores(sentence->size()); + vector > left_indices(sentence->size()); + vector > right_indices(sentence->size()); + for (int r = 0; r < num_consecutive_siblings; ++r) { + SemanticPartConsecutiveSibling *sibling = + static_cast( + (*parts)[offset_consecutive_siblings + r]); + // TODO: Try to disable self loops on the left side? + // Make sure no non-basic sibling part ends up in two + // factors. + if (sibling->predicate() > sibling->second_argument()) { + // Left sibling. + left_siblings[sibling->predicate()].push_back(sibling); + left_scores[sibling->predicate()].push_back( + scores[offset_consecutive_siblings + r]); + // Save the part index to get the posterior later. + left_indices[sibling->predicate()]. + push_back(offset_consecutive_siblings + r); + } else { + CHECK(sibling->predicate() < sibling->second_argument() || + (sibling->predicate() == sibling->second_argument() && + sibling->first_argument() < 0)) + << sibling->predicate() << " " + << sibling->first_argument() << " " + << sibling->second_argument(); + // Right sibling. + right_siblings[sibling->predicate()].push_back(sibling); + right_scores[sibling->predicate()].push_back( + scores[offset_consecutive_siblings + r]); + // Save the part index to get the posterior later. + right_indices[sibling->predicate()]. 
+ push_back(offset_consecutive_siblings + r); + } + } + + // Now, go through each predicate and create left and right automata. + for (int p = 0; p < sentence->size(); ++p) { + // Build left head automaton. + if (predicate_part_indices[p].size() == 0) { + CHECK_EQ(left_arc_indices[p].size(), 0); + CHECK_EQ(right_arc_indices[p].size(), 0); + CHECK_EQ(left_siblings[p].size(), 0); + CHECK_EQ(right_siblings[p].size(), 0); + continue; + } + vector local_variables; + vector predicate_senses; + vector left_arcs; + for (int s = 0; s < predicate_part_indices[p].size(); ++s) { + int r = predicate_part_indices[p][s]; + int index = offset_predicate_variables + r - offset_predicate_parts; + local_variables.push_back(variables[index]); + SemanticPartPredicate *predicate = + static_cast((*parts)[r]); + predicate_senses.push_back(predicate); + } + for (int k = 0; k < left_arc_indices[p].size(); ++k) { + int r = left_arc_indices[p][k]; + int index = offset_arc_variables + r - offset_arcs; + local_variables.push_back(variables[index]); + SemanticPartArc *arc = + static_cast((*parts)[r]); + left_arcs.push_back(arc); + } + + AD3::FactorPredicateAutomaton *factor = new AD3::FactorPredicateAutomaton; + factor->Initialize(false, predicate_senses, left_arcs, left_siblings[p]); + factor->SetAdditionalLogPotentials(left_scores[p]); + factor_graph->DeclareFactor(factor, local_variables, true); + factor_part_indices_.push_back(-1); + additional_part_indices.insert(additional_part_indices.end(), + left_indices[p].begin(), + left_indices[p].end()); + + // Build right head automaton. 
+ local_variables.clear(); + predicate_senses.clear(); + vector right_arcs; + for (int s = 0; s < predicate_part_indices[p].size(); ++s) { + int r = predicate_part_indices[p][s]; + CHECK_GE(r, 0); + int index = offset_predicate_variables + r - offset_predicate_parts; + local_variables.push_back(variables[index]); + SemanticPartPredicate *predicate = + static_cast((*parts)[r]); + predicate_senses.push_back(predicate); + } + for (int k = 0; k < right_arc_indices[p].size(); ++k) { + int r = right_arc_indices[p][k]; + CHECK_GE(r, 0); + int index = offset_arc_variables + r - offset_arcs; + local_variables.push_back(variables[index]); + SemanticPartArc *arc = + static_cast((*parts)[r]); + right_arcs.push_back(arc); + } + + factor = new AD3::FactorPredicateAutomaton; + factor->Initialize(true, predicate_senses, right_arcs, right_siblings[p]); + factor->SetAdditionalLogPotentials(right_scores[p]); + factor_graph->DeclareFactor(factor, local_variables, true); + factor_part_indices_.push_back(-1); + additional_part_indices.insert(additional_part_indices.end(), + right_indices[p].begin(), + right_indices[p].end()); + } + } + + ////////////////////////////////////////////////////////////////////// + // Build consecutive co-parent factors. + ////////////////////////////////////////////////////////////////////// + if (use_consecutive_coparent_parts) { + // Get all the consecutive co-parents, indices, etc. + // Note: contrarily to the consecutive siblings, here a left arc is one + // whose predicate is on the left of the argument. 
+ vector > left_arc_indices(sentence->size()); + vector > right_arc_indices(sentence->size()); + for (int r = 0; r < num_arcs; ++r) { + SemanticPartArc* arc = + static_cast((*parts)[offset_arcs + r]); + if (arc->predicate() < arc->argument()) { + left_arc_indices[arc->argument()].push_back(offset_arcs + r); + } else { + right_arc_indices[arc->argument()].push_back(offset_arcs + r); + } + } + vector > + left_coparents(sentence->size()); + vector > + right_coparents(sentence->size()); + vector > left_scores(sentence->size()); + vector > right_scores(sentence->size()); + vector > left_indices(sentence->size()); + vector > right_indices(sentence->size()); + for (int r = 0; r < num_consecutive_coparents; ++r) { + SemanticPartConsecutiveCoparent *coparent = + static_cast( + (*parts)[offset_consecutive_coparents + r]); + if (coparent->argument() > coparent->second_predicate()) { + // Left co-parent. + left_coparents[coparent->argument()].push_back(coparent); + left_scores[coparent->argument()].push_back( + scores[offset_consecutive_coparents + r]); + // Save the part index to get the posterior later. + left_indices[coparent->argument()]. + push_back(offset_consecutive_coparents + r); + } else { + CHECK(coparent->argument() < coparent->second_predicate() || + (coparent->argument() == coparent->second_predicate() && + coparent->first_predicate() < 0)); + // Right co-parent. + right_coparents[coparent->argument()].push_back(coparent); + right_scores[coparent->argument()].push_back( + scores[offset_consecutive_coparents + r]); + // Save the part index to get the posterior later. + right_indices[coparent->argument()]. + push_back(offset_consecutive_coparents + r); + } + } + + // Now, go through each argument and create left and right automata. + for (int a = 1; a < sentence->size(); ++a) { + // Build left argument automaton. 
+ vector local_variables; + vector left_arcs; + for (int k = 0; k < left_arc_indices[a].size(); ++k) { + int r = left_arc_indices[a][k]; + int index = offset_arc_variables + r - offset_arcs; + local_variables.push_back(variables[index]); + SemanticPartArc *arc = + static_cast((*parts)[r]); + left_arcs.push_back(arc); + } + + AD3::FactorArgumentAutomaton *factor = new AD3::FactorArgumentAutomaton; + factor->Initialize(a, false, left_arcs, left_coparents[a]); + factor->SetAdditionalLogPotentials(left_scores[a]); + factor_graph->DeclareFactor(factor, local_variables, true); + factor_part_indices_.push_back(-1); + additional_part_indices.insert(additional_part_indices.end(), + left_indices[a].begin(), + left_indices[a].end()); + + // Build right head automaton. + local_variables.clear(); + vector right_arcs; + for (int k = 0; k < right_arc_indices[a].size(); ++k) { + int r = right_arc_indices[a][k]; + int index = offset_arc_variables + r - offset_arcs; + local_variables.push_back(variables[index]); + SemanticPartArc *arc = + static_cast((*parts)[r]); + right_arcs.push_back(arc); + } + + factor = new AD3::FactorArgumentAutomaton; + factor->Initialize(a, true, right_arcs, right_coparents[a]); + factor->SetAdditionalLogPotentials(right_scores[a]); + factor_graph->DeclareFactor(factor, local_variables, true); + factor_part_indices_.push_back(-1); + additional_part_indices.insert(additional_part_indices.end(), + right_indices[a].begin(), + right_indices[a].end()); + } + } + + ////////////////////////////////////////////////////////////////////// + // Build grandparent factors. 
+ ////////////////////////////////////////////////////////////////////// + if (use_grandparent_parts) { + for (int r = 0; r < num_grandparents; ++r) { + SemanticPartGrandparent *part = static_cast( + (*semantic_parts)[offset_grandparents + r]); + int r1 = semantic_parts->FindArc(part->grandparent_predicate(), + part->predicate(), + part->grandparent_sense()); + int r2 = semantic_parts->FindArc(part->predicate(), + part->argument(), + part->sense()); + CHECK_GE(r1, 0); + CHECK_GE(r2, 0); + vector local_variables; + local_variables.push_back(variables[r1 - offset_arcs + + offset_arc_variables]); + local_variables.push_back(variables[r2 - offset_arcs + + offset_arc_variables]); + + factor_graph->CreateFactorPAIR(local_variables, + scores[offset_grandparents + r]); + // TODO: set these global indices at the end after all variables/factors + // are created. + //factor->SetGlobalIndex(...); + additional_part_indices.push_back(offset_grandparents + r); + factor_part_indices_.push_back(offset_grandparents + r); + } + } + + ////////////////////////////////////////////////////////////////////// + // Build co-parent factors. + ////////////////////////////////////////////////////////////////////// + if (use_coparent_parts) { + for (int r = 0; r < num_coparents; ++r) { + SemanticPartCoparent *part = static_cast( + (*semantic_parts)[offset_coparents + r]); + int r1 = semantic_parts->FindArc(part->first_predicate(), + part->argument(), + part->first_sense()); + int r2 = semantic_parts->FindArc(part->second_predicate(), + part->argument(), + part->second_sense()); + CHECK_GE(r1, 0); + CHECK_GE(r2, 0); + vector local_variables; + local_variables.push_back(variables[r1 - offset_arcs + + offset_arc_variables]); + local_variables.push_back(variables[r2 - offset_arcs + + offset_arc_variables]); + + factor_graph->CreateFactorPAIR(local_variables, + scores[offset_coparents + r]); + // TODO: set these global indices at the end after all variables/factors + // are created. 
+ //factor->SetGlobalIndex(...); + additional_part_indices.push_back(offset_coparents + r); + factor_part_indices_.push_back(offset_coparents + r); + } + } + + ////////////////////////////////////////////////////////////////////////////// + + CHECK_EQ(variables.size(), part_indices_.size()); + CHECK_EQ(factor_graph->GetNumFactors(), factor_part_indices_.size()); + + // Compute additional_part_indices_. + int offset = factor_graph->GetNumVariables(); + for (int i = 0; i < factor_graph->GetNumFactors(); ++i) { + offset += factor_graph->GetFactor(i)->GetAdditionalLogPotentials().size(); + } + CHECK_EQ(additional_part_indices.size(), + offset - factor_graph->GetNumVariables()); + // Concatenate part_indices and additional_part_indices. + part_indices_.insert(part_indices_.end(), + additional_part_indices.begin(), + additional_part_indices.end()); + + VLOG(2) << "Number of factors: " << factor_graph->GetNumFactors(); + VLOG(2) << "Number of variables: " << factor_graph->GetNumVariables(); + +#if 0 + LOG(INFO) << "Number of factors: " << factor_graph->GetNumFactors(); + LOG(INFO) << "Number of variables: " << factor_graph->GetNumVariables(); + LOG(INFO) << "Number of siblings: " << num_siblings; + LOG(INFO) << "part_indices_.size() = " << part_indices_.size(); + LOG(INFO) << "additional_part_indices.size() = " << additional_part_indices.size(); + LOG(INFO) << "factor_part_indices_.size() = " << factor_part_indices_.size(); +#endif + + vector recomputed_indices(part_indices_.size(), -1); + bool solved = false; + + //#define PRINT_GRAPH +#ifdef PRINT_GRAPH + ofstream stream; + stream.open("tmp.fg", ofstream::out | ofstream::app); + CHECK(stream.good()); + factor_graph->Print(stream); + stream << endl; + stream.flush(); + stream.clear(); + stream.close(); +#endif + + vector posteriors; + vector additional_posteriors; + double value_ref; + double *value = &value_ref; + + //factor_graph->SetMaxIterationsAD3(2000); + factor_graph->SetMaxIterationsAD3(500); + 
factor_graph->SetEtaAD3(0.05); + factor_graph->AdaptEtaAD3(true); + factor_graph->SetResidualThresholdAD3(1e-3); + //factor_graph->SetResidualThresholdAD3(1e-6); + // Run AD3. - timeval start, end; - gettimeofday(&start, NULL); - if (!solved) { - factor_graph->SolveLPMAPWithAD3(&posteriors, &additional_posteriors, value); - } - gettimeofday(&end, NULL); - double elapsed_time = diff_ms(end, start); - VLOG(2) << "Elapsed time (AD3) = " << elapsed_time - << " (" << sentence->size() << ") "; - - delete factor_graph; - - *value = 0.0; - predicted_output->assign(parts->size(), 0.0); - for (int i = 0; i < part_indices_.size(); ++i) { - int r = part_indices_[i]; - if (r < 0) continue; - if (i < posteriors.size()) { - (*predicted_output)[r] = posteriors[i]; - } else { - int j = i - posteriors.size(); - (*predicted_output)[r] = additional_posteriors[j]; -#if 0 - /// - CHECK_GE(r, offset_siblings); - CHECK_LT(j + 1, factor_graph->GetNumFactors()); - AD3::Factor *factor = factor_graph->GetFactor(j + 1); - CHECK_EQ(factor->Degree(), 2) << j + 1 << " " << factor->GetId(); - AD3::BinaryVariable *var1 = factor->GetVariable(0); - AD3::BinaryVariable *var2 = factor->GetVariable(1); - int i1 = var1->GetId(); - int i2 = var2->GetId(); - if (posteriors[i1] > 0.9 && posteriors[i2] > 0.9) { - CHECK_GE(additional_posteriors[j], 0.9); - } - int r1 = offset_arcs + i1 - offset_arc_variables; - int r2 = offset_arcs + i2 - offset_arc_variables; - CHECK_EQ(posteriors[i1], (*predicted_output)[r1]); - CHECK_EQ(posteriors[i2], (*predicted_output)[r2]); - SemanticPartArc* arc1 = static_cast((*semantic_parts)[r1]); - SemanticPartArc* arc2 = static_cast((*semantic_parts)[r2]); - SemanticPartSibling* part = static_cast((*semantic_parts)[r]); - CHECK_EQ(arc1->predicate(), part->predicate()); - CHECK_EQ(arc1->sense(), part->sense()); - CHECK_EQ(arc1->argument(), part->first_argument()); - CHECK_EQ(arc2->predicate(), part->predicate()); - CHECK_EQ(arc2->sense(), part->sense()); - 
CHECK_EQ(arc2->argument(), part->second_argument()); - - if (part->predicate() == 9) { - LOG(INFO) << "*** sibling " - << "[" << r << " " << r1 << " " << r2 << "] " - << part->predicate() << " " - << part->sense() << " " - << part->first_argument() << " " - << part->second_argument() << " = " - << (*predicted_output)[r] << " " - << (*predicted_output)[r1] << " " - << (*predicted_output)[r2]; - } - /// -#endif - } - *value += (*predicted_output)[r] * scores[r]; - } - -#if 0 - ////// - for (int r = 0; r < parts->size(); ++r) { - if (r >= offset_labeled_arcs && r < offset_labeled_arcs + num_labeled_arcs) { - (*predicted_output)[r] = 0.0; - } else { - CHECK_GE((*predicted_output)[r], -1e-12); - } + chronowrap::Chronometer chrono; + chrono.GetTime(); + if (!solved) { + factor_graph->SolveLPMAPWithAD3(&posteriors, &additional_posteriors, value); } - /////// - - delete factor_graph; -#endif - - VLOG(2) << "Solution value (AD3) = " << *value; -} + chrono.StopTime(); + double elapsed_time = chrono.GetElapsedTime(); + VLOG(2) << "Elapsed time (AD3) = " << elapsed_time << " sec." 
+ << " (" << sentence->size() << ") "; + + delete factor_graph; + + *value = 0.0; + predicted_output->assign(parts->size(), 0.0); + for (int i = 0; i < part_indices_.size(); ++i) { + int r = part_indices_[i]; + if (r < 0) continue; + if (i < posteriors.size()) { + (*predicted_output)[r] = posteriors[i]; + } else { + int j = i - posteriors.size(); + (*predicted_output)[r] = additional_posteriors[j]; +#if 0 + /// + CHECK_GE(r, offset_siblings); + CHECK_LT(j + 1, factor_graph->GetNumFactors()); + AD3::Factor *factor = factor_graph->GetFactor(j + 1); + CHECK_EQ(factor->Degree(), 2) << j + 1 << " " << factor->GetId(); + AD3::BinaryVariable *var1 = factor->GetVariable(0); + AD3::BinaryVariable *var2 = factor->GetVariable(1); + int i1 = var1->GetId(); + int i2 = var2->GetId(); + if (posteriors[i1] > 0.9 && posteriors[i2] > 0.9) { + CHECK_GE(additional_posteriors[j], 0.9); + } + int r1 = offset_arcs + i1 - offset_arc_variables; + int r2 = offset_arcs + i2 - offset_arc_variables; + CHECK_EQ(posteriors[i1], (*predicted_output)[r1]); + CHECK_EQ(posteriors[i2], (*predicted_output)[r2]); + SemanticPartArc* arc1 = static_cast((*semantic_parts)[r1]); + SemanticPartArc* arc2 = static_cast((*semantic_parts)[r2]); + SemanticPartSibling* part = static_cast((*semantic_parts)[r]); + CHECK_EQ(arc1->predicate(), part->predicate()); + CHECK_EQ(arc1->sense(), part->sense()); + CHECK_EQ(arc1->argument(), part->first_argument()); + CHECK_EQ(arc2->predicate(), part->predicate()); + CHECK_EQ(arc2->sense(), part->sense()); + CHECK_EQ(arc2->argument(), part->second_argument()); + + if (part->predicate() == 9) { + LOG(INFO) << "*** sibling " + << "[" << r << " " << r1 << " " << r2 << "] " + << part->predicate() << " " + << part->sense() << " " + << part->first_argument() << " " + << part->second_argument() << " = " + << (*predicted_output)[r] << " " + << (*predicted_output)[r1] << " " + << (*predicted_output)[r2]; + } + /// +#endif + } + *value += (*predicted_output)[r] * scores[r]; + } + +#if 0 
+ ////// + for (int r = 0; r < parts->size(); ++r) { + if (r >= offset_labeled_arcs && r < offset_labeled_arcs + num_labeled_arcs) { + (*predicted_output)[r] = 0.0; + } else { + CHECK_GE((*predicted_output)[r], -1e-12); + } + } + /////// + + delete factor_graph; +#endif + + VLOG(2) << "Solution value (AD3) = " << *value; +} diff --git a/src/semantic_parser/SemanticDictionary.h b/src/semantic_parser/SemanticDictionary.h index 6e25bfd..f348f5f 100644 --- a/src/semantic_parser/SemanticDictionary.h +++ b/src/semantic_parser/SemanticDictionary.h @@ -1,351 +1,351 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef SEMANTICDICTIONARY_H_ -#define SEMANTICDICTIONARY_H_ - -#include "Dictionary.h" -#include "TokenDictionary.h" -#include "DependencyDictionary.h" -#include "SerializationUtils.h" -#include "SemanticPredicate.h" -#include "SemanticReader.h" - -class Pipe; - -enum SpecialPredicates { - PREDICATE_UNKNOWN = 0, - NUM_SPECIAL_PREDICATES -}; - -enum SpecialDependencyPaths { - PATH_UNKNOWN = 0, - NUM_SPECIAL_PATHS -}; - -class SemanticDictionary : public Dictionary { -public: - SemanticDictionary() { token_dictionary_ = NULL; } - SemanticDictionary(Pipe* pipe) : pipe_(pipe) {} - virtual ~SemanticDictionary() { - Clear(); - } - - void CreatePredicateRoleDictionaries(SemanticReader *reader); - - void Clear() { - // Don't clear token_dictionary, since this class does not own it. - for (int i = 0; i < lemma_predicates_.size(); ++i) { - for (int j = 0; j < lemma_predicates_[i].size(); ++j) { - delete lemma_predicates_[i][j]; - } - lemma_predicates_[i].clear(); - } - lemma_predicates_.clear(); - predicate_alphabet_.clear(); - role_alphabet_.clear(); - relation_path_alphabet_.clear(); - pos_path_alphabet_.clear(); - existing_roles_.clear(); - existing_roles_with_relation_path_.clear(); - maximum_left_distances_.clear(); - maximum_right_distances_.clear(); - } - - void BuildPredicateRoleNames() { - predicate_alphabet_.BuildNames(); - role_alphabet_.BuildNames(); - relation_path_alphabet_.BuildNames(); - pos_path_alphabet_.BuildNames(); - } - - const vector &GetLemmaPredicates(int lemma) const { - return lemma_predicates_[lemma]; - } - - const string &GetPredicateName(int predicate) const { - return predicate_alphabet_.GetName(predicate); - } - - const string &GetRoleName(int role) const { - return role_alphabet_.GetName(role); - } - - int GetRoleBigramLabel(int first_role, int second_role) const { - CHECK_GE(first_role, 0); - CHECK_GE(second_role, 0); - return first_role * role_alphabet_.size() + second_role; - } - - int GetNumRoleBigramLabels() const { - return 
role_alphabet_.size()*role_alphabet_.size(); - } - - int GetNumRoles() const { - return role_alphabet_.size(); - } - - bool IsFrequentRolePair(int first_role, int second_role) const { - int label_bigram = GetRoleBigramLabel(first_role, second_role); - return frequent_role_pairs_.find(label_bigram) != - frequent_role_pairs_.end(); - } - - bool IsRoleDeterministic(int role) const { - return deterministic_roles_[role]; - } - - const string &GetRelationPathName(int path) const { - return relation_path_alphabet_.GetName(path); - } - - const string &GetPosPathName(int path) const { - return pos_path_alphabet_.GetName(path); - } - - // TODO(atm): check if we should allow/stop growth of the other dictionaries - // as well. - void AllowGrowth() { token_dictionary_->AllowGrowth(); } - void StopGrowth() { token_dictionary_->StopGrowth(); } - - void Save(FILE *fs) { - if (0 > predicate_alphabet_.Save(fs)) CHECK(false); - if (0 > role_alphabet_.Save(fs)) CHECK(false); - if (0 > relation_path_alphabet_.Save(fs)) CHECK(false); - if (0 > pos_path_alphabet_.Save(fs)) CHECK(false); - bool success; - int length = lemma_predicates_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int i = 0; i < lemma_predicates_.size(); ++i) { - length = lemma_predicates_[i].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int j = 0; j < lemma_predicates_[i].size(); ++j) { - lemma_predicates_[i][j]->Save(fs); - } - } - CHECK_EQ(deterministic_roles_.size(), GetNumRoles()); - length = deterministic_roles_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int i = 0; i < deterministic_roles_.size(); ++i) { - bool deterministic = deterministic_roles_[i]; - success = WriteBool(fs, deterministic); - CHECK(success); - } - length = existing_roles_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int i = 0; i < existing_roles_.size(); ++i) { - length = existing_roles_[i].size(); - success = WriteInteger(fs, length); - 
CHECK(success); - for (int j = 0; j < existing_roles_[i].size(); ++j) { - length = existing_roles_[i][j].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int k = 0; k < existing_roles_[i][j].size(); ++k) { - int label = existing_roles_[i][j][k]; - success = WriteInteger(fs, label); - CHECK(success); - } - int distance; - distance = maximum_left_distances_[i][j]; - success = WriteInteger(fs, distance); - CHECK(success); - distance = maximum_right_distances_[i][j]; - success = WriteInteger(fs, distance); - CHECK(success); - } - } - length = existing_roles_with_relation_path_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int i = 0; i < existing_roles_with_relation_path_.size(); ++i) { - length = existing_roles_with_relation_path_[i].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int k = 0; k < existing_roles_with_relation_path_[i].size(); ++k) { - int label = existing_roles_with_relation_path_[i][k]; - success = WriteInteger(fs, label); - CHECK(success); - } - } - length = frequent_role_pairs_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (set::iterator it = frequent_role_pairs_.begin(); - it != frequent_role_pairs_.end(); - ++it) { - int label_bigram = *it; - success = WriteInteger(fs, label_bigram); - CHECK(success); - } - } - - void Load(FILE *fs) { - if (0 > predicate_alphabet_.Load(fs)) CHECK(false); - if (0 > role_alphabet_.Load(fs)) CHECK(false); - if (0 > relation_path_alphabet_.Load(fs)) CHECK(false); - if (0 > pos_path_alphabet_.Load(fs)) CHECK(false); - bool success; - int length; - success = ReadInteger(fs, &length); - CHECK(success); - lemma_predicates_.resize(length); - for (int i = 0; i < lemma_predicates_.size(); ++i) { - success = ReadInteger(fs, &length); - CHECK(success); - lemma_predicates_[i].resize(length); - for (int j = 0; j < lemma_predicates_[i].size(); ++j) { - lemma_predicates_[i][j] = new SemanticPredicate(); - lemma_predicates_[i][j]->Load(fs); 
- } - } - success = ReadInteger(fs, &length); - CHECK(success); - deterministic_roles_.resize(length); - CHECK_EQ(deterministic_roles_.size(), GetNumRoles()); - for (int i = 0; i < deterministic_roles_.size(); ++i) { - bool deterministic; - success = ReadBool(fs, &deterministic); - CHECK(success); - deterministic_roles_[i] = deterministic; - } - success = ReadInteger(fs, &length); - CHECK(success); - existing_roles_.resize(length); - maximum_left_distances_.resize(length); - maximum_right_distances_.resize(length); - for (int i = 0; i < existing_roles_.size(); ++i) { - success = ReadInteger(fs, &length); - CHECK(success); - existing_roles_[i].resize(length); - maximum_left_distances_[i].resize(length); - maximum_right_distances_[i].resize(length); - for (int j = 0; j < existing_roles_[i].size(); ++j) { - success = ReadInteger(fs, &length); - CHECK(success); - existing_roles_[i][j].resize(length); - for (int k = 0; k < existing_roles_[i][j].size(); ++k) { - int label; - success = ReadInteger(fs, &label); - CHECK(success); - existing_roles_[i][j][k] = label; - } - int distance; - success = ReadInteger(fs, &distance); - CHECK(success); - maximum_left_distances_[i][j] = distance; - success = ReadInteger(fs, &distance); - CHECK(success); - maximum_right_distances_[i][j] = distance; - } - } - success = ReadInteger(fs, &length); - CHECK(success); - existing_roles_with_relation_path_.resize(length); - for (int i = 0; i < existing_roles_with_relation_path_.size(); ++i) { - success = ReadInteger(fs, &length); - CHECK(success); - existing_roles_with_relation_path_[i].resize(length); - for (int k = 0; k < existing_roles_with_relation_path_[i].size(); ++k) { - int label; - success = ReadInteger(fs, &label); - CHECK(success); - existing_roles_with_relation_path_[i][k] = label; - } - } - success = ReadInteger(fs, &length); - CHECK(success); - frequent_role_pairs_.clear(); - for (int k = 0; k < length; ++k) { - int label_bigram; - success = ReadInteger(fs, &label_bigram); - 
CHECK(success); - frequent_role_pairs_.insert(label_bigram); - } - BuildPredicateRoleNames(); - } - - Pipe *GetPipe() const { return pipe_; } - - TokenDictionary *GetTokenDictionary() const { return token_dictionary_; } - void SetTokenDictionary(TokenDictionary *token_dictionary) { - token_dictionary_ = token_dictionary; - //CHECK(token_dictionary_ == NULL); - } - - DependencyDictionary *GetDependencyDictionary() const { - return dependency_dictionary_; - } - void SetDependencyDictionary(DependencyDictionary *dependency_dictionary) { - dependency_dictionary_ = dependency_dictionary; - //CHECK(token_dictionary_ == NULL); - } - - const vector &GetExistingRoles(int predicate_pos_id, int argument_pos_id) { - return existing_roles_[predicate_pos_id][argument_pos_id]; - } - - const vector &GetExistingRolesWithRelationPath(int relation_path_id) { - return existing_roles_with_relation_path_[relation_path_id]; - } - - int GetMaximumLeftDistance(int predicate_pos_id, int argument_pos_id) { - return maximum_left_distances_[predicate_pos_id][argument_pos_id]; - } - - int GetMaximumRightDistance(int predicate_pos_id, int argument_pos_id) { - return maximum_right_distances_[predicate_pos_id][argument_pos_id]; - } - - const Alphabet &GetPredicateAlphabet() const { return predicate_alphabet_; }; - const Alphabet &GetRoleAlphabet() const { return role_alphabet_; }; - const Alphabet &GetRelationPathAlphabet() const { - return relation_path_alphabet_; - }; - const Alphabet &GetPosPathAlphabet() const { return pos_path_alphabet_; }; - - void ComputeDependencyPath(SemanticInstance *instance, - int p, int a, - string *relation_path, - string *pos_path) const; - -protected: - int FindLowestCommonAncestor(const vector& heads, int p, int a) const; - -protected: - Pipe *pipe_; - TokenDictionary *token_dictionary_; - DependencyDictionary *dependency_dictionary_; - vector > lemma_predicates_; - Alphabet predicate_alphabet_; - Alphabet role_alphabet_; - Alphabet relation_path_alphabet_; - 
Alphabet pos_path_alphabet_; - vector deterministic_roles_; - vector > > existing_roles_; - vector > existing_roles_with_relation_path_; - vector > maximum_left_distances_; - vector > maximum_right_distances_; - set frequent_role_pairs_; -}; - -#endif /* SEMANTICDICTIONARY_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef SEMANTICDICTIONARY_H_ +#define SEMANTICDICTIONARY_H_ + +#include "Dictionary.h" +#include "TokenDictionary.h" +#include "DependencyDictionary.h" +#include "SerializationUtils.h" +#include "SemanticPredicate.h" +#include "SemanticReader.h" + +class Pipe; + +enum SpecialPredicates { + PREDICATE_UNKNOWN = 0, + NUM_SPECIAL_PREDICATES +}; + +enum SpecialDependencyPaths { + PATH_UNKNOWN = 0, + NUM_SPECIAL_PATHS +}; + +class SemanticDictionary : public Dictionary { +public: + SemanticDictionary() { token_dictionary_ = NULL; } + SemanticDictionary(Pipe* pipe) : pipe_(pipe) {} + virtual ~SemanticDictionary() { + Clear(); + } + + void CreatePredicateRoleDictionaries(SemanticReader *reader); + + void Clear() { + // Don't clear token_dictionary, since this class does not own it. 
+ for (int i = 0; i < lemma_predicates_.size(); ++i) { + for (int j = 0; j < lemma_predicates_[i].size(); ++j) { + delete lemma_predicates_[i][j]; + } + lemma_predicates_[i].clear(); + } + lemma_predicates_.clear(); + predicate_alphabet_.clear(); + role_alphabet_.clear(); + relation_path_alphabet_.clear(); + pos_path_alphabet_.clear(); + existing_roles_.clear(); + existing_roles_with_relation_path_.clear(); + maximum_left_distances_.clear(); + maximum_right_distances_.clear(); + } + + void BuildPredicateRoleNames() { + predicate_alphabet_.BuildNames(); + role_alphabet_.BuildNames(); + relation_path_alphabet_.BuildNames(); + pos_path_alphabet_.BuildNames(); + } + + const vector &GetLemmaPredicates(int lemma) const { + return lemma_predicates_[lemma]; + } + + const string &GetPredicateName(int predicate) const { + return predicate_alphabet_.GetName(predicate); + } + + const string &GetRoleName(int role) const { + return role_alphabet_.GetName(role); + } + + int GetRoleBigramLabel(int first_role, int second_role) const { + CHECK_GE(first_role, 0); + CHECK_GE(second_role, 0); + return first_role * role_alphabet_.size() + second_role; + } + + int GetNumRoleBigramLabels() const { + return role_alphabet_.size()*role_alphabet_.size(); + } + + int GetNumRoles() const { + return role_alphabet_.size(); + } + + bool IsFrequentRolePair(int first_role, int second_role) const { + int label_bigram = GetRoleBigramLabel(first_role, second_role); + return frequent_role_pairs_.find(label_bigram) != + frequent_role_pairs_.end(); + } + + bool IsRoleDeterministic(int role) const { + return deterministic_roles_[role]; + } + + const string &GetRelationPathName(int path) const { + return relation_path_alphabet_.GetName(path); + } + + const string &GetPosPathName(int path) const { + return pos_path_alphabet_.GetName(path); + } + + // TODO(atm): check if we should allow/stop growth of the other dictionaries + // as well. 
+ void AllowGrowth() { token_dictionary_->AllowGrowth(); } + void StopGrowth() { token_dictionary_->StopGrowth(); } + + void Save(FILE *fs) { + if (0 > predicate_alphabet_.Save(fs)) CHECK(false); + if (0 > role_alphabet_.Save(fs)) CHECK(false); + if (0 > relation_path_alphabet_.Save(fs)) CHECK(false); + if (0 > pos_path_alphabet_.Save(fs)) CHECK(false); + bool success; + int length = (int)lemma_predicates_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int i = 0; i < lemma_predicates_.size(); ++i) { + length = (int)lemma_predicates_[i].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < lemma_predicates_[i].size(); ++j) { + lemma_predicates_[i][j]->Save(fs); + } + } + CHECK_EQ(deterministic_roles_.size(), GetNumRoles()); + length = (int)deterministic_roles_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int i = 0; i < deterministic_roles_.size(); ++i) { + bool deterministic = deterministic_roles_[i]; + success = WriteBool(fs, deterministic); + CHECK(success); + } + length = (int)existing_roles_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int i = 0; i < existing_roles_.size(); ++i) { + length = (int)existing_roles_[i].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < existing_roles_[i].size(); ++j) { + length = (int)existing_roles_[i][j].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int k = 0; k < existing_roles_[i][j].size(); ++k) { + int label = existing_roles_[i][j][k]; + success = WriteInteger(fs, label); + CHECK(success); + } + int distance; + distance = maximum_left_distances_[i][j]; + success = WriteInteger(fs, distance); + CHECK(success); + distance = maximum_right_distances_[i][j]; + success = WriteInteger(fs, distance); + CHECK(success); + } + } + length = (int)existing_roles_with_relation_path_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int i = 0; i < 
existing_roles_with_relation_path_.size(); ++i) { + length = (int)existing_roles_with_relation_path_[i].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int k = 0; k < existing_roles_with_relation_path_[i].size(); ++k) { + int label = existing_roles_with_relation_path_[i][k]; + success = WriteInteger(fs, label); + CHECK(success); + } + } + length = (int)frequent_role_pairs_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (set::iterator it = frequent_role_pairs_.begin(); + it != frequent_role_pairs_.end(); + ++it) { + int label_bigram = *it; + success = WriteInteger(fs, label_bigram); + CHECK(success); + } + } + + void Load(FILE *fs) { + if (0 > predicate_alphabet_.Load(fs)) CHECK(false); + if (0 > role_alphabet_.Load(fs)) CHECK(false); + if (0 > relation_path_alphabet_.Load(fs)) CHECK(false); + if (0 > pos_path_alphabet_.Load(fs)) CHECK(false); + bool success; + int length; + success = ReadInteger(fs, &length); + CHECK(success); + lemma_predicates_.resize(length); + for (int i = 0; i < lemma_predicates_.size(); ++i) { + success = ReadInteger(fs, &length); + CHECK(success); + lemma_predicates_[i].resize(length); + for (int j = 0; j < lemma_predicates_[i].size(); ++j) { + lemma_predicates_[i][j] = new SemanticPredicate(); + lemma_predicates_[i][j]->Load(fs); + } + } + success = ReadInteger(fs, &length); + CHECK(success); + deterministic_roles_.resize(length); + CHECK_EQ(deterministic_roles_.size(), GetNumRoles()); + for (int i = 0; i < deterministic_roles_.size(); ++i) { + bool deterministic; + success = ReadBool(fs, &deterministic); + CHECK(success); + deterministic_roles_[i] = deterministic; + } + success = ReadInteger(fs, &length); + CHECK(success); + existing_roles_.resize(length); + maximum_left_distances_.resize(length); + maximum_right_distances_.resize(length); + for (int i = 0; i < existing_roles_.size(); ++i) { + success = ReadInteger(fs, &length); + CHECK(success); + existing_roles_[i].resize(length); + 
maximum_left_distances_[i].resize(length); + maximum_right_distances_[i].resize(length); + for (int j = 0; j < existing_roles_[i].size(); ++j) { + success = ReadInteger(fs, &length); + CHECK(success); + existing_roles_[i][j].resize(length); + for (int k = 0; k < existing_roles_[i][j].size(); ++k) { + int label; + success = ReadInteger(fs, &label); + CHECK(success); + existing_roles_[i][j][k] = label; + } + int distance; + success = ReadInteger(fs, &distance); + CHECK(success); + maximum_left_distances_[i][j] = distance; + success = ReadInteger(fs, &distance); + CHECK(success); + maximum_right_distances_[i][j] = distance; + } + } + success = ReadInteger(fs, &length); + CHECK(success); + existing_roles_with_relation_path_.resize(length); + for (int i = 0; i < existing_roles_with_relation_path_.size(); ++i) { + success = ReadInteger(fs, &length); + CHECK(success); + existing_roles_with_relation_path_[i].resize(length); + for (int k = 0; k < existing_roles_with_relation_path_[i].size(); ++k) { + int label; + success = ReadInteger(fs, &label); + CHECK(success); + existing_roles_with_relation_path_[i][k] = label; + } + } + success = ReadInteger(fs, &length); + CHECK(success); + frequent_role_pairs_.clear(); + for (int k = 0; k < length; ++k) { + int label_bigram; + success = ReadInteger(fs, &label_bigram); + CHECK(success); + frequent_role_pairs_.insert(label_bigram); + } + BuildPredicateRoleNames(); + } + + Pipe *GetPipe() const { return pipe_; } + + TokenDictionary *GetTokenDictionary() const { return token_dictionary_; } + void SetTokenDictionary(TokenDictionary *token_dictionary) { + token_dictionary_ = token_dictionary; + //CHECK(token_dictionary_ == NULL); + } + + DependencyDictionary *GetDependencyDictionary() const { + return dependency_dictionary_; + } + void SetDependencyDictionary(DependencyDictionary *dependency_dictionary) { + dependency_dictionary_ = dependency_dictionary; + //CHECK(token_dictionary_ == NULL); + } + + const vector &GetExistingRoles(int 
predicate_pos_id, int argument_pos_id) { + return existing_roles_[predicate_pos_id][argument_pos_id]; + } + + const vector &GetExistingRolesWithRelationPath(int relation_path_id) { + return existing_roles_with_relation_path_[relation_path_id]; + } + + int GetMaximumLeftDistance(int predicate_pos_id, int argument_pos_id) { + return maximum_left_distances_[predicate_pos_id][argument_pos_id]; + } + + int GetMaximumRightDistance(int predicate_pos_id, int argument_pos_id) { + return maximum_right_distances_[predicate_pos_id][argument_pos_id]; + } + + const Alphabet &GetPredicateAlphabet() const { return predicate_alphabet_; }; + const Alphabet &GetRoleAlphabet() const { return role_alphabet_; }; + const Alphabet &GetRelationPathAlphabet() const { + return relation_path_alphabet_; + }; + const Alphabet &GetPosPathAlphabet() const { return pos_path_alphabet_; }; + + void ComputeDependencyPath(SemanticInstance *instance, + int p, int a, + string *relation_path, + string *pos_path) const; + +protected: + int FindLowestCommonAncestor(const vector& heads, int p, int a) const; + +protected: + Pipe *pipe_; + TokenDictionary *token_dictionary_; + DependencyDictionary *dependency_dictionary_; + vector > lemma_predicates_; + Alphabet predicate_alphabet_; + Alphabet role_alphabet_; + Alphabet relation_path_alphabet_; + Alphabet pos_path_alphabet_; + vector deterministic_roles_; + vector > > existing_roles_; + vector > existing_roles_with_relation_path_; + vector > maximum_left_distances_; + vector > maximum_right_distances_; + set frequent_role_pairs_; +}; + +#endif /* SEMANTICDICTIONARY_H_ */ diff --git a/src/semantic_parser/SemanticFeatures.h b/src/semantic_parser/SemanticFeatures.h index 79c2e74..b57be13 100644 --- a/src/semantic_parser/SemanticFeatures.h +++ b/src/semantic_parser/SemanticFeatures.h @@ -1,222 +1,222 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef SEMANTICFEATURES_H_ -#define SEMANTICFEATURES_H_ - -#include "Features.h" -#include "SemanticInstanceNumeric.h" -#include "FeatureEncoder.h" - -class SemanticOptions; - -class SemanticFeatures : public Features { -public: - SemanticFeatures() {}; - SemanticFeatures(Pipe* pipe) { pipe_ = pipe; } - virtual ~SemanticFeatures() { Clear(); } - -public: - void Clear() { - CHECK_EQ(input_features_.size(), input_labeled_features_.size()); - for (int r = 0; r < input_features_.size(); ++r) { - if (input_features_[r]) { - input_features_[r]->clear(); - delete input_features_[r]; - input_features_[r] = NULL; - } - if (input_labeled_features_[r]) { - input_labeled_features_[r]->clear(); - delete input_labeled_features_[r]; - input_labeled_features_[r] = NULL; - } - } - input_features_.clear(); - input_labeled_features_.clear(); - } - - void Initialize(Instance *instance, Parts *parts) { - Clear(); - input_features_.resize(parts->size(), static_cast(NULL)); - input_labeled_features_.resize(parts->size(), - static_cast(NULL)); - } - - int GetNumPartFeatures(int r) const { - return (NULL == input_features_[r]) ? 0 : input_features_[r]->size(); - }; - - int GetNumLabeledPartFeatures(int r) const { - return (NULL == input_labeled_features_[r]) ? 
- 0 : input_labeled_features_[r]->size(); - }; - - int GetPartFeature(int r, int j) const { - return (*input_features_[r])[j]; - } - - int GetLabeledPartFeature(int r, int j) const { - return (*input_labeled_features_[r])[j]; - } - - const BinaryFeatures &GetPartFeatures(int r) const { - CHECK(input_features_[r] != NULL); - return *(input_features_[r]); - }; - - const BinaryFeatures &GetLabeledPartFeatures(int r) const { - CHECK(input_labeled_features_[r] != NULL); - return *(input_labeled_features_[r]); - }; - - BinaryFeatures *GetMutablePartFeatures(int r) const { - return input_features_[r]; - }; - - BinaryFeatures *GetMutableLabeledPartFeatures(int r) const { - return input_labeled_features_[r]; - }; - -public: - void AddPredicateFeatures(SemanticInstanceNumeric *sentence, - int r, - int predicate, - int predicate_id); - - void AddArcFeatures(SemanticInstanceNumeric *sentence, - int r, - int predicate, - int argument, - int predicate_id); - - void AddLabeledArcFeatures(SemanticInstanceNumeric *sentence, - int r, - int predicate, - int argument, - int predicate_id); - - void AddArbitrarySiblingFeatures(SemanticInstanceNumeric* sentence, - int r, - int predicate, - int sense, - int first_argument, - int second_argument); - - void AddArbitraryLabeledSiblingFeatures(SemanticInstanceNumeric* sentence, - int r, - int predicate, - int sense, - int first_argument, - int second_argument); - - void AddConsecutiveSiblingFeatures(SemanticInstanceNumeric* sentence, - int r, - int predicate, - int sense, - int first_argument, - int second_argument); - - void AddGrandparentFeatures(SemanticInstanceNumeric* sentence, - int r, - int grandparent_predicate, - int grandparent_sense, - int predicate, - int sense, - int argument); - - void AddCoparentFeatures(SemanticInstanceNumeric* sentence, - int r, - int first_predicate, - int first_sense, - int second_predicate, - int second_sense, - int argument); - - void AddConsecutiveCoparentFeatures(SemanticInstanceNumeric* sentence, - int 
r, - int first_predicate, - int first_sense, - int second_predicate, - int second_sense, - int argument); - - void AddSecondOrderFeatures(SemanticInstanceNumeric* sentence, - int r, - int first_predicate, - int first_sense, - int second_predicate, - int second_sense, - int argument, - bool coparents, - bool consecutive); - -#if 0 - void AddGrandSiblingFeatures(SemanticInstanceNumeric* sentence, - int r, - int grandparent, - int head, - int modifier, - int sibling); - - void AddTriSiblingFeatures(SemanticInstanceNumeric* sentence, - int r, - int head, - int modifier, - int sibling, - int other_sibling); -#endif - -protected: - void AddPredicateFeatures(SemanticInstanceNumeric *sentence, - bool labeled, - uint8_t feature_type, - int r, - int predicate, - int predicate_id); - - void AddArcFeatures(SemanticInstanceNumeric *sentence, - bool labeled, - int r, - int predicate, - int argument, - int predicate_id); - - void AddSiblingFeatures(SemanticInstanceNumeric* sentence, - bool labeled, - int r, - int predicate, - int sense, - int first_argument, - int second_argument, - bool consecutive); - - void AddFeature(uint64_t fkey, BinaryFeatures* features) { - features->push_back(fkey); - } - -protected: - // Vector of input features. - vector input_features_; - // Vector of input features to be conjoined with a label to produce a - // "labeled" feature. - vector input_labeled_features_; - // Encoder that converts features into a codeword. - FeatureEncoder encoder_; -}; - -#endif /* SEMANTICFEATURES_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef SEMANTICFEATURES_H_ +#define SEMANTICFEATURES_H_ + +#include "Features.h" +#include "SemanticInstanceNumeric.h" +#include "FeatureEncoder.h" + +class SemanticOptions; + +class SemanticFeatures : public Features { +public: + SemanticFeatures() {}; + SemanticFeatures(Pipe* pipe) { pipe_ = pipe; } + virtual ~SemanticFeatures() { Clear(); } + +public: + void Clear() { + CHECK_EQ(input_features_.size(), input_labeled_features_.size()); + for (int r = 0; r < input_features_.size(); ++r) { + if (input_features_[r]) { + input_features_[r]->clear(); + delete input_features_[r]; + input_features_[r] = NULL; + } + if (input_labeled_features_[r]) { + input_labeled_features_[r]->clear(); + delete input_labeled_features_[r]; + input_labeled_features_[r] = NULL; + } + } + input_features_.clear(); + input_labeled_features_.clear(); + } + + void Initialize(Instance *instance, Parts *parts) { + Clear(); + input_features_.resize(parts->size(), static_cast(NULL)); + input_labeled_features_.resize(parts->size(), + static_cast(NULL)); + } + + int GetNumPartFeatures(int r) const { + return (NULL == input_features_[r]) ? 0 : (int)(input_features_[r]->size()); + }; + + int GetNumLabeledPartFeatures(int r) const { + return (NULL == input_labeled_features_[r]) ? 
+ 0 : (int)(input_labeled_features_[r]->size()); + }; + + int GetPartFeature(int r, int j) const { + return (*input_features_[r])[j]; + } + + int GetLabeledPartFeature(int r, int j) const { + return (*input_labeled_features_[r])[j]; + } + + const BinaryFeatures &GetPartFeatures(int r) const { + CHECK(input_features_[r] != NULL); + return *(input_features_[r]); + }; + + const BinaryFeatures &GetLabeledPartFeatures(int r) const { + CHECK(input_labeled_features_[r] != NULL); + return *(input_labeled_features_[r]); + }; + + BinaryFeatures *GetMutablePartFeatures(int r) const { + return input_features_[r]; + }; + + BinaryFeatures *GetMutableLabeledPartFeatures(int r) const { + return input_labeled_features_[r]; + }; + +public: + void AddPredicateFeatures(SemanticInstanceNumeric *sentence, + int r, + int predicate, + int predicate_id); + + void AddArcFeatures(SemanticInstanceNumeric *sentence, + int r, + int predicate, + int argument, + int predicate_id); + + void AddLabeledArcFeatures(SemanticInstanceNumeric *sentence, + int r, + int predicate, + int argument, + int predicate_id); + + void AddArbitrarySiblingFeatures(SemanticInstanceNumeric* sentence, + int r, + int predicate, + int sense, + int first_argument, + int second_argument); + + void AddArbitraryLabeledSiblingFeatures(SemanticInstanceNumeric* sentence, + int r, + int predicate, + int sense, + int first_argument, + int second_argument); + + void AddConsecutiveSiblingFeatures(SemanticInstanceNumeric* sentence, + int r, + int predicate, + int sense, + int first_argument, + int second_argument); + + void AddGrandparentFeatures(SemanticInstanceNumeric* sentence, + int r, + int grandparent_predicate, + int grandparent_sense, + int predicate, + int sense, + int argument); + + void AddCoparentFeatures(SemanticInstanceNumeric* sentence, + int r, + int first_predicate, + int first_sense, + int second_predicate, + int second_sense, + int argument); + + void AddConsecutiveCoparentFeatures(SemanticInstanceNumeric* 
sentence, + int r, + int first_predicate, + int first_sense, + int second_predicate, + int second_sense, + int argument); + + void AddSecondOrderFeatures(SemanticInstanceNumeric* sentence, + int r, + int first_predicate, + int first_sense, + int second_predicate, + int second_sense, + int argument, + bool coparents, + bool consecutive); + +#if 0 + void AddGrandSiblingFeatures(SemanticInstanceNumeric* sentence, + int r, + int grandparent, + int head, + int modifier, + int sibling); + + void AddTriSiblingFeatures(SemanticInstanceNumeric* sentence, + int r, + int head, + int modifier, + int sibling, + int other_sibling); +#endif + +protected: + void AddPredicateFeatures(SemanticInstanceNumeric *sentence, + bool labeled, + uint8_t feature_type, + int r, + int predicate, + int predicate_id); + + void AddArcFeatures(SemanticInstanceNumeric *sentence, + bool labeled, + int r, + int predicate, + int argument, + int predicate_id); + + void AddSiblingFeatures(SemanticInstanceNumeric* sentence, + bool labeled, + int r, + int predicate, + int sense, + int first_argument, + int second_argument, + bool consecutive); + + void AddFeature(uint64_t fkey, BinaryFeatures* features) { + features->push_back(fkey); + } + +protected: + // Vector of input features. + vector input_features_; + // Vector of input features to be conjoined with a label to produce a + // "labeled" feature. + vector input_labeled_features_; + // Encoder that converts features into a codeword. + FeatureEncoder encoder_; +}; + +#endif /* SEMANTICFEATURES_H_ */ diff --git a/src/semantic_parser/SemanticInstance.h b/src/semantic_parser/SemanticInstance.h index d4c69c1..36ddb56 100644 --- a/src/semantic_parser/SemanticInstance.h +++ b/src/semantic_parser/SemanticInstance.h @@ -1,96 +1,96 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. 
-// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef SEMANTICINSTANCE_H_ -#define SEMANTICINSTANCE_H_ - -#include -#include -#include "DependencyInstance.h" -#include - -class SemanticInstance : public DependencyInstance { -public: - SemanticInstance() {}; - virtual ~SemanticInstance() {}; - - Instance* Copy() { - SemanticInstance* instance = new SemanticInstance(); - instance->Initialize(name_, forms_, lemmas_, cpostags_, postags_, - feats_, deprels_, heads_, - predicate_names_, predicate_indices_, - argument_roles_, argument_indices_); - return static_cast(instance); - } - - void Initialize(const string &name, - const vector &forms, - const vector &lemmas, - const vector &cpos, - const vector &pos, - const vector > &feats, - const vector &deprels, - const vector &heads, - const vector &predicate_names, - const vector &predicate_indices, - const vector > &argument_roles, - const vector > &argument_indices); - - const string &GetName() { return name_; } - int GetNumPredicates() { return predicate_names_.size(); } - const string &GetPredicateName(int k) { return predicate_names_[k]; } - int GetPredicateIndex(int k) { return predicate_indices_[k]; } - int GetNumArgumentsPredicate(int k) { return argument_roles_[k].size(); } - const string &GetArgumentRole(int k, int l) { return argument_roles_[k][l]; } - int GetArgumentIndex(int k, int l) { return 
argument_indices_[k][l]; } - - void ClearPredicates() { - predicate_names_.clear(); - predicate_indices_.clear(); - for (int p = 0; p < argument_roles_.size(); ++p) { - argument_indices_[p].clear(); - argument_roles_[p].clear(); - } - argument_indices_.clear(); - argument_roles_.clear(); - } - - void AddPredicate(const string& predicate_name, - int predicate_index, - const vector &argument_roles, - const vector &argument_indices) { - predicate_names_.push_back(predicate_name); - predicate_indices_.push_back(predicate_index); - argument_roles_.push_back(argument_roles); - argument_indices_.push_back(argument_indices); - } - -protected: - // Name of the sentence (e.g. "#2000001"). - string name_; - // Names of the predicates (e.g. "take.01"). - vector predicate_names_; - // Positions of each predicate in the sentence. - vector predicate_indices_; - // Labels of each predicate's arguments (semantic roles). - vector > argument_roles_; - // Positions of each predicate's arguments. - vector > argument_indices_; -}; - -#endif /* SEMANTICINSTANCE_H_*/ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef SEMANTICINSTANCE_H_ +#define SEMANTICINSTANCE_H_ + +#include +#include +#include "DependencyInstance.h" +#include + +class SemanticInstance : public DependencyInstance { +public: + SemanticInstance() {}; + virtual ~SemanticInstance() {}; + + Instance* Copy() { + SemanticInstance* instance = new SemanticInstance(); + instance->Initialize(name_, forms_, lemmas_, cpostags_, postags_, + feats_, deprels_, heads_, + predicate_names_, predicate_indices_, + argument_roles_, argument_indices_); + return static_cast(instance); + } + + void Initialize(const string &name, + const vector &forms, + const vector &lemmas, + const vector &cpos, + const vector &pos, + const vector > &feats, + const vector &deprels, + const vector &heads, + const vector &predicate_names, + const vector &predicate_indices, + const vector > &argument_roles, + const vector > &argument_indices); + + const string &GetName() { return name_; } + int GetNumPredicates() { return (int) predicate_names_.size(); } + const string &GetPredicateName(int k) { return predicate_names_[k]; } + int GetPredicateIndex(int k) { return predicate_indices_[k]; } + int GetNumArgumentsPredicate(int k) { return (int) argument_roles_[k].size(); } + const string &GetArgumentRole(int k, int l) { return argument_roles_[k][l]; } + int GetArgumentIndex(int k, int l) { return argument_indices_[k][l]; } + + void ClearPredicates() { + predicate_names_.clear(); + predicate_indices_.clear(); + for (int p = 0; p < argument_roles_.size(); ++p) { + argument_indices_[p].clear(); + argument_roles_[p].clear(); + } + argument_indices_.clear(); + argument_roles_.clear(); + } + + void AddPredicate(const string& predicate_name, + int predicate_index, + const vector &argument_roles, + const vector &argument_indices) { + predicate_names_.push_back(predicate_name); + predicate_indices_.push_back(predicate_index); + argument_roles_.push_back(argument_roles); + argument_indices_.push_back(argument_indices); + } + +protected: + // Name of the 
sentence (e.g. "#2000001"). + string name_; + // Names of the predicates (e.g. "take.01"). + vector predicate_names_; + // Positions of each predicate in the sentence. + vector predicate_indices_; + // Labels of each predicate's arguments (semantic roles). + vector > argument_roles_; + // Positions of each predicate's arguments. + vector > argument_indices_; +}; + +#endif /* SEMANTICINSTANCE_H_*/ diff --git a/src/semantic_parser/SemanticInstanceNumeric.h b/src/semantic_parser/SemanticInstanceNumeric.h index 46be10d..4db148a 100644 --- a/src/semantic_parser/SemanticInstanceNumeric.h +++ b/src/semantic_parser/SemanticInstanceNumeric.h @@ -1,150 +1,150 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef SEMANTICINSTANCENUMERIC_H_ -#define SEMANTICINSTANCENUMERIC_H_ - -#include -#include -#include "Dictionary.h" -#include "DependencyInstanceNumeric.h" -#include "SemanticInstance.h" -#include "SemanticDictionary.h" - -using namespace std; - -class SemanticInstanceNumeric : public DependencyInstanceNumeric { -public: - SemanticInstanceNumeric() {}; - virtual ~SemanticInstanceNumeric() { Clear(); }; - - Instance* Copy() { - CHECK(false) << "Not implemented."; - return NULL; - } - - int size() { return form_ids_.size(); }; - - void Clear() { - DependencyInstanceNumeric::Clear(); - predicate_ids_.clear(); - predicate_indices_.clear(); - for (int j = 0; j < argument_role_ids_.size(); ++j) { - argument_role_ids_[j].clear(); - } - argument_indices_.clear(); - for (int j = 0; j < argument_indices_.size(); ++j) { - argument_indices_[j].clear(); - } - argument_indices_.clear(); - DeleteIndices(); - - // List of dependents, left and right siblings. - for (int h = 0; h < modifiers_.size(); ++h) { - modifiers_[h].clear(); - left_siblings_[h] = -1; - right_siblings_[h] = -1; - } - - // Relation and POS dependency paths. 
- for (int p = 0; p < relation_path_ids_.size(); ++p) { - relation_path_ids_[p].clear(); - pos_path_ids_[p].clear(); - } - relation_path_ids_.clear(); - pos_path_ids_.clear(); - } - - void Initialize(const SemanticDictionary &dictionary, - SemanticInstance *instance); - - void ComputeDependencyInformation(const SemanticDictionary &dictionary, - SemanticInstance *instance); - - bool ComputePassiveVoice(SemanticInstance *instance, int index); - - void DeleteIndices() { - index_predicates_.clear(); - for (int p = 0; p < index_arcs_.size(); ++p) { - index_arcs_[p].clear(); - } - index_arcs_.clear(); - } - - void BuildIndices() { - DeleteIndices(); - int length = size(); - index_predicates_.resize(length, -1); - index_arcs_.resize(length); - for (int p = 0; p < index_arcs_.size(); ++p) { - index_arcs_[p].resize(length, -1); - } - for (int k = 0; k < GetNumPredicates(); ++k) { - int p = GetPredicateIndex(k); - index_predicates_[p] = k; - for (int l = 0; l < GetNumArgumentsPredicate(k); ++l) { - int a = GetArgumentIndex(k, l); - index_arcs_[p][a] = l; - } - } - } - - const vector &GetPredicateIds() const { return predicate_ids_; } - const vector &GetPredicateIndices() const { return predicate_indices_; } - const vector > &GetArgumentRoleIds() const { - return argument_role_ids_; - } - const vector > &GetArgumentIndices() const { - return argument_indices_; - } - - int GetNumPredicates() { return predicate_ids_.size(); } - int GetPredicateId(int k) { return predicate_ids_[k]; } - int GetPredicateIndex(int k) { return predicate_indices_[k]; } - int GetNumArgumentsPredicate(int k) { return argument_role_ids_[k].size(); } - int GetArgumentRoleId(int k, int l) { return argument_role_ids_[k][l]; } - int GetArgumentIndex(int k, int l) { return argument_indices_[k][l]; } - - int FindPredicate(int p) { return index_predicates_[p]; } - int FindArc(int p, int a) { return index_arcs_[p][a]; } - - bool IsPassiveVoice(int p) { return is_passive_voice_[p]; } - const vector 
&GetModifiers(int h) { return modifiers_[h]; } - int GetLeftSibling(int h) { return left_siblings_[h]; } - int GetRightSibling(int h) { return right_siblings_[h]; } - int GetRelationPathId(int p, int a) { return relation_path_ids_[p][a]; } - int GetPosPathId(int p, int a) { return pos_path_ids_[p][a]; } - -private: - vector predicate_ids_; - vector predicate_indices_; - vector > argument_role_ids_; - vector > argument_indices_; - vector > relation_path_ids_; - vector > pos_path_ids_; - - vector index_predicates_; - vector > index_arcs_; - - vector > modifiers_; - vector left_siblings_; - vector right_siblings_; - vector is_passive_voice_; -}; - -#endif /* SEMANTICINSTANCENUMERIC_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef SEMANTICINSTANCENUMERIC_H_ +#define SEMANTICINSTANCENUMERIC_H_ + +#include +#include +#include "Dictionary.h" +#include "DependencyInstanceNumeric.h" +#include "SemanticInstance.h" +#include "SemanticDictionary.h" + +using namespace std; + +class SemanticInstanceNumeric : public DependencyInstanceNumeric { +public: + SemanticInstanceNumeric() {}; + virtual ~SemanticInstanceNumeric() { Clear(); }; + + Instance* Copy() { + CHECK(false) << "Not implemented."; + return NULL; + } + + int size() { return (int)form_ids_.size(); }; + + void Clear() { + DependencyInstanceNumeric::Clear(); + predicate_ids_.clear(); + predicate_indices_.clear(); + for (int j = 0; j < argument_role_ids_.size(); ++j) { + argument_role_ids_[j].clear(); + } + argument_indices_.clear(); + for (int j = 0; j < argument_indices_.size(); ++j) { + argument_indices_[j].clear(); + } + argument_indices_.clear(); + DeleteIndices(); + + // List of dependents, left and right siblings. + for (int h = 0; h < modifiers_.size(); ++h) { + modifiers_[h].clear(); + left_siblings_[h] = -1; + right_siblings_[h] = -1; + } + + // Relation and POS dependency paths. 
+ for (int p = 0; p < relation_path_ids_.size(); ++p) { + relation_path_ids_[p].clear(); + pos_path_ids_[p].clear(); + } + relation_path_ids_.clear(); + pos_path_ids_.clear(); + } + + void Initialize(const SemanticDictionary &dictionary, + SemanticInstance *instance); + + void ComputeDependencyInformation(const SemanticDictionary &dictionary, + SemanticInstance *instance); + + bool ComputePassiveVoice(SemanticInstance *instance, int index); + + void DeleteIndices() { + index_predicates_.clear(); + for (int p = 0; p < index_arcs_.size(); ++p) { + index_arcs_[p].clear(); + } + index_arcs_.clear(); + } + + void BuildIndices() { + DeleteIndices(); + int length = size(); + index_predicates_.resize(length, -1); + index_arcs_.resize(length); + for (int p = 0; p < index_arcs_.size(); ++p) { + index_arcs_[p].resize(length, -1); + } + for (int k = 0; k < GetNumPredicates(); ++k) { + int p = GetPredicateIndex(k); + index_predicates_[p] = k; + for (int l = 0; l < GetNumArgumentsPredicate(k); ++l) { + int a = GetArgumentIndex(k, l); + index_arcs_[p][a] = l; + } + } + } + + const vector &GetPredicateIds() const { return predicate_ids_; } + const vector &GetPredicateIndices() const { return predicate_indices_; } + const vector > &GetArgumentRoleIds() const { + return argument_role_ids_; + } + const vector > &GetArgumentIndices() const { + return argument_indices_; + } + + int GetNumPredicates() { return (int) predicate_ids_.size(); } + int GetPredicateId(int k) { return predicate_ids_[k]; } + int GetPredicateIndex(int k) { return predicate_indices_[k]; } + int GetNumArgumentsPredicate(int k) { return (int) argument_role_ids_[k].size(); } + int GetArgumentRoleId(int k, int l) { return argument_role_ids_[k][l]; } + int GetArgumentIndex(int k, int l) { return argument_indices_[k][l]; } + + int FindPredicate(int p) { return index_predicates_[p]; } + int FindArc(int p, int a) { return index_arcs_[p][a]; } + + bool IsPassiveVoice(int p) { return is_passive_voice_[p]; } + const vector 
&GetModifiers(int h) { return modifiers_[h]; } + int GetLeftSibling(int h) { return left_siblings_[h]; } + int GetRightSibling(int h) { return right_siblings_[h]; } + int GetRelationPathId(int p, int a) { return relation_path_ids_[p][a]; } + int GetPosPathId(int p, int a) { return pos_path_ids_[p][a]; } + +private: + vector predicate_ids_; + vector predicate_indices_; + vector > argument_role_ids_; + vector > argument_indices_; + vector > relation_path_ids_; + vector > pos_path_ids_; + + vector index_predicates_; + vector > index_arcs_; + + vector > modifiers_; + vector left_siblings_; + vector right_siblings_; + vector is_passive_voice_; +}; + +#endif /* SEMANTICINSTANCENUMERIC_H_ */ diff --git a/src/semantic_parser/SemanticPart.h b/src/semantic_parser/SemanticPart.h index 3eca8b4..76567f7 100644 --- a/src/semantic_parser/SemanticPart.h +++ b/src/semantic_parser/SemanticPart.h @@ -1,605 +1,605 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef SEMANTICPART_H_ -#define SEMANTICPART_H_ - -#include -#include -#include "Part.h" - -using namespace std; - -enum { - SEMANTICPART_PREDICATE = 0, - SEMANTICPART_ARC, - SEMANTICPART_LABELEDARC, - SEMANTICPART_ARGUMENT, - SEMANTICPART_SIBLING, - SEMANTICPART_LABELEDSIBLING, - SEMANTICPART_CONSECUTIVESIBLING, - SEMANTICPART_GRANDPARENT, - SEMANTICPART_COPARENT, - SEMANTICPART_CONSECUTIVECOPARENT, - NUM_SEMANTICPARTS -}; - -// Part for an unlabeled arc linking a predicate and an argument word. -class SemanticPartArc : public Part { -public: - SemanticPartArc() { p_ = a_ = s_ = -1; } - SemanticPartArc(int predicate, int argument, int sense) : - p_(predicate), a_(argument), s_(sense) {} - virtual ~SemanticPartArc() {} - -public: - int predicate() { return p_; } - int argument() { return a_; } - int sense() { return s_; } - -public: - int type() { return SEMANTICPART_ARC; } - -private: - int p_; // Index of the predicate. - int a_; // Index of the argument. - int s_; // Predicate sense. -}; - -// Part for a labeled arc linking a predicate and an argument word. -class SemanticPartLabeledArc : public Part { -public: - SemanticPartLabeledArc() { p_ = a_ = s_ = r_ = -1; } - SemanticPartLabeledArc(int predicate, int argument, int sense, int role) : - p_(predicate), a_(argument), s_(sense), r_(role) {} - virtual ~SemanticPartLabeledArc() {} - -public: - int predicate() { return p_; } - int argument() { return a_; } - int sense() { return s_; } - int role() { return r_; } - -public: - int type() { return SEMANTICPART_LABELEDARC; } - -private: - int p_; // Index of the predicate. - int a_; // Index of the argument. - int s_; // Predicate sense. - int r_; // Role label. -}; - -// Part for the event that a word is a predicate. 
-class SemanticPartPredicate : public Part { -public: - SemanticPartPredicate() { p_ = -1; s_ = -1; } - SemanticPartPredicate(int predicate, int sense) : - p_(predicate), s_(sense) {} - virtual ~SemanticPartPredicate() {} - -public: - int predicate() { return p_; } - int sense() { return s_; } - -public: - int type() { return SEMANTICPART_PREDICATE; } - -private: - int p_; // Index of the predicate. - int s_; // Index of the sense. -}; - -// Part for the event that a word is an argument of at least one predicate. -class SemanticPartArgument : public Part { -public: - SemanticPartArgument() { a_ = -1; } - SemanticPartArgument(int argument) : - a_(argument) {} - virtual ~SemanticPartArgument() {} - -public: - int argument() { return a_; } - -public: - int type() { return SEMANTICPART_ARGUMENT; } - -private: - int a_; // Index of the argument. -}; - -class SemanticPartSibling : public Part { -public: - SemanticPartSibling() { p_ = s_ = a1_ = a2_ = -1; }; - SemanticPartSibling(int predicate, int sense, int first_argument, - int second_argument) { - p_ = predicate; - s_ = sense; - a1_ = first_argument; - a2_ = second_argument; - } - virtual ~SemanticPartSibling() {}; - -public: - int type() { return SEMANTICPART_SIBLING; }; - -public: - int predicate() { return p_; }; - int sense() { return s_; }; - int first_argument() { return a1_; }; - int second_argument() { return a2_; }; - -private: - int p_; // Index of the predicate. - int s_; // Index of the sense. - int a1_; // Index of the first argument. - int a2_; // Index of the second_argument. 
-}; - -class SemanticPartConsecutiveSibling : public Part { -public: - SemanticPartConsecutiveSibling() { p_ = s_ = a1_ = a2_ = -1; }; - SemanticPartConsecutiveSibling(int predicate, int sense, int first_argument, - int second_argument) { - p_ = predicate; - s_ = sense; - a1_ = first_argument; - a2_ = second_argument; - } - virtual ~SemanticPartConsecutiveSibling() {}; - -public: - int type() { return SEMANTICPART_CONSECUTIVESIBLING; }; - -public: - int predicate() { return p_; }; - int sense() { return s_; }; - int first_argument() { return a1_; }; - int second_argument() { return a2_; }; - -private: - int p_; // Index of the predicate. - int s_; // Index of the sense. - int a1_; // Index of the first argument (or -1 for a2_ being the first child). - int a2_; // Index of the second_argument (or -1 for a1_ being the last child). -}; - -class SemanticPartGrandparent : public Part { -public: - SemanticPartGrandparent() { g_ = t_ = p_ = s_ = a_ = -1; }; - SemanticPartGrandparent(int grandparent_predicate, int grandparent_sense, - int predicate, int sense, int argument) { - g_ = grandparent_predicate; - t_ = grandparent_sense; - p_ = predicate; - s_ = sense; - a_ = argument; - } - virtual ~SemanticPartGrandparent() {}; - -public: - int type() { return SEMANTICPART_GRANDPARENT; }; - -public: - int grandparent_predicate() { return g_; }; - int grandparent_sense() { return t_; }; - int predicate() { return p_; }; - int sense() { return s_; }; - int argument() { return a_; }; - -private: - int g_; // Index of the grandparent predicate. - int t_; // Index of the grandparent sense. - int p_; // Index of the predicate. - int s_; // Index of the sense. - int a_; // Index of the argument. 
-}; - -class SemanticPartCoparent : public Part { -public: - SemanticPartCoparent() { p1_ = s1_ = p2_ = s2_ = a_ = -1; }; - SemanticPartCoparent(int first_predicate, int first_sense, - int second_predicate, int second_sense, - int argument) { - p1_ = first_predicate; - s1_ = first_sense; - p2_ = second_predicate; - s2_ = second_sense; - a_ = argument; - } - virtual ~SemanticPartCoparent() {}; - -public: - int type() { return SEMANTICPART_COPARENT; }; - -public: - int first_predicate() { return p1_; }; - int first_sense() { return s1_; }; - int second_predicate() { return p2_; }; - int second_sense() { return s2_; }; - int argument() { return a_; }; - -private: - int p1_; // Index of the first predicate. - int s1_; // Index of the first sense. - int p2_; // Index of the second predicate. - int s2_; // Index of the second sense. - int a_; // Index of the argument. -}; - -class SemanticPartConsecutiveCoparent : public Part { -public: - SemanticPartConsecutiveCoparent() { p1_ = s1_ = p2_ = s2_ = a_ = -1; }; - SemanticPartConsecutiveCoparent(int first_predicate, int first_sense, - int second_predicate, int second_sense, - int argument) { - p1_ = first_predicate; - s1_ = first_sense; - p2_ = second_predicate; - s2_ = second_sense; - a_ = argument; - } - virtual ~SemanticPartConsecutiveCoparent() {}; - -public: - int type() { return SEMANTICPART_CONSECUTIVECOPARENT; }; - -public: - int first_predicate() { return p1_; }; - int first_sense() { return s1_; }; - int second_predicate() { return p2_; }; - int second_sense() { return s2_; }; - int argument() { return a_; }; - -private: - int p1_; // Index of the first predicate (or -1 for p2_ being the first). - int s1_; // Index of the first sense. - int p2_; // Index of the second predicate (or -1 for p1_ being the last). - int s2_; // Index of the second sense. - int a_; // Index of the argument. 
-}; - -class SemanticPartLabeledSibling : public Part { -public: - SemanticPartLabeledSibling() { p_ = s_ = a1_ = a2_ = r1_ = r2_ = -1; }; - SemanticPartLabeledSibling(int predicate, int sense, int first_argument, - int second_argument, int first_role, - int second_role) { - p_ = predicate; - s_ = sense; - a1_ = first_argument; - a2_ = second_argument; - r1_ = first_role; - r2_ = second_role; - } - virtual ~SemanticPartLabeledSibling() {}; - -public: - int type() { return SEMANTICPART_LABELEDSIBLING; }; - -public: - int predicate() { return p_; }; - int sense() { return s_; }; - int first_argument() { return a1_; }; - int second_argument() { return a2_; }; - int first_role() { return r1_; }; - int second_role() { return r2_; }; - -private: - int p_; // Index of the predicate. - int s_; // Index of the sense. - int a1_; // Index of the first argument. - int a2_; // Index of the second_argument. - int r1_; // First argument's role label. - int r2_; // Second_argument's role label. -}; - -class SemanticParts : public Parts { -public: - SemanticParts() {}; - virtual ~SemanticParts() { DeleteAll(); }; - - void Initialize() { - DeleteAll(); - for (int i = 0; i < NUM_SEMANTICPARTS; ++i) { - offsets_[i] = -1; - } - for (int r = 0; r < all_labeled_parts_.size(); ++r) { - all_labeled_parts_[r].clear(); - } - all_labeled_parts_.clear(); - } - - Part *CreatePartArc(int predicate, int argument, int sense) { - return new SemanticPartArc(predicate, argument, sense); - } - Part *CreatePartLabeledArc(int predicate, int argument, int sense, int role) { - return new SemanticPartLabeledArc(predicate, argument, sense, role); - } - Part *CreatePartPredicate(int predicate, int sense) { - return new SemanticPartPredicate(predicate, sense); - } - Part *CreatePartArgument(int argument) { - return new SemanticPartArgument(argument); - } - Part *CreatePartSibling(int predicate, - int sense, - int first_argument, - int second_argument) { - return new SemanticPartSibling(predicate, sense, 
first_argument, - second_argument); - } - Part *CreatePartLabeledSibling(int predicate, - int sense, - int first_argument, - int second_argument, - int first_role, - int second_role) { - return new SemanticPartLabeledSibling(predicate, sense, first_argument, - second_argument, first_role, - second_role); - } - Part *CreatePartConsecutiveSibling(int predicate, - int sense, - int first_argument, - int second_argument) { - return new SemanticPartConsecutiveSibling(predicate, sense, first_argument, - second_argument); - } - Part *CreatePartGrandparent(int grandparent_predicate, - int grandparent_sense, - int predicate, - int sense, - int argument) { - return new SemanticPartGrandparent(grandparent_predicate, grandparent_sense, - predicate, sense, argument); - } - Part *CreatePartCoparent(int first_predicate, - int first_sense, - int second_predicate, - int second_sense, - int argument) { - return new SemanticPartCoparent(first_predicate, first_sense, - second_predicate, second_sense, - argument); - } - Part *CreatePartConsecutiveCoparent(int first_predicate, - int first_sense, - int second_predicate, - int second_sense, - int argument) { - return new SemanticPartConsecutiveCoparent(first_predicate, first_sense, - second_predicate, second_sense, - argument); - } - - // Append a part to the array of parts. Return the index. - int AddPart(Part *part) { - int r = size(); - push_back(part); - all_labeled_parts_.push_back(vector(0)); - //LOG(INFO) << "Adding part #" << r << " with type " << part->type(); - CHECK_EQ(size(), all_labeled_parts_.size()); - return r; - } - - // Append a "labeled" part to the array of parts, providing the corresponding - // index of the unlabeled version of the part. Return the index. 
- int AddLabeledPart(Part *part, int unlabeled_part_index) { - int r = AddPart(part); - if (unlabeled_part_index >= 0) { - CHECK_LT(unlabeled_part_index, all_labeled_parts_.size()); - all_labeled_parts_[unlabeled_part_index].push_back(r); - } - return r; - } - - // Resize (necessary after pruning). - void Resize(int num_parts) { - resize(num_parts); - all_labeled_parts_.resize(num_parts); - } - -public: - void DeleteAll(); - -public: - void BuildIndices(int sentence_length, bool labeled); - void DeleteIndices(); - const vector &GetSenses(int predicate) { - return index_senses_[predicate]; - } - // Find an unlabeled arc (fast). - int FindArc(int predicate, int argument, int sense) { - CHECK_GE(predicate, 0); - CHECK_GE(argument, 0); - CHECK_GE(sense, 0); - CHECK_LT(predicate, index_.size()); - CHECK_LT(argument, index_[predicate].size()); - if (sense >= index_[predicate][argument].size()) { - return -1; - } - return index_[predicate][argument][sense]; - } - // Find a labeled arc (this may be rather slow, since we're not indexing the - // labels). - int FindLabeledArc(int predicate, int argument, int sense, int role) { - const vector &index_labeled = FindLabeledArcs(predicate, - argument, - sense); - for (int k = 0; k < index_labeled.size(); ++k) { - SemanticPartLabeledArc *labeled_arc = - static_cast((*this)[index_labeled[k]]); - if (labeled_arc->role() == role) return index_labeled[k]; - } - return -1; - } - // Find all labeled arcs (fast). 
- const vector &FindLabeledArcs(int predicate, int argument, int sense) { - CHECK_GE(predicate, 0); - CHECK_GE(argument, 0); - CHECK_GE(sense, 0); - CHECK_LT(predicate, index_labeled_.size()); - CHECK_LT(argument, index_labeled_[predicate].size()); - CHECK_LT(sense, index_labeled_[predicate][argument].size()); - //if (sense >= index_labeled_[predicate][argument].size()) { - // return -1; - //} - return index_labeled_[predicate][argument][sense]; - } - - // Given an "unlabeled" part (indexed by r), get/set the corresponding indices - // of the labeled parts. - const vector &GetLabeledParts(int r) { - CHECK_GE(r, 0); - CHECK_LT(r, size()); - CHECK_EQ(size(), all_labeled_parts_.size()); - return all_labeled_parts_[r]; - } - void SetLabeledParts(int r, const vector &labeled_parts) { - all_labeled_parts_[r] = labeled_parts; - } - - // True is model is arc-factored, i.e., all parts are predicate parts or - // unlabeled arcs. - // TODO: change this to incorporate predicate parts. - bool IsArcFactored() { - int offset, num_predicate_parts, num_arcs; - GetOffsetPredicate(&offset, &num_predicate_parts); - GetOffsetArc(&offset, &num_arcs); - return (num_predicate_parts + num_arcs == size()); - } - - // True is model is arc-factored, i.e., all parts are unlabeled and labeled - // arcs. - // TODO: change this to incorporate predicate parts. - bool IsLabeledArcFactored() { - int offset, num_predicate_parts, num_arcs, num_labeled_arcs; - GetOffsetPredicate(&offset, &num_predicate_parts); - GetOffsetArc(&offset, &num_arcs); - GetOffsetLabeledArc(&offset, &num_labeled_arcs); - return (num_predicate_parts + num_arcs + num_labeled_arcs == size()); - } - - // Set/Get offsets: - void ClearOffsets() { - for (int i = 0; i < NUM_SEMANTICPARTS; ++i) { - offsets_[i] = -1; - } - } - - void BuildOffsets() { - for (int i = NUM_SEMANTICPARTS - 1; i >= 0; --i) { - if (offsets_[i] < 0 || offsets_[i] > size()) { - offsets_[i] = (i == NUM_SEMANTICPARTS - 1) ? 
size() : offsets_[i + 1]; - } - } - }; - - void SetOffsetLabeledArc(int offset, int size) { - SetOffset(SEMANTICPART_LABELEDARC, offset, size); - }; - void SetOffsetArc(int offset, int size) { - SetOffset(SEMANTICPART_ARC, offset, size); - }; - void SetOffsetPredicate(int offset, int size) { - SetOffset(SEMANTICPART_PREDICATE, offset, size); - }; - void SetOffsetArgument(int offset, int size) { - SetOffset(SEMANTICPART_ARGUMENT, offset, size); - }; - void SetOffsetSibling(int offset, int size) { - SetOffset(SEMANTICPART_SIBLING, offset, size); - }; - void SetOffsetLabeledSibling(int offset, int size) { - SetOffset(SEMANTICPART_LABELEDSIBLING, offset, size); - }; - void SetOffsetConsecutiveSibling(int offset, int size) { - SetOffset(SEMANTICPART_CONSECUTIVESIBLING, offset, size); - }; - void SetOffsetGrandparent(int offset, int size) { - SetOffset(SEMANTICPART_GRANDPARENT, offset, size); - }; - void SetOffsetCoparent(int offset, int size) { - SetOffset(SEMANTICPART_COPARENT, offset, size); - }; - void SetOffsetConsecutiveCoparent(int offset, int size) { - SetOffset(SEMANTICPART_CONSECUTIVECOPARENT, offset, size); - }; - - void GetOffsetLabeledArc(int *offset, int *size) const { - GetOffset(SEMANTICPART_LABELEDARC, offset, size); - }; - void GetOffsetArc(int *offset, int *size) const { - GetOffset(SEMANTICPART_ARC, offset, size); - }; - void GetOffsetPredicate(int *offset, int *size) const { - GetOffset(SEMANTICPART_PREDICATE, offset, size); - }; - void GetOffsetArgument(int *offset, int *size) const { - GetOffset(SEMANTICPART_ARGUMENT, offset, size); - }; - void GetOffsetSibling(int *offset, int *size) const { - GetOffset(SEMANTICPART_SIBLING, offset, size); - }; - void GetOffsetLabeledSibling(int *offset, int *size) const { - GetOffset(SEMANTICPART_LABELEDSIBLING, offset, size); - }; - void GetOffsetConsecutiveSibling(int *offset, int *size) const { - GetOffset(SEMANTICPART_CONSECUTIVESIBLING, offset, size); - }; - void GetOffsetGrandparent(int *offset, int *size) 
const { - GetOffset(SEMANTICPART_GRANDPARENT, offset, size); - }; - void GetOffsetCoparent(int *offset, int *size) const { - GetOffset(SEMANTICPART_COPARENT, offset, size); - }; - void GetOffsetConsecutiveCoparent(int *offset, int *size) const { - GetOffset(SEMANTICPART_CONSECUTIVECOPARENT, offset, size); - }; - -private: - // Get offset from part index. - void GetOffset(int i, int *offset, int *size) const { - *offset = offsets_[i]; - *size = (i < NUM_SEMANTICPARTS - 1) ? offsets_[i + 1] - (*offset) : - SemanticParts::size() - (*offset); - } - - // Set offset from part index. - void SetOffset(int i, int offset, int size) { - offsets_[i] = offset; - if (i < NUM_SEMANTICPARTS - 1) offsets_[i + 1] = offset + size; - } - -private: - // Sense IDs of each predicate. - vector > index_senses_; - // Maps a triple (p, a, s) to a SemanticPartArc index. - vector > > index_; - // Maps a quadruple (p, a, s, r) to a SemanticPartLabeledArc index. - // TODO: maybe replace this index by the general all_labeled_parts_ below? - vector > > > index_labeled_; - // Indices of the labeled parts corresponding to each unlabeled part. - // This vector should have the same size as the number of parts. - vector > all_labeled_parts_; - // Offsets for each part type. - int offsets_[NUM_SEMANTICPARTS]; -}; - -#endif /* SEMANTICPART_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef SEMANTICPART_H_ +#define SEMANTICPART_H_ + +#include +#include +#include "Part.h" + +using namespace std; + +enum { + SEMANTICPART_PREDICATE = 0, + SEMANTICPART_ARC, + SEMANTICPART_LABELEDARC, + SEMANTICPART_ARGUMENT, + SEMANTICPART_SIBLING, + SEMANTICPART_LABELEDSIBLING, + SEMANTICPART_CONSECUTIVESIBLING, + SEMANTICPART_GRANDPARENT, + SEMANTICPART_COPARENT, + SEMANTICPART_CONSECUTIVECOPARENT, + NUM_SEMANTICPARTS +}; + +// Part for an unlabeled arc linking a predicate and an argument word. +class SemanticPartArc : public Part { +public: + SemanticPartArc() { p_ = a_ = s_ = -1; } + SemanticPartArc(int predicate, int argument, int sense) : + p_(predicate), a_(argument), s_(sense) {} + virtual ~SemanticPartArc() {} + +public: + int predicate() { return p_; } + int argument() { return a_; } + int sense() { return s_; } + +public: + int type() { return SEMANTICPART_ARC; } + +private: + int p_; // Index of the predicate. + int a_; // Index of the argument. + int s_; // Predicate sense. +}; + +// Part for a labeled arc linking a predicate and an argument word. +class SemanticPartLabeledArc : public Part { +public: + SemanticPartLabeledArc() { p_ = a_ = s_ = r_ = -1; } + SemanticPartLabeledArc(int predicate, int argument, int sense, int role) : + p_(predicate), a_(argument), s_(sense), r_(role) {} + virtual ~SemanticPartLabeledArc() {} + +public: + int predicate() { return p_; } + int argument() { return a_; } + int sense() { return s_; } + int role() { return r_; } + +public: + int type() { return SEMANTICPART_LABELEDARC; } + +private: + int p_; // Index of the predicate. + int a_; // Index of the argument. + int s_; // Predicate sense. + int r_; // Role label. +}; + +// Part for the event that a word is a predicate. 
+class SemanticPartPredicate : public Part { +public: + SemanticPartPredicate() { p_ = -1; s_ = -1; } + SemanticPartPredicate(int predicate, int sense) : + p_(predicate), s_(sense) {} + virtual ~SemanticPartPredicate() {} + +public: + int predicate() { return p_; } + int sense() { return s_; } + +public: + int type() { return SEMANTICPART_PREDICATE; } + +private: + int p_; // Index of the predicate. + int s_; // Index of the sense. +}; + +// Part for the event that a word is an argument of at least one predicate. +class SemanticPartArgument : public Part { +public: + SemanticPartArgument() { a_ = -1; } + SemanticPartArgument(int argument) : + a_(argument) {} + virtual ~SemanticPartArgument() {} + +public: + int argument() { return a_; } + +public: + int type() { return SEMANTICPART_ARGUMENT; } + +private: + int a_; // Index of the argument. +}; + +class SemanticPartSibling : public Part { +public: + SemanticPartSibling() { p_ = s_ = a1_ = a2_ = -1; }; + SemanticPartSibling(int predicate, int sense, int first_argument, + int second_argument) { + p_ = predicate; + s_ = sense; + a1_ = first_argument; + a2_ = second_argument; + } + virtual ~SemanticPartSibling() {}; + +public: + int type() { return SEMANTICPART_SIBLING; }; + +public: + int predicate() { return p_; }; + int sense() { return s_; }; + int first_argument() { return a1_; }; + int second_argument() { return a2_; }; + +private: + int p_; // Index of the predicate. + int s_; // Index of the sense. + int a1_; // Index of the first argument. + int a2_; // Index of the second_argument. 
+}; + +class SemanticPartConsecutiveSibling : public Part { +public: + SemanticPartConsecutiveSibling() { p_ = s_ = a1_ = a2_ = -1; }; + SemanticPartConsecutiveSibling(int predicate, int sense, int first_argument, + int second_argument) { + p_ = predicate; + s_ = sense; + a1_ = first_argument; + a2_ = second_argument; + } + virtual ~SemanticPartConsecutiveSibling() {}; + +public: + int type() { return SEMANTICPART_CONSECUTIVESIBLING; }; + +public: + int predicate() { return p_; }; + int sense() { return s_; }; + int first_argument() { return a1_; }; + int second_argument() { return a2_; }; + +private: + int p_; // Index of the predicate. + int s_; // Index of the sense. + int a1_; // Index of the first argument (or -1 for a2_ being the first child). + int a2_; // Index of the second_argument (or -1 for a1_ being the last child). +}; + +class SemanticPartGrandparent : public Part { +public: + SemanticPartGrandparent() { g_ = t_ = p_ = s_ = a_ = -1; }; + SemanticPartGrandparent(int grandparent_predicate, int grandparent_sense, + int predicate, int sense, int argument) { + g_ = grandparent_predicate; + t_ = grandparent_sense; + p_ = predicate; + s_ = sense; + a_ = argument; + } + virtual ~SemanticPartGrandparent() {}; + +public: + int type() { return SEMANTICPART_GRANDPARENT; }; + +public: + int grandparent_predicate() { return g_; }; + int grandparent_sense() { return t_; }; + int predicate() { return p_; }; + int sense() { return s_; }; + int argument() { return a_; }; + +private: + int g_; // Index of the grandparent predicate. + int t_; // Index of the grandparent sense. + int p_; // Index of the predicate. + int s_; // Index of the sense. + int a_; // Index of the argument. 
+}; + +class SemanticPartCoparent : public Part { +public: + SemanticPartCoparent() { p1_ = s1_ = p2_ = s2_ = a_ = -1; }; + SemanticPartCoparent(int first_predicate, int first_sense, + int second_predicate, int second_sense, + int argument) { + p1_ = first_predicate; + s1_ = first_sense; + p2_ = second_predicate; + s2_ = second_sense; + a_ = argument; + } + virtual ~SemanticPartCoparent() {}; + +public: + int type() { return SEMANTICPART_COPARENT; }; + +public: + int first_predicate() { return p1_; }; + int first_sense() { return s1_; }; + int second_predicate() { return p2_; }; + int second_sense() { return s2_; }; + int argument() { return a_; }; + +private: + int p1_; // Index of the first predicate. + int s1_; // Index of the first sense. + int p2_; // Index of the second predicate. + int s2_; // Index of the second sense. + int a_; // Index of the argument. +}; + +class SemanticPartConsecutiveCoparent : public Part { +public: + SemanticPartConsecutiveCoparent() { p1_ = s1_ = p2_ = s2_ = a_ = -1; }; + SemanticPartConsecutiveCoparent(int first_predicate, int first_sense, + int second_predicate, int second_sense, + int argument) { + p1_ = first_predicate; + s1_ = first_sense; + p2_ = second_predicate; + s2_ = second_sense; + a_ = argument; + } + virtual ~SemanticPartConsecutiveCoparent() {}; + +public: + int type() { return SEMANTICPART_CONSECUTIVECOPARENT; }; + +public: + int first_predicate() { return p1_; }; + int first_sense() { return s1_; }; + int second_predicate() { return p2_; }; + int second_sense() { return s2_; }; + int argument() { return a_; }; + +private: + int p1_; // Index of the first predicate (or -1 for p2_ being the first). + int s1_; // Index of the first sense. + int p2_; // Index of the second predicate (or -1 for p1_ being the last). + int s2_; // Index of the second sense. + int a_; // Index of the argument. 
+}; + +class SemanticPartLabeledSibling : public Part { +public: + SemanticPartLabeledSibling() { p_ = s_ = a1_ = a2_ = r1_ = r2_ = -1; }; + SemanticPartLabeledSibling(int predicate, int sense, int first_argument, + int second_argument, int first_role, + int second_role) { + p_ = predicate; + s_ = sense; + a1_ = first_argument; + a2_ = second_argument; + r1_ = first_role; + r2_ = second_role; + } + virtual ~SemanticPartLabeledSibling() {}; + +public: + int type() { return SEMANTICPART_LABELEDSIBLING; }; + +public: + int predicate() { return p_; }; + int sense() { return s_; }; + int first_argument() { return a1_; }; + int second_argument() { return a2_; }; + int first_role() { return r1_; }; + int second_role() { return r2_; }; + +private: + int p_; // Index of the predicate. + int s_; // Index of the sense. + int a1_; // Index of the first argument. + int a2_; // Index of the second_argument. + int r1_; // First argument's role label. + int r2_; // Second_argument's role label. +}; + +class SemanticParts : public Parts { +public: + SemanticParts() {}; + virtual ~SemanticParts() { DeleteAll(); }; + + void Initialize() { + DeleteAll(); + for (int i = 0; i < NUM_SEMANTICPARTS; ++i) { + offsets_[i] = -1; + } + for (int r = 0; r < all_labeled_parts_.size(); ++r) { + all_labeled_parts_[r].clear(); + } + all_labeled_parts_.clear(); + } + + Part *CreatePartArc(int predicate, int argument, int sense) { + return new SemanticPartArc(predicate, argument, sense); + } + Part *CreatePartLabeledArc(int predicate, int argument, int sense, int role) { + return new SemanticPartLabeledArc(predicate, argument, sense, role); + } + Part *CreatePartPredicate(int predicate, int sense) { + return new SemanticPartPredicate(predicate, sense); + } + Part *CreatePartArgument(int argument) { + return new SemanticPartArgument(argument); + } + Part *CreatePartSibling(int predicate, + int sense, + int first_argument, + int second_argument) { + return new SemanticPartSibling(predicate, sense, 
first_argument, + second_argument); + } + Part *CreatePartLabeledSibling(int predicate, + int sense, + int first_argument, + int second_argument, + int first_role, + int second_role) { + return new SemanticPartLabeledSibling(predicate, sense, first_argument, + second_argument, first_role, + second_role); + } + Part *CreatePartConsecutiveSibling(int predicate, + int sense, + int first_argument, + int second_argument) { + return new SemanticPartConsecutiveSibling(predicate, sense, first_argument, + second_argument); + } + Part *CreatePartGrandparent(int grandparent_predicate, + int grandparent_sense, + int predicate, + int sense, + int argument) { + return new SemanticPartGrandparent(grandparent_predicate, grandparent_sense, + predicate, sense, argument); + } + Part *CreatePartCoparent(int first_predicate, + int first_sense, + int second_predicate, + int second_sense, + int argument) { + return new SemanticPartCoparent(first_predicate, first_sense, + second_predicate, second_sense, + argument); + } + Part *CreatePartConsecutiveCoparent(int first_predicate, + int first_sense, + int second_predicate, + int second_sense, + int argument) { + return new SemanticPartConsecutiveCoparent(first_predicate, first_sense, + second_predicate, second_sense, + argument); + } + + // Append a part to the array of parts. Return the index. + int AddPart(Part *part) { + int r = (int)size(); + push_back(part); + all_labeled_parts_.push_back(vector(0)); + //LOG(INFO) << "Adding part #" << r << " with type " << part->type(); + CHECK_EQ(size(), all_labeled_parts_.size()); + return r; + } + + // Append a "labeled" part to the array of parts, providing the corresponding + // index of the unlabeled version of the part. Return the index. 
+ int AddLabeledPart(Part *part, int unlabeled_part_index) { + int r = AddPart(part); + if (unlabeled_part_index >= 0) { + CHECK_LT(unlabeled_part_index, all_labeled_parts_.size()); + all_labeled_parts_[unlabeled_part_index].push_back(r); + } + return r; + } + + // Resize (necessary after pruning). + void Resize(int num_parts) { + resize(num_parts); + all_labeled_parts_.resize(num_parts); + } + +public: + void DeleteAll(); + +public: + void BuildIndices(int sentence_length, bool labeled); + void DeleteIndices(); + const vector &GetSenses(int predicate) { + return index_senses_[predicate]; + } + // Find an unlabeled arc (fast). + int FindArc(int predicate, int argument, int sense) { + CHECK_GE(predicate, 0); + CHECK_GE(argument, 0); + CHECK_GE(sense, 0); + CHECK_LT(predicate, index_.size()); + CHECK_LT(argument, index_[predicate].size()); + if (sense >= index_[predicate][argument].size()) { + return -1; + } + return index_[predicate][argument][sense]; + } + // Find a labeled arc (this may be rather slow, since we're not indexing the + // labels). + int FindLabeledArc(int predicate, int argument, int sense, int role) { + const vector &index_labeled = FindLabeledArcs(predicate, + argument, + sense); + for (int k = 0; k < index_labeled.size(); ++k) { + SemanticPartLabeledArc *labeled_arc = + static_cast((*this)[index_labeled[k]]); + if (labeled_arc->role() == role) return index_labeled[k]; + } + return -1; + } + // Find all labeled arcs (fast). 
+ const vector &FindLabeledArcs(int predicate, int argument, int sense) { + CHECK_GE(predicate, 0); + CHECK_GE(argument, 0); + CHECK_GE(sense, 0); + CHECK_LT(predicate, index_labeled_.size()); + CHECK_LT(argument, index_labeled_[predicate].size()); + CHECK_LT(sense, index_labeled_[predicate][argument].size()); + //if (sense >= index_labeled_[predicate][argument].size()) { + // return -1; + //} + return index_labeled_[predicate][argument][sense]; + } + + // Given an "unlabeled" part (indexed by r), get/set the corresponding indices + // of the labeled parts. + const vector &GetLabeledParts(int r) { + CHECK_GE(r, 0); + CHECK_LT(r, size()); + CHECK_EQ(size(), all_labeled_parts_.size()); + return all_labeled_parts_[r]; + } + void SetLabeledParts(int r, const vector &labeled_parts) { + all_labeled_parts_[r] = labeled_parts; + } + + // True is model is arc-factored, i.e., all parts are predicate parts or + // unlabeled arcs. + // TODO: change this to incorporate predicate parts. + bool IsArcFactored() { + int offset, num_predicate_parts, num_arcs; + GetOffsetPredicate(&offset, &num_predicate_parts); + GetOffsetArc(&offset, &num_arcs); + return (num_predicate_parts + num_arcs == size()); + } + + // True is model is arc-factored, i.e., all parts are unlabeled and labeled + // arcs. + // TODO: change this to incorporate predicate parts. + bool IsLabeledArcFactored() { + int offset, num_predicate_parts, num_arcs, num_labeled_arcs; + GetOffsetPredicate(&offset, &num_predicate_parts); + GetOffsetArc(&offset, &num_arcs); + GetOffsetLabeledArc(&offset, &num_labeled_arcs); + return (num_predicate_parts + num_arcs + num_labeled_arcs == size()); + } + + // Set/Get offsets: + void ClearOffsets() { + for (int i = 0; i < NUM_SEMANTICPARTS; ++i) { + offsets_[i] = -1; + } + } + + void BuildOffsets() { + for (int i = NUM_SEMANTICPARTS - 1; i >= 0; --i) { + if (offsets_[i] < 0 || offsets_[i] > size()) { + offsets_[i] = (i == NUM_SEMANTICPARTS - 1) ? 
(int)size() : offsets_[i + 1]; + } + } + }; + + void SetOffsetLabeledArc(int offset, int size) { + SetOffset(SEMANTICPART_LABELEDARC, offset, size); + }; + void SetOffsetArc(int offset, int size) { + SetOffset(SEMANTICPART_ARC, offset, size); + }; + void SetOffsetPredicate(int offset, int size) { + SetOffset(SEMANTICPART_PREDICATE, offset, size); + }; + void SetOffsetArgument(int offset, int size) { + SetOffset(SEMANTICPART_ARGUMENT, offset, size); + }; + void SetOffsetSibling(int offset, int size) { + SetOffset(SEMANTICPART_SIBLING, offset, size); + }; + void SetOffsetLabeledSibling(int offset, int size) { + SetOffset(SEMANTICPART_LABELEDSIBLING, offset, size); + }; + void SetOffsetConsecutiveSibling(int offset, int size) { + SetOffset(SEMANTICPART_CONSECUTIVESIBLING, offset, size); + }; + void SetOffsetGrandparent(int offset, int size) { + SetOffset(SEMANTICPART_GRANDPARENT, offset, size); + }; + void SetOffsetCoparent(int offset, int size) { + SetOffset(SEMANTICPART_COPARENT, offset, size); + }; + void SetOffsetConsecutiveCoparent(int offset, int size) { + SetOffset(SEMANTICPART_CONSECUTIVECOPARENT, offset, size); + }; + + void GetOffsetLabeledArc(int *offset, int *size) const { + GetOffset(SEMANTICPART_LABELEDARC, offset, size); + }; + void GetOffsetArc(int *offset, int *size) const { + GetOffset(SEMANTICPART_ARC, offset, size); + }; + void GetOffsetPredicate(int *offset, int *size) const { + GetOffset(SEMANTICPART_PREDICATE, offset, size); + }; + void GetOffsetArgument(int *offset, int *size) const { + GetOffset(SEMANTICPART_ARGUMENT, offset, size); + }; + void GetOffsetSibling(int *offset, int *size) const { + GetOffset(SEMANTICPART_SIBLING, offset, size); + }; + void GetOffsetLabeledSibling(int *offset, int *size) const { + GetOffset(SEMANTICPART_LABELEDSIBLING, offset, size); + }; + void GetOffsetConsecutiveSibling(int *offset, int *size) const { + GetOffset(SEMANTICPART_CONSECUTIVESIBLING, offset, size); + }; + void GetOffsetGrandparent(int *offset, int 
*size) const { + GetOffset(SEMANTICPART_GRANDPARENT, offset, size); + }; + void GetOffsetCoparent(int *offset, int *size) const { + GetOffset(SEMANTICPART_COPARENT, offset, size); + }; + void GetOffsetConsecutiveCoparent(int *offset, int *size) const { + GetOffset(SEMANTICPART_CONSECUTIVECOPARENT, offset, size); + }; + +private: + // Get offset from part index. + void GetOffset(int i, int *offset, int *size) const { + *offset = offsets_[i]; + *size = (i < NUM_SEMANTICPARTS - 1) ? offsets_[i + 1] - (*offset) : + (int)SemanticParts::size() - (*offset); + } + + // Set offset from part index. + void SetOffset(int i, int offset, int size) { + offsets_[i] = offset; + if (i < NUM_SEMANTICPARTS - 1) offsets_[i + 1] = offset + size; + } + +private: + // Sense IDs of each predicate. + vector > index_senses_; + // Maps a triple (p, a, s) to a SemanticPartArc index. + vector > > index_; + // Maps a quadruple (p, a, s, r) to a SemanticPartLabeledArc index. + // TODO: maybe replace this index by the general all_labeled_parts_ below? + vector > > > index_labeled_; + // Indices of the labeled parts corresponding to each unlabeled part. + // This vector should have the same size as the number of parts. + vector > all_labeled_parts_; + // Offsets for each part type. + int offsets_[NUM_SEMANTICPARTS]; +}; + +#endif /* SEMANTICPART_H_ */ diff --git a/src/semantic_parser/SemanticPipe.cpp b/src/semantic_parser/SemanticPipe.cpp index 286c7d2..af6ca50 100644 --- a/src/semantic_parser/SemanticPipe.cpp +++ b/src/semantic_parser/SemanticPipe.cpp @@ -1,1889 +1,1887 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. 
-// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "SemanticPipe.h" -#include -#include -#include -#include -#ifndef _WIN32 -#include -#else -#include -#endif -using namespace std; - -// Define the current model version and the oldest back-compatible version. -// The format is AAAA.BBBB.CCCC, e.g., 2 0003 0000 means "2.3.0". -const uint64_t kSemanticParserModelVersion = 200030000; -const uint64_t kOldestCompatibleSemanticParserModelVersion = 200030000; -const uint64_t kSemanticParserModelCheck = 1234567890; - -DEFINE_bool(use_only_labeled_arc_features, true, - "True for not using unlabeled arc features in addition to labeled ones."); -DEFINE_bool(use_only_labeled_sibling_features, false, //true, - "True for not using unlabeled sibling features in addition to labeled ones."); -DEFINE_bool(use_labeled_sibling_features, false, //true, - "True for using labels in sibling features."); - -void SemanticPipe::SaveModel(FILE* fs) { - bool success; - success = WriteUINT64(fs, kSemanticParserModelCheck); - CHECK(success); - success = WriteUINT64(fs, kSemanticParserModelVersion); - CHECK(success); - token_dictionary_->Save(fs); - dependency_dictionary_->Save(fs); - Pipe::SaveModel(fs); - pruner_parameters_->Save(fs); -} - -void SemanticPipe::LoadModel(FILE* fs) { - bool success; - uint64_t model_check; - uint64_t model_version; - success = ReadUINT64(fs, &model_check); - CHECK(success); - CHECK_EQ(model_check, kSemanticParserModelCheck) - << "The model file is too old and not supported anymore."; - success = ReadUINT64(fs, &model_version); - CHECK(success); - CHECK_GE(model_version, kOldestCompatibleSemanticParserModelVersion) - << 
"The model file is too old and not supported anymore."; - delete token_dictionary_; - CreateTokenDictionary(); - static_cast(dictionary_)-> - SetTokenDictionary(token_dictionary_); - token_dictionary_->Load(fs); - CreateDependencyDictionary(); - dependency_dictionary_->SetTokenDictionary(token_dictionary_); - static_cast(dictionary_)-> - SetDependencyDictionary(dependency_dictionary_); - dependency_dictionary_->Load(fs); - Pipe::LoadModel(fs); - pruner_parameters_->Load(fs); -} - -void SemanticPipe::LoadPrunerModel(FILE* fs) { - LOG(INFO) << "Loading pruner model..."; - // This will be ignored but must be passed to the pruner pipe constructor, - // so that when loading the pruner model the actual options are not - // overwritten. - SemanticOptions pruner_options; // = *options_; - SemanticPipe* pipe = new SemanticPipe(&pruner_options); - //SemanticPipe* pipe = new SemanticPipe(options_); - pipe->Initialize(); - pipe->LoadModel(fs); - delete pruner_parameters_; - pruner_parameters_ = pipe->parameters_; - pipe->parameters_ = NULL; - delete pipe; - LOG(INFO) << "Done."; -} - -void SemanticPipe::LoadPrunerModelByName(const string &model_name) { - FILE *fs = fopen(model_name.c_str(), "rb"); - CHECK(fs) << "Could not open pruner model file for reading: " << model_name; - LoadPrunerModel(fs); - fclose(fs); -} - -void SemanticPipe::PreprocessData() { - delete token_dictionary_; - CreateTokenDictionary(); - static_cast(dictionary_)->SetTokenDictionary(token_dictionary_); - static_cast(token_dictionary_)->Initialize(GetSemanticReader()); - delete dependency_dictionary_; - CreateDependencyDictionary(); - dependency_dictionary_->SetTokenDictionary(token_dictionary_); - static_cast(dictionary_)->SetDependencyDictionary(dependency_dictionary_); - dependency_dictionary_->CreateLabelDictionary(GetSemanticReader()); - static_cast(dictionary_)->CreatePredicateRoleDictionaries(GetSemanticReader()); -} - -void SemanticPipe::ComputeScores(Instance *instance, Parts *parts, - Features 
*features, - bool pruner, - vector *scores) { - Parameters *parameters; - SemanticDictionary *semantic_dictionary = - static_cast(dictionary_); - SemanticFeatures *semantic_features = - static_cast(features); - if (pruner) { - parameters = pruner_parameters_; - } else { - parameters = parameters_; - } - scores->resize(parts->size()); - SemanticParts *semantic_parts = static_cast(parts); - for (int r = 0; r < parts->size(); ++r) { - bool has_unlabeled_features = - (semantic_features->GetNumPartFeatures(r) > 0); - bool has_labeled_features = - (semantic_features->GetNumLabeledPartFeatures(r) > 0); - - if (pruner) CHECK((*parts)[r]->type() == SEMANTICPART_ARC || - (*parts)[r]->type() == SEMANTICPART_PREDICATE); - if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; - if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; - - // Compute scores for the unlabeled features. - if (has_unlabeled_features) { - const BinaryFeatures &part_features = - semantic_features->GetPartFeatures(r); - (*scores)[r] = parameters->ComputeScore(part_features); - } else { - (*scores)[r] = 0.0; - } - - // Compute scores for the labeled features. - if ((*parts)[r]->type() == SEMANTICPART_ARC && !pruner && - GetSemanticOptions()->labeled()) { - // Labeled arcs will be treated by looking at the unlabeled arcs and - // conjoining with the label. 
- CHECK(has_labeled_features); - SemanticPartArc *arc = static_cast((*parts)[r]); - const vector &index_labeled_parts = - semantic_parts->FindLabeledArcs(arc->predicate(), - arc->argument(), - arc->sense()); - vector allowed_labels(index_labeled_parts.size()); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - CHECK_GE(index_labeled_parts[k], 0); - CHECK_LT(index_labeled_parts[k], parts->size()); - SemanticPartLabeledArc *labeled_arc = - static_cast( - (*parts)[index_labeled_parts[k]]); - CHECK(labeled_arc != NULL); - allowed_labels[k] = labeled_arc->role(); - } - vector label_scores; - const BinaryFeatures &part_features = - semantic_features->GetLabeledPartFeatures(r); - parameters->ComputeLabelScores(part_features, allowed_labels, - &label_scores); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - (*scores)[index_labeled_parts[k]] = label_scores[k]; - } - } else if ((*parts)[r]->type() == SEMANTICPART_SIBLING && - has_labeled_features) { - // Labeled siblings will be treated by looking at the unlabeled ones and - // conjoining with the label. 
- CHECK(!pruner); - CHECK(GetSemanticOptions()->labeled()); - SemanticPartSibling *sibling = - static_cast((*parts)[r]); - const vector &index_labeled_parts = - semantic_parts->GetLabeledParts(r); - vector bigram_labels(index_labeled_parts.size()); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - CHECK_GE(index_labeled_parts[k], 0); - CHECK_LT(index_labeled_parts[k], parts->size()); - SemanticPartLabeledSibling *labeled_sibling = - static_cast( - (*parts)[index_labeled_parts[k]]); - CHECK(labeled_sibling != NULL); - bigram_labels[k] = semantic_dictionary->GetRoleBigramLabel( - labeled_sibling->first_role(), - labeled_sibling->second_role()); - } - vector label_scores; - const BinaryFeatures &part_features = - semantic_features->GetLabeledPartFeatures(r); - parameters->ComputeLabelScores(part_features, bigram_labels, - &label_scores); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - (*scores)[index_labeled_parts[k]] = label_scores[k]; - } - } - } -} - -void SemanticPipe::RemoveUnsupportedFeatures(Instance *instance, Parts *parts, - bool pruner, - const vector &selected_parts, - Features *features) { - Parameters *parameters; - SemanticFeatures *semantic_features = - static_cast(features); - if (pruner) { - parameters = pruner_parameters_; - } else { - parameters = parameters_; - } - - for (int r = 0; r < parts->size(); ++r) { - // TODO: Make sure we can do this continue for the labeled parts... - if (!selected_parts[r]) continue; - - bool has_unlabeled_features = - (semantic_features->GetNumPartFeatures(r) > 0); - bool has_labeled_features = - (semantic_features->GetNumLabeledPartFeatures(r) > 0); - - if (pruner) CHECK((*parts)[r]->type() == SEMANTICPART_ARC || - (*parts)[r]->type() == SEMANTICPART_PREDICATE); - - // TODO(atm): I think this is handling the case there can be labeled - // features, but was never tested. - CHECK(!has_labeled_features); - - // Skip labeled arcs, as they use the features from unlabeled arcs. 
- if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; - if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; - - if (has_unlabeled_features) { - BinaryFeatures *part_features = - semantic_features->GetMutablePartFeatures(r); - int num_supported = 0; - for (int j = 0; j < part_features->size(); ++j) { - if (parameters->Exists((*part_features)[j])) { - (*part_features)[num_supported] = (*part_features)[j]; - ++num_supported; - } - } - part_features->resize(num_supported); - } - - if (has_labeled_features) { - BinaryFeatures *part_features = - semantic_features->GetMutableLabeledPartFeatures(r); - int num_supported = 0; - for (int j = 0; j < part_features->size(); ++j) { - if (parameters->ExistsLabeled((*part_features)[j])) { - (*part_features)[num_supported] = (*part_features)[j]; - ++num_supported; - } - } - part_features->resize(num_supported); - } - } -} - -void SemanticPipe::MakeGradientStep(Parts *parts, - Features *features, - double eta, - int iteration, - const vector &gold_output, - const vector &predicted_output) { - SemanticParts *semantic_parts = static_cast(parts); - SemanticDictionary *semantic_dictionary = - static_cast(dictionary_); - SemanticFeatures *semantic_features = - static_cast(features); - Parameters *parameters = GetTrainingParameters(); - - for (int r = 0; r < parts->size(); ++r) { - bool has_unlabeled_features = - (semantic_features->GetNumPartFeatures(r) > 0); - bool has_labeled_features = - (semantic_features->GetNumLabeledPartFeatures(r) > 0); - - if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; - if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; - - // Make updates for the unlabeled features. 
- if (has_unlabeled_features) { - if (predicted_output[r] != gold_output[r]) { - const BinaryFeatures &part_features = - semantic_features->GetPartFeatures(r); - parameters->MakeGradientStep(part_features, eta, iteration, - predicted_output[r] - gold_output[r]); - } - } - - // Make updates for the labeled features. - if ((*parts)[r]->type() == SEMANTICPART_ARC && has_labeled_features) { - // Labeled arcs will be treated by looking at the unlabeled arcs and - // conjoining with the label. - CHECK(has_labeled_features); - const BinaryFeatures &part_features = - semantic_features->GetLabeledPartFeatures(r); - SemanticPartArc *arc = static_cast((*parts)[r]); - const vector &index_labeled_parts = - semantic_parts->FindLabeledArcs(arc->predicate(), - arc->argument(), - arc->sense()); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - int index_part = index_labeled_parts[k]; - CHECK_GE(index_part, 0); - CHECK_LT(index_part, parts->size()); - SemanticPartLabeledArc *labeled_arc = - static_cast((*parts)[index_part]); - CHECK(labeled_arc != NULL); - double value = predicted_output[index_part] - gold_output[index_part]; - if (value != 0.0) { - parameters->MakeLabelGradientStep(part_features, eta, iteration, - labeled_arc->role(), - value); - } - } - } else if ((*parts)[r]->type() == SEMANTICPART_SIBLING && - has_labeled_features) { - // Labeled siblings will be treated by looking at the unlabeled ones and - // conjoining with the label. 
- CHECK(GetSemanticOptions()->labeled()); - const BinaryFeatures &part_features = - semantic_features->GetLabeledPartFeatures(r); - SemanticPartSibling *sibling = - static_cast((*parts)[r]); - const vector &index_labeled_parts = - semantic_parts->GetLabeledParts(r); - vector bigram_labels(index_labeled_parts.size()); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - int index_part = index_labeled_parts[k]; - CHECK_GE(index_part, 0); - CHECK_LT(index_part, parts->size()); - SemanticPartLabeledSibling *labeled_sibling = - static_cast( - (*parts)[index_part]); - CHECK(labeled_sibling != NULL); - int bigram_label = semantic_dictionary->GetRoleBigramLabel( - labeled_sibling->first_role(), - labeled_sibling->second_role()); - double value = predicted_output[index_part] - gold_output[index_part]; - if (value != 0.0) { - parameters->MakeLabelGradientStep(part_features, eta, iteration, - bigram_label, value); - } - } - } - } -} - -void SemanticPipe::TouchParameters(Parts *parts, Features *features, - const vector &selected_parts) { - SemanticParts *semantic_parts = static_cast(parts); - SemanticDictionary *semantic_dictionary = - static_cast(dictionary_); - SemanticFeatures *semantic_features = - static_cast(features); - Parameters *parameters = GetTrainingParameters(); - - for (int r = 0; r < parts->size(); ++r) { - // TODO: Make sure we can do this continue for the labeled parts... - if (!selected_parts[r]) continue; - - bool has_unlabeled_features = - (semantic_features->GetNumPartFeatures(r) > 0); - bool has_labeled_features = - (semantic_features->GetNumLabeledPartFeatures(r) > 0); - - if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; - if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; - - // Make updates for the unlabeled features. 
- if (has_unlabeled_features) { - const BinaryFeatures &part_features = - semantic_features->GetPartFeatures(r); - parameters->MakeGradientStep(part_features, 0.0, 0, 0.0); - } - - // Make updates for the labeled features. - if ((*parts)[r]->type() == SEMANTICPART_ARC && has_labeled_features) { - // Labeled arcs will be treated by looking at the unlabeled arcs and - // conjoining with the label. - CHECK(has_labeled_features); - const BinaryFeatures &part_features = - semantic_features->GetLabeledPartFeatures(r); - SemanticPartArc *arc = static_cast((*parts)[r]); - const vector &index_labeled_parts = - semantic_parts->FindLabeledArcs(arc->predicate(), - arc->argument(), - arc->sense()); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - int index_part = index_labeled_parts[k]; - CHECK_GE(index_part, 0); - CHECK_LT(index_part, parts->size()); - SemanticPartLabeledArc *labeled_arc = - static_cast((*parts)[index_part]); - CHECK(labeled_arc != NULL); - parameters->MakeLabelGradientStep(part_features, 0.0, 0, - labeled_arc->role(), 0.0); - } - } else if ((*parts)[r]->type() == SEMANTICPART_SIBLING && - has_labeled_features) { - // Labeled siblings will be treated by looking at the unlabeled ones and - // conjoining with the label. 
- CHECK(GetSemanticOptions()->labeled()); - const BinaryFeatures &part_features = - semantic_features->GetLabeledPartFeatures(r); - SemanticPartSibling *sibling = - static_cast((*parts)[r]); - const vector &index_labeled_parts = - semantic_parts->GetLabeledParts(r); - vector bigram_labels(index_labeled_parts.size()); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - int index_part = index_labeled_parts[k]; - CHECK_GE(index_part, 0); - CHECK_LT(index_part, parts->size()); - SemanticPartLabeledSibling *labeled_sibling = - static_cast( - (*parts)[index_part]); - CHECK(labeled_sibling != NULL); - int bigram_label = semantic_dictionary->GetRoleBigramLabel( - labeled_sibling->first_role(), - labeled_sibling->second_role()); - parameters->MakeLabelGradientStep(part_features, 0.0, 0, - bigram_label, 0.0); - } - } - } -} - -void SemanticPipe::MakeFeatureDifference(Parts *parts, - Features *features, - const vector &gold_output, - const vector &predicted_output, - FeatureVector *difference) { - SemanticParts *semantic_parts = static_cast(parts); - SemanticDictionary *semantic_dictionary = - static_cast(dictionary_); - SemanticFeatures *semantic_features = - static_cast(features); - - for (int r = 0; r < parts->size(); ++r) { - bool has_unlabeled_features = - (semantic_features->GetNumPartFeatures(r) > 0); - bool has_labeled_features = - (semantic_features->GetNumLabeledPartFeatures(r) > 0); - - if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; - if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; - - // Compute feature difference for the unlabeled features. - if (has_unlabeled_features) { - if (predicted_output[r] != gold_output[r]) { - const BinaryFeatures &part_features = - semantic_features->GetPartFeatures(r); - for (int j = 0; j < part_features.size(); ++j) { - difference->mutable_weights()->Add(part_features[j], - predicted_output[r] - - gold_output[r]); - } - } - } - - // Make updates for the labeled features. 
- if ((*parts)[r]->type() == SEMANTICPART_ARC && has_labeled_features) { - // Labeled arcs will be treated by looking at the unlabeled arcs and - // conjoining with the label. - CHECK(has_labeled_features); - const BinaryFeatures &part_features = - semantic_features->GetLabeledPartFeatures(r); - SemanticPartArc *arc = static_cast((*parts)[r]); - const vector &index_labeled_parts = - semantic_parts->FindLabeledArcs(arc->predicate(), - arc->argument(), - arc->sense()); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - int index_part = index_labeled_parts[k]; - CHECK_GE(index_part, 0); - CHECK_LT(index_part, parts->size()); - SemanticPartLabeledArc *labeled_arc = - static_cast((*parts)[index_part]); - CHECK(labeled_arc != NULL); - double value = predicted_output[index_part] - gold_output[index_part]; - if (value != 0.0) { - for (int j = 0; j < part_features.size(); ++j) { - difference->mutable_labeled_weights()->Add(part_features[j], - labeled_arc->role(), - value); - } - } - } - } else if ((*parts)[r]->type() == SEMANTICPART_SIBLING && - has_labeled_features) { - // Labeled siblings will be treated by looking at the unlabeled ones and - // conjoining with the label. 
- CHECK(GetSemanticOptions()->labeled()); - const BinaryFeatures &part_features = - semantic_features->GetLabeledPartFeatures(r); - SemanticPartSibling *sibling = - static_cast((*parts)[r]); - const vector &index_labeled_parts = - semantic_parts->GetLabeledParts(r); - vector bigram_labels(index_labeled_parts.size()); - for (int k = 0; k < index_labeled_parts.size(); ++k) { - int index_part = index_labeled_parts[k]; - CHECK_GE(index_part, 0); - CHECK_LT(index_part, parts->size()); - SemanticPartLabeledSibling *labeled_sibling = - static_cast( - (*parts)[index_part]); - CHECK(labeled_sibling != NULL); - int bigram_label = semantic_dictionary->GetRoleBigramLabel( - labeled_sibling->first_role(), - labeled_sibling->second_role()); - double value = predicted_output[index_part] - gold_output[index_part]; - if (value != 0.0) { - for (int j = 0; j < part_features.size(); ++j) { - difference->mutable_labeled_weights()->Add(part_features[j], - bigram_label, - value); - } - } - } - } - } -} - -void SemanticPipe::MakeParts(Instance *instance, - Parts *parts, - vector *gold_outputs) { - int sentence_length = - static_cast(instance)->size(); - SemanticParts *semantic_parts = static_cast(parts); - semantic_parts->Initialize(); - bool make_gold = (gold_outputs != NULL); - if (make_gold) gold_outputs->clear(); - - if (train_pruner_) { - // For the pruner, make only unlabeled arc-factored and predicate parts and - // compute indices. - MakePartsBasic(instance, false, parts, gold_outputs); - semantic_parts->BuildOffsets(); - semantic_parts->BuildIndices(sentence_length, false); - } else { - // Make arc-factored and predicate parts and compute indices. - MakePartsBasic(instance, parts, gold_outputs); - semantic_parts->BuildOffsets(); - semantic_parts->BuildIndices(sentence_length, - GetSemanticOptions()->labeled()); - - // Make global parts. 
- MakePartsGlobal(instance, parts, gold_outputs); - semantic_parts->BuildOffsets(); - } -} - -void SemanticPipe::MakePartsBasic(Instance *instance, - Parts *parts, - vector *gold_outputs) { - int sentence_length = - static_cast(instance)->size(); - SemanticParts *semantic_parts = static_cast(parts); - - MakePartsBasic(instance, false, parts, gold_outputs); - semantic_parts->BuildOffsets(); - semantic_parts->BuildIndices(sentence_length, false); - - // Prune using a basic first-order model. - if (GetSemanticOptions()->prune_basic()) { - if (options_->train()) { - Prune(instance, parts, gold_outputs, true); - } else { - Prune(instance, parts, gold_outputs, false); - } - semantic_parts->BuildOffsets(); - semantic_parts->BuildIndices(sentence_length, false); - } - - if (GetSemanticOptions()->labeled()) { - MakePartsBasic(instance, true, parts, gold_outputs); - } -} - -void SemanticPipe::MakePartsBasic(Instance *instance, - bool add_labeled_parts, - Parts *parts, - vector *gold_outputs) { - SemanticInstanceNumeric *sentence = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); - SemanticOptions *semantic_options = GetSemanticOptions(); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - bool prune_labels = semantic_options->prune_labels(); - bool prune_labels_with_relation_paths = - semantic_options->prune_labels_with_relation_paths(); - bool prune_labels_with_senses = semantic_options->prune_labels_with_senses(); - bool prune_distances = semantic_options->prune_distances(); - bool allow_self_loops = semantic_options->allow_self_loops(); - bool allow_root_predicate = semantic_options->allow_root_predicate(); - bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); - bool use_predicate_senses = semantic_options->use_predicate_senses(); - vector allowed_labels; - - if (add_labeled_parts && !prune_labels) { - 
allowed_labels.resize(semantic_dictionary->GetRoleAlphabet().size()); - for (int i = 0; i < allowed_labels.size(); ++i) { - allowed_labels[i] = i; - } - } - - // Add predicate parts. - int num_parts_initial = semantic_parts->size(); - if (!add_labeled_parts) { - for (int p = 0; p < sentence_length; ++p) { - if (p == 0 && !allow_root_predicate) continue; - int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - for (int s = 0; s < predicates->size(); ++s) { - Part *part = semantic_parts->CreatePartPredicate(p, s); - semantic_parts->AddPart(part); - if (make_gold) { - bool is_gold = false; - int k = sentence->FindPredicate(p); - if (k >= 0) { - int predicate_id = sentence->GetPredicateId(k); - if (!use_predicate_senses) { - CHECK_EQ((*predicates)[s]->id(), PREDICATE_UNKNOWN); - } - if (predicate_id < 0 || (*predicates)[s]->id() == predicate_id) { - is_gold = true; - } - } - if (is_gold) { - gold_outputs->push_back(1.0); - } else { - gold_outputs->push_back(0.0); - } - } - } - } - - // Compute offsets for predicate parts. - semantic_parts->SetOffsetPredicate(num_parts_initial, - semantic_parts->size() - num_parts_initial); - } - - // Add unlabeled/labeled arc parts. 
- num_parts_initial = semantic_parts->size(); - for (int p = 0; p < sentence_length; ++p) { - if (p == 0 && !allow_root_predicate) continue; - int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - for (int a = 1; a < sentence_length; ++a) { - if (!allow_self_loops && p == a) continue; - for (int s = 0; s < predicates->size(); ++s) { - int arc_index = -1; - if (add_labeled_parts) { - // If no unlabeled arc is there, just skip it. - // This happens if that arc was pruned out. - arc_index = semantic_parts->FindArc(p, a, s); - if (0 > arc_index) { - continue; - } - } else { - if (prune_distances) { - int predicate_pos_id = sentence->GetPosId(p); - int argument_pos_id = sentence->GetPosId(a); - if (p < a) { - // Right attachment. - if (a - p > semantic_dictionary->GetMaximumRightDistance - (predicate_pos_id, argument_pos_id)) continue; - } else { - // Left attachment. 
- if (p - a > semantic_dictionary->GetMaximumLeftDistance - (predicate_pos_id, argument_pos_id)) continue; - } - } - } - - if (prune_labels_with_relation_paths) { - int relation_path_id = sentence->GetRelationPathId(p, a); - allowed_labels.clear(); - if (relation_path_id >= 0 && - relation_path_id < semantic_dictionary-> - GetRelationPathAlphabet().size()) { - allowed_labels = semantic_dictionary-> - GetExistingRolesWithRelationPath(relation_path_id); - //LOG(INFO) << "Path: " << relation_path_id << " Roles: " << allowed_labels.size(); - } - set label_set; - for (int m = 0; m < allowed_labels.size(); ++m) { - if (!prune_labels_with_senses || - (*predicates)[s]->HasRole(allowed_labels[m])) { - label_set.insert(allowed_labels[m]); - } - } - allowed_labels.clear(); - for (set::iterator it = label_set.begin(); - it != label_set.end(); ++it) { - allowed_labels.push_back(*it); - } - if (!add_labeled_parts && allowed_labels.empty()) { - continue; - } - } else if (prune_labels) { - // TODO: allow both kinds of label pruning simultaneously? - int predicate_pos_id = sentence->GetPosId(p); - int argument_pos_id = sentence->GetPosId(a); - allowed_labels.clear(); - allowed_labels = semantic_dictionary-> - GetExistingRoles(predicate_pos_id, argument_pos_id); - set label_set; - for (int m = 0; m < allowed_labels.size(); ++m) { - if (!prune_labels_with_senses || - (*predicates)[s]->HasRole(allowed_labels[m])) { - label_set.insert(allowed_labels[m]); - } - } - allowed_labels.clear(); - for (set::iterator it = label_set.begin(); - it != label_set.end(); ++it) { - allowed_labels.push_back(*it); - } - if (!add_labeled_parts && allowed_labels.empty()) { - continue; - } - } - - // Add parts for labeled/unlabeled arcs. - if (add_labeled_parts) { - // If there is no allowed label for this arc, but the unlabeled arc was added, - // then it was forced to be present for some reason (e.g. to maintain connectivity of the - // graph). 
In that case (which should be pretty rare) consider all the - // possible labels. - if (allowed_labels.empty()) { - allowed_labels.resize(semantic_dictionary->GetRoleAlphabet().size()); - for (int role = 0; role < allowed_labels.size(); ++role) { - allowed_labels[role] = role; - } - } - - for (int m = 0; m < allowed_labels.size(); ++m) { - int role = allowed_labels[m]; - if (prune_labels && prune_labels_with_senses) { - CHECK((*predicates)[s]->HasRole(role)); - } - Part *part = semantic_parts->CreatePartLabeledArc(p, a, s, role); - CHECK_GE(arc_index, 0); - semantic_parts->AddLabeledPart(part, arc_index); - if (make_gold) { - int k = sentence->FindPredicate(p); - int l = sentence->FindArc(p, a); - bool is_gold = false; - - if (k >= 0 && l >= 0) { - int predicate_id = sentence->GetPredicateId(k); - int argument_id = sentence->GetArgumentRoleId(k, l); - if (!use_predicate_senses) { - CHECK_EQ((*predicates)[s]->id(), PREDICATE_UNKNOWN); - } - //if (use_predicate_senses) CHECK_LT(predicate_id, 0); - if ((predicate_id < 0 || - (*predicates)[s]->id() == predicate_id) && - role == argument_id) { - is_gold = true; - } - } - if (is_gold) { - gold_outputs->push_back(1.0); - } else { - gold_outputs->push_back(0.0); - } - } - } - } else { - Part *part = semantic_parts->CreatePartArc(p, a, s); - semantic_parts->AddPart(part); - if (make_gold) { - int k = sentence->FindPredicate(p); - int l = sentence->FindArc(p, a); - bool is_gold = false; - if (k >= 0 && l >= 0) { - int predicate_id = sentence->GetPredicateId(k); - if (!use_predicate_senses) { - CHECK_EQ((*predicates)[s]->id(), PREDICATE_UNKNOWN); - } - if (predicate_id < 0 || (*predicates)[s]->id() == predicate_id) { - is_gold = true; - } - } - if (is_gold) { - gold_outputs->push_back(1.0); - } else { - gold_outputs->push_back(0.0); - } - } - } - } - } - } - - // Compute offsets for labeled/unlabeled arcs. 
- if (!add_labeled_parts) { - semantic_parts->SetOffsetArc(num_parts_initial, - semantic_parts->size() - num_parts_initial); - } else { - semantic_parts->SetOffsetLabeledArc(num_parts_initial, - semantic_parts->size() - num_parts_initial); - } -} - -void SemanticPipe::MakePartsArbitrarySiblings(Instance *instance, - Parts *parts, - vector *gold_outputs) { - SemanticInstanceNumeric *sentence = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); - SemanticOptions *semantic_options = GetSemanticOptions(); - //bool allow_self_loops = semantic_options->allow_self_loops(); - bool allow_root_predicate = semantic_options->allow_root_predicate(); - bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); - bool use_predicate_senses = semantic_options->use_predicate_senses(); - - // Siblings: (p,s,a1) and (p,s,a2). - for (int p = 0; p < sentence_length; ++p) { - if (p == 0 && !allow_root_predicate) continue; - int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - for (int s = 0; s < predicates->size(); ++s) { - for (int a1 = 1; a1 < sentence_length; ++a1) { - int r1 = semantic_parts->FindArc(p, a1, s); - if (r1 < 0) continue; - for (int a2 = a1 + 1; a2 < sentence_length; ++a2) { - int r2 = semantic_parts->FindArc(p, a2, s); - if (r2 < 0) continue; - Part *part = semantic_parts->CreatePartSibling(p, s, a1, a2); - semantic_parts->AddPart(part); - if (make_gold) { - // Logical AND of the two individual arcs. 
- gold_outputs->push_back((*gold_outputs)[r1] * (*gold_outputs)[r2]); - } - } - } - } - } -} - -void SemanticPipe::MakePartsLabeledArbitrarySiblings(Instance *instance, - Parts *parts, - vector *gold_outputs) { - SemanticInstanceNumeric *sentence = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); - SemanticOptions *semantic_options = GetSemanticOptions(); - - int offset, size; - semantic_parts->GetOffsetSibling(&offset, &size); - for (int r = offset; r < offset + size; ++r) { - SemanticPartSibling *sibling = - static_cast((*semantic_parts)[r]); - int p = sibling->predicate(); - int s = sibling->sense(); - int a1 = sibling->first_argument(); - int a2 = sibling->second_argument(); - const vector &labeled_first_arc_indices = - semantic_parts->FindLabeledArcs(p, a1, s); - const vector &labeled_second_arc_indices = - semantic_parts->FindLabeledArcs(p, a2, s); - for (int k = 0; k < labeled_first_arc_indices.size(); ++k) { - int r1 = labeled_first_arc_indices[k]; - SemanticPartLabeledArc *first_labeled_arc = - static_cast((*semantic_parts)[r1]); - int first_role = first_labeled_arc->role(); - for (int l = 0; l < labeled_second_arc_indices.size(); ++l) { - int r2 = labeled_second_arc_indices[l]; - SemanticPartLabeledArc *second_labeled_arc = - static_cast((*semantic_parts)[r2]); - int second_role = second_labeled_arc->role(); - // To keep the number of parts manageable, only create parts for: - // - same role (a1 == a2); - // - frequent role pairs. - if (first_role != second_role && - !semantic_dictionary->IsFrequentRolePair(first_role, second_role)) { - continue; - } - Part *part = semantic_parts->CreatePartLabeledSibling(p, s, a1, a2, - first_role, - second_role); - semantic_parts->AddLabeledPart(part, r); - if (make_gold) { - // Logical AND of the two individual labeled arcs. 
- gold_outputs->push_back((*gold_outputs)[r1] * (*gold_outputs)[r2]); - } - } - } - } -} - -void SemanticPipe::MakePartsConsecutiveSiblings(Instance *instance, - Parts *parts, - vector *gold_outputs) { - SemanticInstanceNumeric *sentence = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); - SemanticOptions *semantic_options = GetSemanticOptions(); - //bool allow_self_loops = semantic_options->allow_self_loops(); - bool allow_root_predicate = semantic_options->allow_root_predicate(); - bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); - bool use_predicate_senses = semantic_options->use_predicate_senses(); - - // Find the predicate parts (necessary to identify the gold predicate senses). - // TODO: Replace this by semantic_parts->GetPredicateSenses(p) or something? - int offset_predicate_parts, num_predicate_parts; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - vector > predicate_part_indices(sentence_length); - for (int r = 0; r < num_predicate_parts; ++r) { - SemanticPartPredicate* predicate_part = - static_cast((*parts)[offset_predicate_parts + r]); - predicate_part_indices[predicate_part->predicate()]. - push_back(offset_predicate_parts + r); - } - - // Consecutive siblings: (p,s,a1) and (p,s,a2). 
- for (int p = 0; p < sentence_length; ++p) { - if (p == 0 && !allow_root_predicate) continue; - int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - //const vector &senses = semantic_parts->GetSenses(p); - //CHECK_EQ(senses.size(), predicates->size()); - for (int s = 0; s < predicates->size(); ++s) { - bool sense_active; - bool first_arc_active; - bool second_arc_active = false; - bool arc_between; - - // Check if this is the correct sense. - if (make_gold) { - //int r = senses[s]; - int r = -1; - for (int k = 0; k < predicate_part_indices[p].size(); ++k) { - r = predicate_part_indices[p][k]; - SemanticPartPredicate* predicate_part = - static_cast((*parts)[r]); - if (predicate_part->sense() == s) break; - } - CHECK_GE(r, 0); - if (r >= 0 && NEARLY_EQ_TOL((*gold_outputs)[r], 1.0, 1e-9)) { - sense_active = true; - } else { - sense_active = false; - } - } - - // Right side. - // Allow self loops (a1 = p). We use a1 = p-1 to denote the special case - // in which a2 is the first argument. - for (int a1 = p - 1; a1 < sentence_length; ++a1) { - int r1 = -1; - if (a1 >= p) { - r1 = semantic_parts->FindArc(p, a1, s); - if (r1 < 0) continue; - } - - if (make_gold) { - // Check if the first arc is active. - if (a1 < p || NEARLY_EQ_TOL((*gold_outputs)[r1], 1.0, 1e-9)) { - first_arc_active = true; - } else { - first_arc_active = false; - } - arc_between = false; - } - - for (int a2 = a1 + 1; a2 <= sentence_length; ++a2) { - int r2 = -1; - if (a2 < sentence_length) { - r2 = semantic_parts->FindArc(p, a2, s); - if (r2 < 0) continue; - } - if (make_gold) { - // Check if the second arc is active. 
- if (a2 == sentence_length || - NEARLY_EQ_TOL((*gold_outputs)[r2], 1.0, 1e-9)) { - second_arc_active = true; - } else { - second_arc_active = false; - } - } - - Part *part = (a1 >= p) ? - semantic_parts->CreatePartConsecutiveSibling(p, s, a1, a2) : - semantic_parts->CreatePartConsecutiveSibling(p, s, -1, a2); - semantic_parts->AddPart(part); - - if (make_gold) { - double value = 0.0; - if (sense_active && first_arc_active && second_arc_active && - !arc_between) { - value = 1.0; - arc_between = true; - } - gold_outputs->push_back(value); - } - } - } - - // Left side. - // NOTE: Self loops (a1 = p) are disabled on the left side, to prevent - // having repeated parts. We use a1 = p+1 to denote the special case - // in which a2 is the first argument. - for (int a1 = p + 1; a1 >= 0; --a1) { - int r1 = -1; - if (a1 <= p) { - r1 = semantic_parts->FindArc(p, a1, s); - if (r1 < 0) continue; - } - if (a1 == p) continue; // See NOTE above. - - if (make_gold) { - // Check if the first arc is active. - if (a1 > p || NEARLY_EQ_TOL((*gold_outputs)[r1], 1.0, 1e-9)) { - first_arc_active = true; - } else { - first_arc_active = false; - } - arc_between = false; - } - - for (int a2 = a1 - 1; a2 >= -1; --a2) { - int r2 = -1; - if (a2 > -1) { - r2 = semantic_parts->FindArc(p, a2, s); - if (r2 < 0) continue; - } - if (a2 == p) continue; // See NOTE above. - - if (make_gold) { - // Check if the second arc is active. - if (a2 == -1 || - NEARLY_EQ_TOL((*gold_outputs)[r2], 1.0, 1e-9)) { - second_arc_active = true; - } else { - second_arc_active = false; - } - } - - Part *part = (a1 <= p) ? 
- semantic_parts->CreatePartConsecutiveSibling(p, s, a1, a2) : - semantic_parts->CreatePartConsecutiveSibling(p, s, -1, a2); - semantic_parts->AddPart(part); - - if (make_gold) { - double value = 0.0; - if (sense_active && first_arc_active && second_arc_active && - !arc_between) { - value = 1.0; - arc_between = true; - } - gold_outputs->push_back(value); - } - } - } - } - } -} - -void SemanticPipe::MakePartsGrandparents(Instance *instance, - Parts *parts, - vector *gold_outputs) { - SemanticInstanceNumeric *sentence = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); - SemanticOptions *semantic_options = GetSemanticOptions(); - //bool allow_self_loops = semantic_options->allow_self_loops(); - bool allow_root_predicate = semantic_options->allow_root_predicate(); - bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); - bool use_predicate_senses = semantic_options->use_predicate_senses(); - - // Grandparents: (g,t,p) and (p,s,a). 
- for (int g = 0; g < sentence_length; ++g) { - if (g == 0 && !allow_root_predicate) continue; - int lemma_id_g = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id_g = sentence->GetLemmaId(g); - CHECK_GE(lemma_id_g, 0); - } - const vector *predicates_g = - &semantic_dictionary->GetLemmaPredicates(lemma_id_g); - if (predicates_g->size() == 0 && allow_unseen_predicates) { - predicates_g = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - for (int t = 0; t < predicates_g->size(); ++t) { - for (int p = 1; p < sentence_length; ++p) { - int r1 = semantic_parts->FindArc(g, p, t); - if (r1 < 0) continue; - int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - for (int s = 0; s < predicates->size(); ++s) { - for (int a = 1; a < sentence_length; ++a) { - int r2 = semantic_parts->FindArc(p, a, s); - if (r2 < 0) continue; - Part *part = semantic_parts->CreatePartGrandparent(g, t, p, s, a); - semantic_parts->AddPart(part); - if (make_gold) { - // Logical AND of the two individual arcs. 
- gold_outputs->push_back((*gold_outputs)[r1] * (*gold_outputs)[r2]); - } - } - } - } - } - } -} - -void SemanticPipe::MakePartsCoparents(Instance *instance, - Parts *parts, - vector *gold_outputs) { - SemanticInstanceNumeric *sentence = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); - SemanticOptions *semantic_options = GetSemanticOptions(); - //bool allow_self_loops = semantic_options->allow_self_loops(); - bool allow_root_predicate = semantic_options->allow_root_predicate(); - bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); - bool use_predicate_senses = semantic_options->use_predicate_senses(); - - // Co-parents: (p1,s1,a) and (p2,s2,a). - // First predicate. - for (int p1 = 0; p1 < sentence_length; ++p1) { - if (p1 == 0 && !allow_root_predicate) continue; - int lemma_id_p1 = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id_p1 = sentence->GetLemmaId(p1); - CHECK_GE(lemma_id_p1, 0); - } - const vector *predicates_p1 = - &semantic_dictionary->GetLemmaPredicates(lemma_id_p1); - if (predicates_p1->size() == 0 && allow_unseen_predicates) { - predicates_p1 = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - for (int s1 = 0; s1 < predicates_p1->size(); ++s1) { - // Second predicate. - for (int p2 = p1 + 1; p2 < sentence_length; ++p2) { - int lemma_id_p2 = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id_p2 = sentence->GetLemmaId(p2); - CHECK_GE(lemma_id_p2, 0); - } - const vector *predicates_p2 = - &semantic_dictionary->GetLemmaPredicates(lemma_id_p2); - if (predicates_p2->size() == 0 && allow_unseen_predicates) { - predicates_p2 = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - for (int s2 = 0; s2 < predicates_p2->size(); ++s2) { - // Common argument. 
- for (int a = 1; a < sentence_length; ++a) { - int r1 = semantic_parts->FindArc(p1, a, s1); - if (r1 < 0) continue; - int r2 = semantic_parts->FindArc(p2, a, s2); - if (r2 < 0) continue; - Part *part = semantic_parts->CreatePartCoparent(p1, s1, p2, s2, a); - semantic_parts->AddPart(part); - if (make_gold) { - // Logical AND of the two individual arcs. - gold_outputs->push_back((*gold_outputs)[r1] * (*gold_outputs)[r2]); - } - } - } - } - } - } -} - -void SemanticPipe::MakePartsConsecutiveCoparents(Instance *instance, - Parts *parts, - vector *gold_outputs) { - SemanticInstanceNumeric *sentence = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - int sentence_length = sentence->size(); - bool make_gold = (gold_outputs != NULL); - SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); - SemanticOptions *semantic_options = GetSemanticOptions(); - //bool allow_self_loops = semantic_options->allow_self_loops(); - bool allow_root_predicate = semantic_options->allow_root_predicate(); - bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); - bool use_predicate_senses = semantic_options->use_predicate_senses(); - - // Consecutive co-parents: (p1,s1,a) and (p2,s2,a). - for (int a = 1; a < sentence_length; ++a) { - bool first_arc_active; - bool second_arc_active = false; - bool arc_between; - - // Right side. - // Allow self loops (p1 = a). We use p1 = a-1 to denote the special case - // in which p2 is the first predicate. - for (int p1 = a - 1; p1 < sentence_length; ++p1) { - int num_senses1; - if (p1 < a) { - // If p1 = a-1, pretend there is a single sense (s1=0). - num_senses1 = 1; - } else { - //const vector &senses = semantic_parts->GetSenses(p); - //CHECK_EQ(senses.size(), predicates->size()); - if (p1 == 0 && !allow_root_predicate) continue; // Never happens. 
- int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p1); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - num_senses1 = predicates->size(); - } - - for (int s1 = 0; s1 < num_senses1; ++s1) { - int r1 = -1; - if (p1 >= a) { - r1 = semantic_parts->FindArc(p1, a, s1); - if (r1 < 0) continue; - } - - if (make_gold) { - // Check if the first arc is active. - if (p1 < a || NEARLY_EQ_TOL((*gold_outputs)[r1], 1.0, 1e-9)) { - first_arc_active = true; - } else { - first_arc_active = false; - } - arc_between = false; - } - - for (int p2 = p1 + 1; p2 <= sentence_length; ++p2) { - int num_senses2; - if (p2 == sentence_length) { - // If p2 = sentence_length, pretend there is a single sense (s2=0). - num_senses2 = 1; - } else { - //const vector &senses = semantic_parts->GetSenses(p); - //CHECK_EQ(senses.size(), predicates->size()); - if (p2 == 0 && !allow_root_predicate) continue; // Never happens. - int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p2); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - num_senses2 = predicates->size(); - } - - for (int s2 = 0; s2 < num_senses2; ++s2) { - int r2 = -1; - if (p2 < sentence_length) { - r2 = semantic_parts->FindArc(p2, a, s2); - if (r2 < 0) continue; - } - if (make_gold) { - // Check if the second arc is active. - if (p2 == sentence_length || - NEARLY_EQ_TOL((*gold_outputs)[r2], 1.0, 1e-9)) { - second_arc_active = true; - } else { - second_arc_active = false; - } - } - - Part *part = (p1 >= a) ? 
- semantic_parts->CreatePartConsecutiveCoparent(p1, s1, p2, s2, a) : - semantic_parts->CreatePartConsecutiveCoparent(-1, 0, p2, s2, a); - semantic_parts->AddPart(part); - - if (make_gold) { - double value = 0.0; - if (first_arc_active && second_arc_active && !arc_between) { - value = 1.0; - arc_between = true; - } - gold_outputs->push_back(value); - } - } - } - } - } - - // Left side. - // NOTE: Self loops (p1 = a) are disabled on the left side, to prevent - // having repeated parts. We use p1 = a+1 to denote the special case - // in which p2 is the first predicate. - for (int p1 = a + 1; p1 >= 0; --p1) { - int num_senses1; - if (p1 > a) { - // If p1 = a+1, pretend there is a single sense (s1=0). - num_senses1 = 1; - } else if (p1 == a) { // See NOTE above. - continue; - } else { - //const vector &senses = semantic_parts->GetSenses(p); - //CHECK_EQ(senses.size(), predicates->size()); - if (p1 == 0 && !allow_root_predicate) continue; - int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p1); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - num_senses1 = predicates->size(); - } - - for (int s1 = 0; s1 < num_senses1; ++s1) { - int r1 = -1; - if (p1 <= a) { - r1 = semantic_parts->FindArc(p1, a, s1); - if (r1 < 0) continue; - } - if (p1 == a) continue; // See NOTE above. - - if (make_gold) { - // Check if the first arc is active. - if (p1 > a || NEARLY_EQ_TOL((*gold_outputs)[r1], 1.0, 1e-9)) { - first_arc_active = true; - } else { - first_arc_active = false; - } - arc_between = false; - } - - for (int p2 = p1 - 1; p2 >= -1; --p2) { - int num_senses2; - if (p2 == -1) { - // If p2 = -1, pretend there is a single sense (s2=0). - num_senses2 = 1; - } else if (p2 == a) { // See NOTE above. 
- continue; - } else { - //const vector &senses = semantic_parts->GetSenses(p); - //CHECK_EQ(senses.size(), predicates->size()); - if (p2 == 0 && !allow_root_predicate) continue; - int lemma_id = TOKEN_UNKNOWN; - if (use_predicate_senses) { - lemma_id = sentence->GetLemmaId(p2); - CHECK_GE(lemma_id, 0); - } - const vector *predicates = - &semantic_dictionary->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && allow_unseen_predicates) { - predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); - } - num_senses2 = predicates->size(); - } - - for (int s2 = 0; s2 < num_senses2; ++s2) { - int r2 = -1; - if (p2 > -1) { - r2 = semantic_parts->FindArc(p2, a, s2); - if (r2 < 0) continue; - } - if (p2 == a) continue; // See NOTE above. - - if (make_gold) { - // Check if the second arc is active. - if (p2 == -1 || - NEARLY_EQ_TOL((*gold_outputs)[r2], 1.0, 1e-9)) { - second_arc_active = true; - } else { - second_arc_active = false; - } - } - - Part *part = (p1 <= a) ? - semantic_parts->CreatePartConsecutiveCoparent(p1, s1, p2, s2, a) : - semantic_parts->CreatePartConsecutiveCoparent(-1, 0, p2, s2, a); - semantic_parts->AddPart(part); - - if (make_gold) { - double value = 0.0; - if (first_arc_active && second_arc_active && !arc_between) { - value = 1.0; - arc_between = true; - } - gold_outputs->push_back(value); - } - } - } - } - } - } -} - -void SemanticPipe::MakePartsGlobal(Instance *instance, - Parts *parts, - vector *gold_outputs) { - SemanticOptions *semantic_options = GetSemanticOptions(); - SemanticParts *semantic_parts = static_cast(parts); - - int num_parts_initial = semantic_parts->size(); - if (semantic_options->use_arbitrary_siblings()) { - MakePartsArbitrarySiblings(instance, parts, gold_outputs); - } - semantic_parts->SetOffsetSibling(num_parts_initial, - semantic_parts->size() - num_parts_initial); - //LOG(INFO) << "Num siblings: " << semantic_parts->size() - num_parts_initial; - - num_parts_initial = semantic_parts->size(); - if 
(semantic_options->use_arbitrary_siblings() && - FLAGS_use_labeled_sibling_features) { - MakePartsLabeledArbitrarySiblings(instance, parts, gold_outputs); - } - semantic_parts->SetOffsetLabeledSibling( - num_parts_initial, semantic_parts->size() - num_parts_initial); - //LOG(INFO) << "Num labeled siblings: " << semantic_parts->size() - num_parts_initial; - - num_parts_initial = semantic_parts->size(); - if (semantic_options->use_consecutive_siblings()) { - MakePartsConsecutiveSiblings(instance, parts, gold_outputs); - } - semantic_parts->SetOffsetConsecutiveSibling(num_parts_initial, - semantic_parts->size() - num_parts_initial); - - num_parts_initial = semantic_parts->size(); - if (semantic_options->use_grandparents()) { - MakePartsGrandparents(instance, parts, gold_outputs); - } - semantic_parts->SetOffsetGrandparent(num_parts_initial, - semantic_parts->size() - num_parts_initial); - - num_parts_initial = semantic_parts->size(); - if (semantic_options->use_coparents()) { - MakePartsCoparents(instance, parts, gold_outputs); - } - semantic_parts->SetOffsetCoparent(num_parts_initial, - semantic_parts->size() - num_parts_initial); - - num_parts_initial = semantic_parts->size(); - if (semantic_options->use_consecutive_coparents()) { - MakePartsConsecutiveCoparents(instance, parts, gold_outputs); - } - semantic_parts->SetOffsetConsecutiveCoparent(num_parts_initial, - semantic_parts->size() - num_parts_initial); - -#if 0 - num_parts_initial = semantic_parts->size(); - if (semantic_options->use_grandsiblings()) { - MakePartsGrandSiblings(instance, parts, gold_outputs); - } - semantic_parts->SetOffsetGrandSiblings(num_parts_initial, - semantic_parts->size() - num_parts_initial); - - num_parts_initial = semantic_parts->size(); - if (semantic_options->use_trisiblings()) { - MakePartsTriSiblings(instance, parts, gold_outputs); - } - semantic_parts->SetOffsetTriSiblings(num_parts_initial, - semantic_parts->size() - num_parts_initial); -#endif -} - -void 
SemanticPipe::MakeSelectedFeatures(Instance *instance, - Parts *parts, - bool pruner, - const vector& selected_parts, - Features *features) { - SemanticInstanceNumeric *sentence = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - SemanticFeatures *semantic_features = - static_cast(features); - int sentence_length = sentence->size(); - - semantic_features->Initialize(instance, parts); - - // Build features for predicates. - int offset, size; - semantic_parts->GetOffsetPredicate(&offset, &size); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartPredicate *predicate_part = - static_cast((*semantic_parts)[r]); - // Get the predicate id for this part. - // TODO(atm): store this somewhere, so that we don't need to recompute this - // all the time. - int lemma_id = TOKEN_UNKNOWN; - if (GetSemanticOptions()->use_predicate_senses()) { - lemma_id = sentence->GetLemmaId(predicate_part->predicate()); - } - const vector *predicates = - &GetSemanticDictionary()->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && - GetSemanticOptions()->allow_unseen_predicates()) { - predicates = &GetSemanticDictionary()->GetLemmaPredicates(TOKEN_UNKNOWN); - } - int predicate_id = (*predicates)[predicate_part->sense()]->id(); - // Add the predicate features. - semantic_features->AddPredicateFeatures(sentence, r, - predicate_part->predicate(), - predicate_id); - } - - // Even in the case of labeled parsing, build features for unlabeled arcs - // only. They will later be conjoined with the labels. - semantic_parts->GetOffsetArc(&offset, &size); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartArc *arc = - static_cast((*semantic_parts)[r]); - // Get the predicate id for this part. - // TODO(atm): store this somewhere, so that we don't need to recompute this - // all the time. Maybe store this directly in arc->sense()? 
- int lemma_id = TOKEN_UNKNOWN; - if (GetSemanticOptions()->use_predicate_senses()) { - lemma_id = sentence->GetLemmaId(arc->predicate()); - } - const vector *predicates = - &GetSemanticDictionary()->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && - GetSemanticOptions()->allow_unseen_predicates()) { - predicates = &GetSemanticDictionary()->GetLemmaPredicates(TOKEN_UNKNOWN); - } - int predicate_id = (*predicates)[arc->sense()]->id(); - if (!pruner && GetSemanticOptions()->labeled()) { - semantic_features->AddLabeledArcFeatures(sentence, r, arc->predicate(), - arc->argument(), predicate_id); - if (!FLAGS_use_only_labeled_arc_features) { - semantic_features->AddArcFeatures(sentence, r, arc->predicate(), - arc->argument(), predicate_id); - } - } else { - semantic_features->AddArcFeatures(sentence, r, arc->predicate(), - arc->argument(), predicate_id); - } - } - - // Build features for arbitrary siblings. - semantic_parts->GetOffsetSibling(&offset, &size); - if (pruner) CHECK_EQ(size, 0); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartSibling *part = - static_cast((*semantic_parts)[r]); - CHECK_EQ(part->type(), SEMANTICPART_SIBLING); - if (FLAGS_use_labeled_sibling_features) { - semantic_features-> - AddArbitraryLabeledSiblingFeatures(sentence, r, - part->predicate(), - part->sense(), - part->first_argument(), - part->second_argument()); - if (!FLAGS_use_only_labeled_sibling_features) { - semantic_features->AddArbitrarySiblingFeatures(sentence, r, - part->predicate(), - part->sense(), - part->first_argument(), - part->second_argument()); - } - } else { - semantic_features->AddArbitrarySiblingFeatures(sentence, r, - part->predicate(), - part->sense(), - part->first_argument(), - part->second_argument()); - } - } - - // Build features for consecutive siblings. 
- semantic_parts->GetOffsetConsecutiveSibling(&offset, &size); - if (pruner) CHECK_EQ(size, 0); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartConsecutiveSibling *part = - static_cast((*semantic_parts)[r]); - CHECK_EQ(part->type(), SEMANTICPART_CONSECUTIVESIBLING); - semantic_features->AddConsecutiveSiblingFeatures( - sentence, r, - part->predicate(), - part->sense(), - part->first_argument(), - part->second_argument()); - } - - // Build features for grandparents. - semantic_parts->GetOffsetGrandparent(&offset, &size); - if (pruner) CHECK_EQ(size, 0); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartGrandparent *part = - static_cast((*semantic_parts)[r]); - CHECK_EQ(part->type(), SEMANTICPART_GRANDPARENT); - semantic_features->AddGrandparentFeatures(sentence, r, - part->grandparent_predicate(), - part->grandparent_sense(), - part->predicate(), - part->sense(), - part->argument()); - } - - // Build features for co-parents. - semantic_parts->GetOffsetCoparent(&offset, &size); - if (pruner) CHECK_EQ(size, 0); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartCoparent *part = - static_cast((*semantic_parts)[r]); - CHECK_EQ(part->type(), SEMANTICPART_COPARENT); - semantic_features->AddCoparentFeatures(sentence, r, - part->first_predicate(), - part->first_sense(), - part->second_predicate(), - part->second_sense(), - part->argument()); - } - - // Build features for consecutive co-parents. 
- semantic_parts->GetOffsetConsecutiveCoparent(&offset, &size); - if (pruner) CHECK_EQ(size, 0); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartConsecutiveCoparent *part = - static_cast((*semantic_parts)[r]); - CHECK_EQ(part->type(), SEMANTICPART_CONSECUTIVECOPARENT); - semantic_features->AddConsecutiveCoparentFeatures( - sentence, r, - part->first_predicate(), - part->first_sense(), - part->second_predicate(), - part->second_sense(), - part->argument()); - } - -#if 0 - // Build features for grand-siblings. - dependency_parts->GetOffsetGrandSibl(&offset, &size); - if (pruner) CHECK_EQ(size, 0); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartGrandSibl *part = - static_cast((*dependency_parts)[r]); - CHECK_EQ(part->type(), DEPENDENCYPART_GRANDSIBL); - CHECK_LE(part->modifier(), sentence_length); - CHECK_LE(part->sibling(), sentence_length); - dependency_features->AddGrandSiblingFeatures(sentence, r, - part->grandparent(), - part->head(), - part->modifier(), - part->sibling()); - } - - // Build features for tri-siblings. - dependency_parts->GetOffsetTriSibl(&offset, &size); - if (pruner) CHECK_EQ(size, 0); - for (int r = offset; r < offset + size; ++r) { - if (!selected_parts[r]) continue; - SemanticPartTriSibl *part = - static_cast((*dependency_parts)[r]); - CHECK_EQ(part->type(), DEPENDENCYPART_TRISIBL); - dependency_features->AddTriSiblingFeatures(sentence, r, - part->head(), - part->modifier(), - part->sibling(), - part->other_sibling()); - } - -#endif -} - -// Prune basic parts (arcs and labeled arcs) using a first-order model. -// The vectors of basic parts is given as input, and those elements that are -// to be pruned are deleted from the vector. -// If gold_outputs is not NULL that vector will also be pruned. 
-void SemanticPipe::Prune(Instance *instance, Parts *parts, - vector *gold_outputs, - bool preserve_gold) { - SemanticParts *semantic_parts = static_cast(parts); - Features *features = CreateFeatures(); - vector scores; - vector predicted_outputs; - - // Make sure gold parts are only preserved at training time. - CHECK(!preserve_gold || options_->train()); - - MakeFeatures(instance, parts, true, features); - ComputeScores(instance, parts, features, true, &scores); - GetSemanticDecoder()->DecodePruner(instance, parts, scores, - &predicted_outputs); - - int offset_predicate_parts, num_predicate_parts; - int offset_arcs, num_arcs; - semantic_parts->GetOffsetPredicate(&offset_predicate_parts, - &num_predicate_parts); - semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); - - double threshold = 0.5; - int r0 = offset_arcs; // Preserve all the predicate parts. - semantic_parts->ClearOffsets(); - semantic_parts->SetOffsetPredicate(offset_predicate_parts, - num_predicate_parts); - for (int r = 0; r < num_arcs; ++r) { - // Preserve gold parts (at training time). 
- if (predicted_outputs[offset_arcs + r] >= threshold || - (preserve_gold && (*gold_outputs)[offset_arcs + r] >= threshold)) { - (*parts)[r0] = (*parts)[offset_arcs + r]; - semantic_parts-> - SetLabeledParts(r0, semantic_parts->GetLabeledParts(offset_arcs + r)); - if (gold_outputs) { - (*gold_outputs)[r0] = (*gold_outputs)[offset_arcs + r]; - } - ++r0; - } else { - delete (*parts)[offset_arcs + r]; - } - } - - if (gold_outputs) gold_outputs->resize(r0); - semantic_parts->Resize(r0); - semantic_parts->DeleteIndices(); - semantic_parts->SetOffsetArc(offset_arcs, - parts->size() - offset_arcs); - - delete features; -} - -void SemanticPipe::LabelInstance(Parts *parts, - const vector &output, - Instance *instance) { - SemanticParts *semantic_parts = static_cast(parts); - SemanticInstance *semantic_instance = - static_cast(instance); - SemanticDictionary *semantic_dictionary = - static_cast(dictionary_); - //bool allow_root_predicate = GetSemanticOptions()->allow_root_predicate(); - int instance_length = semantic_instance->size(); - double threshold = 0.5; - semantic_instance->ClearPredicates(); - for (int p = 0; p < instance_length; ++p) { - //if (p == 0 && !allow_root_predicate) continue; - const vector &senses = semantic_parts->GetSenses(p); - vector argument_indices; - vector argument_roles; - int predicted_sense = -1; - for (int k = 0; k < senses.size(); k++) { - int s = senses[k]; - for (int a = 1; a < instance_length; ++a) { - if (GetSemanticOptions()->labeled()) { - int r = semantic_parts->FindArc(p, a, s); - if (r < 0) continue; - const vector &labeled_arcs = - semantic_parts->FindLabeledArcs(p, a, s); - for (int l = 0; l < labeled_arcs.size(); ++l) { - int r = labeled_arcs[l]; - if (output[r] > threshold) { - if (predicted_sense != s) { - CHECK_LT(predicted_sense, 0); - predicted_sense = s; - } - argument_indices.push_back(a); - SemanticPartLabeledArc *labeled_arc = - static_cast((*parts)[r]); - string role = - 
semantic_dictionary->GetRoleName(labeled_arc->role()); - argument_roles.push_back(role); - } - } - } else { - int r = semantic_parts->FindArc(p, a, s); - if (r < 0) continue; - if (output[r] > threshold) { - if (predicted_sense != s) { - CHECK_LT(predicted_sense, 0); - predicted_sense = s; - } - argument_indices.push_back(a); - argument_roles.push_back("ARG"); - } - } - } - } - - if (predicted_sense >= 0) { - int s = predicted_sense; - // Get the predicate id for this part. - // TODO(atm): store this somewhere, so that we don't need to recompute this - // all the time. Maybe store this directly in arc->sense()? - int lemma_id = TOKEN_UNKNOWN; - if (GetSemanticOptions()->use_predicate_senses()) { - lemma_id = semantic_dictionary->GetTokenDictionary()-> - GetLemmaId(semantic_instance->GetLemma(p)); - if (lemma_id < 0) lemma_id = TOKEN_UNKNOWN; - } - const vector *predicates = - &GetSemanticDictionary()->GetLemmaPredicates(lemma_id); - if (predicates->size() == 0 && - GetSemanticOptions()->allow_unseen_predicates()) { - predicates = &GetSemanticDictionary()->GetLemmaPredicates(TOKEN_UNKNOWN); - } - int predicate_id = (*predicates)[s]->id(); - string predicate_name = - semantic_dictionary->GetPredicateName(predicate_id); - semantic_instance->AddPredicate(predicate_name, p, argument_roles, - argument_indices); - } - } -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "SemanticPipe.h" +#include +#include +#include +#include +#ifndef _WIN32 +#include +#else +#include +#endif +using namespace std; + +// Define the current model version and the oldest back-compatible version. +// The format is AAAA.BBBB.CCCC, e.g., 2 0003 0000 means "2.3.0". +const uint64_t kSemanticParserModelVersion = 200030000; +const uint64_t kOldestCompatibleSemanticParserModelVersion = 200030000; +const uint64_t kSemanticParserModelCheck = 1234567890; + +DEFINE_bool(use_only_labeled_arc_features, true, + "True for not using unlabeled arc features in addition to labeled ones."); +DEFINE_bool(use_only_labeled_sibling_features, false, //true, + "True for not using unlabeled sibling features in addition to labeled ones."); +DEFINE_bool(use_labeled_sibling_features, false, //true, + "True for using labels in sibling features."); + +void SemanticPipe::SaveModel(FILE* fs) { + bool success; + success = WriteUINT64(fs, kSemanticParserModelCheck); + CHECK(success); + success = WriteUINT64(fs, kSemanticParserModelVersion); + CHECK(success); + token_dictionary_->Save(fs); + dependency_dictionary_->Save(fs); + Pipe::SaveModel(fs); + pruner_parameters_->Save(fs); +} + +void SemanticPipe::LoadModel(FILE* fs) { + bool success; + success = ReadUINT64(fs, &model_check_); + CHECK(success); + CHECK_EQ(model_check_, kSemanticParserModelCheck) + << "The model file is too old and not supported anymore."; + success = ReadUINT64(fs, &model_version_); + CHECK(success); + CHECK_GE(model_version_, kOldestCompatibleSemanticParserModelVersion) + << "The model file is too old and not supported anymore."; + delete token_dictionary_; + CreateTokenDictionary(); + static_cast(dictionary_)-> + SetTokenDictionary(token_dictionary_); + token_dictionary_->Load(fs); + CreateDependencyDictionary(); + dependency_dictionary_->SetTokenDictionary(token_dictionary_); 
+ static_cast(dictionary_)-> + SetDependencyDictionary(dependency_dictionary_); + dependency_dictionary_->Load(fs); + Pipe::LoadModel(fs); + pruner_parameters_->Load(fs); +} + +void SemanticPipe::LoadPrunerModel(FILE* fs) { + LOG(INFO) << "Loading pruner model..."; + // This will be ignored but must be passed to the pruner pipe constructor, + // so that when loading the pruner model the actual options are not + // overwritten. + SemanticOptions pruner_options; // = *options_; + SemanticPipe* pipe = new SemanticPipe(&pruner_options); + //SemanticPipe* pipe = new SemanticPipe(options_); + pipe->Initialize(); + pipe->LoadModel(fs); + delete pruner_parameters_; + pruner_parameters_ = pipe->parameters_; + pipe->parameters_ = NULL; + delete pipe; + LOG(INFO) << "Done."; +} + +void SemanticPipe::LoadPrunerModelByName(const string &model_name) { + FILE *fs = fopen(model_name.c_str(), "rb"); + CHECK(fs) << "Could not open pruner model file for reading: " << model_name; + LoadPrunerModel(fs); + fclose(fs); +} + +void SemanticPipe::PreprocessData() { + delete token_dictionary_; + CreateTokenDictionary(); + static_cast(dictionary_)->SetTokenDictionary(token_dictionary_); + static_cast(token_dictionary_)->Initialize(GetSemanticReader()); + delete dependency_dictionary_; + CreateDependencyDictionary(); + dependency_dictionary_->SetTokenDictionary(token_dictionary_); + static_cast(dictionary_)->SetDependencyDictionary(dependency_dictionary_); + dependency_dictionary_->CreateLabelDictionary(GetSemanticReader()); + static_cast(dictionary_)->CreatePredicateRoleDictionaries(GetSemanticReader()); +} + +void SemanticPipe::ComputeScores(Instance *instance, Parts *parts, + Features *features, + bool pruner, + vector *scores) { + Parameters *parameters; + SemanticDictionary *semantic_dictionary = + static_cast(dictionary_); + SemanticFeatures *semantic_features = + static_cast(features); + if (pruner) { + parameters = pruner_parameters_; + } else { + parameters = parameters_; + } + 
scores->resize(parts->size()); + SemanticParts *semantic_parts = static_cast(parts); + for (int r = 0; r < parts->size(); ++r) { + bool has_unlabeled_features = + (semantic_features->GetNumPartFeatures(r) > 0); + bool has_labeled_features = + (semantic_features->GetNumLabeledPartFeatures(r) > 0); + + if (pruner) CHECK((*parts)[r]->type() == SEMANTICPART_ARC || + (*parts)[r]->type() == SEMANTICPART_PREDICATE); + if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; + if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; + + // Compute scores for the unlabeled features. + if (has_unlabeled_features) { + const BinaryFeatures &part_features = + semantic_features->GetPartFeatures(r); + (*scores)[r] = parameters->ComputeScore(part_features); + } else { + (*scores)[r] = 0.0; + } + + // Compute scores for the labeled features. + if ((*parts)[r]->type() == SEMANTICPART_ARC && !pruner && + GetSemanticOptions()->labeled()) { + // Labeled arcs will be treated by looking at the unlabeled arcs and + // conjoining with the label. 
+ CHECK(has_labeled_features); + SemanticPartArc *arc = static_cast((*parts)[r]); + const vector &index_labeled_parts = + semantic_parts->FindLabeledArcs(arc->predicate(), + arc->argument(), + arc->sense()); + vector allowed_labels(index_labeled_parts.size()); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + CHECK_GE(index_labeled_parts[k], 0); + CHECK_LT(index_labeled_parts[k], parts->size()); + SemanticPartLabeledArc *labeled_arc = + static_cast( + (*parts)[index_labeled_parts[k]]); + CHECK(labeled_arc != NULL); + allowed_labels[k] = labeled_arc->role(); + } + vector label_scores; + const BinaryFeatures &part_features = + semantic_features->GetLabeledPartFeatures(r); + parameters->ComputeLabelScores(part_features, allowed_labels, + &label_scores); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + (*scores)[index_labeled_parts[k]] = label_scores[k]; + } + } else if ((*parts)[r]->type() == SEMANTICPART_SIBLING && + has_labeled_features) { + // Labeled siblings will be treated by looking at the unlabeled ones and + // conjoining with the label. 
+ CHECK(!pruner); + CHECK(GetSemanticOptions()->labeled()); + SemanticPartSibling *sibling = + static_cast((*parts)[r]); + const vector &index_labeled_parts = + semantic_parts->GetLabeledParts(r); + vector bigram_labels(index_labeled_parts.size()); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + CHECK_GE(index_labeled_parts[k], 0); + CHECK_LT(index_labeled_parts[k], parts->size()); + SemanticPartLabeledSibling *labeled_sibling = + static_cast( + (*parts)[index_labeled_parts[k]]); + CHECK(labeled_sibling != NULL); + bigram_labels[k] = semantic_dictionary->GetRoleBigramLabel( + labeled_sibling->first_role(), + labeled_sibling->second_role()); + } + vector label_scores; + const BinaryFeatures &part_features = + semantic_features->GetLabeledPartFeatures(r); + parameters->ComputeLabelScores(part_features, bigram_labels, + &label_scores); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + (*scores)[index_labeled_parts[k]] = label_scores[k]; + } + } + } +} + +void SemanticPipe::RemoveUnsupportedFeatures(Instance *instance, Parts *parts, + bool pruner, + const vector &selected_parts, + Features *features) { + Parameters *parameters; + SemanticFeatures *semantic_features = + static_cast(features); + if (pruner) { + parameters = pruner_parameters_; + } else { + parameters = parameters_; + } + + for (int r = 0; r < parts->size(); ++r) { + // TODO: Make sure we can do this continue for the labeled parts... + if (!selected_parts[r]) continue; + + bool has_unlabeled_features = + (semantic_features->GetNumPartFeatures(r) > 0); + bool has_labeled_features = + (semantic_features->GetNumLabeledPartFeatures(r) > 0); + + if (pruner) CHECK((*parts)[r]->type() == SEMANTICPART_ARC || + (*parts)[r]->type() == SEMANTICPART_PREDICATE); + + // TODO(atm): I think this is handling the case there can be labeled + // features, but was never tested. + CHECK(!has_labeled_features); + + // Skip labeled arcs, as they use the features from unlabeled arcs. 
+ if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; + if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; + + if (has_unlabeled_features) { + BinaryFeatures *part_features = + semantic_features->GetMutablePartFeatures(r); + int num_supported = 0; + for (int j = 0; j < part_features->size(); ++j) { + if (parameters->Exists((*part_features)[j])) { + (*part_features)[num_supported] = (*part_features)[j]; + ++num_supported; + } + } + part_features->resize(num_supported); + } + + if (has_labeled_features) { + BinaryFeatures *part_features = + semantic_features->GetMutableLabeledPartFeatures(r); + int num_supported = 0; + for (int j = 0; j < part_features->size(); ++j) { + if (parameters->ExistsLabeled((*part_features)[j])) { + (*part_features)[num_supported] = (*part_features)[j]; + ++num_supported; + } + } + part_features->resize(num_supported); + } + } +} + +void SemanticPipe::MakeGradientStep(Parts *parts, + Features *features, + double eta, + int iteration, + const vector &gold_output, + const vector &predicted_output) { + SemanticParts *semantic_parts = static_cast(parts); + SemanticDictionary *semantic_dictionary = + static_cast(dictionary_); + SemanticFeatures *semantic_features = + static_cast(features); + Parameters *parameters = GetTrainingParameters(); + + for (int r = 0; r < parts->size(); ++r) { + bool has_unlabeled_features = + (semantic_features->GetNumPartFeatures(r) > 0); + bool has_labeled_features = + (semantic_features->GetNumLabeledPartFeatures(r) > 0); + + if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; + if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; + + // Make updates for the unlabeled features. 
+ if (has_unlabeled_features) { + if (predicted_output[r] != gold_output[r]) { + const BinaryFeatures &part_features = + semantic_features->GetPartFeatures(r); + parameters->MakeGradientStep(part_features, eta, iteration, + predicted_output[r] - gold_output[r]); + } + } + + // Make updates for the labeled features. + if ((*parts)[r]->type() == SEMANTICPART_ARC && has_labeled_features) { + // Labeled arcs will be treated by looking at the unlabeled arcs and + // conjoining with the label. + CHECK(has_labeled_features); + const BinaryFeatures &part_features = + semantic_features->GetLabeledPartFeatures(r); + SemanticPartArc *arc = static_cast((*parts)[r]); + const vector &index_labeled_parts = + semantic_parts->FindLabeledArcs(arc->predicate(), + arc->argument(), + arc->sense()); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + int index_part = index_labeled_parts[k]; + CHECK_GE(index_part, 0); + CHECK_LT(index_part, parts->size()); + SemanticPartLabeledArc *labeled_arc = + static_cast((*parts)[index_part]); + CHECK(labeled_arc != NULL); + double value = predicted_output[index_part] - gold_output[index_part]; + if (value != 0.0) { + parameters->MakeLabelGradientStep(part_features, eta, iteration, + labeled_arc->role(), + value); + } + } + } else if ((*parts)[r]->type() == SEMANTICPART_SIBLING && + has_labeled_features) { + // Labeled siblings will be treated by looking at the unlabeled ones and + // conjoining with the label. 
+ CHECK(GetSemanticOptions()->labeled()); + const BinaryFeatures &part_features = + semantic_features->GetLabeledPartFeatures(r); + SemanticPartSibling *sibling = + static_cast((*parts)[r]); + const vector &index_labeled_parts = + semantic_parts->GetLabeledParts(r); + vector bigram_labels(index_labeled_parts.size()); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + int index_part = index_labeled_parts[k]; + CHECK_GE(index_part, 0); + CHECK_LT(index_part, parts->size()); + SemanticPartLabeledSibling *labeled_sibling = + static_cast( + (*parts)[index_part]); + CHECK(labeled_sibling != NULL); + int bigram_label = semantic_dictionary->GetRoleBigramLabel( + labeled_sibling->first_role(), + labeled_sibling->second_role()); + double value = predicted_output[index_part] - gold_output[index_part]; + if (value != 0.0) { + parameters->MakeLabelGradientStep(part_features, eta, iteration, + bigram_label, value); + } + } + } + } +} + +void SemanticPipe::TouchParameters(Parts *parts, Features *features, + const vector &selected_parts) { + SemanticParts *semantic_parts = static_cast(parts); + SemanticDictionary *semantic_dictionary = + static_cast(dictionary_); + SemanticFeatures *semantic_features = + static_cast(features); + Parameters *parameters = GetTrainingParameters(); + + for (int r = 0; r < parts->size(); ++r) { + // TODO: Make sure we can do this continue for the labeled parts... + if (!selected_parts[r]) continue; + + bool has_unlabeled_features = + (semantic_features->GetNumPartFeatures(r) > 0); + bool has_labeled_features = + (semantic_features->GetNumLabeledPartFeatures(r) > 0); + + if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; + if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; + + // Make updates for the unlabeled features. 
+ if (has_unlabeled_features) { + const BinaryFeatures &part_features = + semantic_features->GetPartFeatures(r); + parameters->MakeGradientStep(part_features, 0.0, 0, 0.0); + } + + // Make updates for the labeled features. + if ((*parts)[r]->type() == SEMANTICPART_ARC && has_labeled_features) { + // Labeled arcs will be treated by looking at the unlabeled arcs and + // conjoining with the label. + CHECK(has_labeled_features); + const BinaryFeatures &part_features = + semantic_features->GetLabeledPartFeatures(r); + SemanticPartArc *arc = static_cast((*parts)[r]); + const vector &index_labeled_parts = + semantic_parts->FindLabeledArcs(arc->predicate(), + arc->argument(), + arc->sense()); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + int index_part = index_labeled_parts[k]; + CHECK_GE(index_part, 0); + CHECK_LT(index_part, parts->size()); + SemanticPartLabeledArc *labeled_arc = + static_cast((*parts)[index_part]); + CHECK(labeled_arc != NULL); + parameters->MakeLabelGradientStep(part_features, 0.0, 0, + labeled_arc->role(), 0.0); + } + } else if ((*parts)[r]->type() == SEMANTICPART_SIBLING && + has_labeled_features) { + // Labeled siblings will be treated by looking at the unlabeled ones and + // conjoining with the label. 
+ CHECK(GetSemanticOptions()->labeled()); + const BinaryFeatures &part_features = + semantic_features->GetLabeledPartFeatures(r); + SemanticPartSibling *sibling = + static_cast((*parts)[r]); + const vector &index_labeled_parts = + semantic_parts->GetLabeledParts(r); + vector bigram_labels(index_labeled_parts.size()); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + int index_part = index_labeled_parts[k]; + CHECK_GE(index_part, 0); + CHECK_LT(index_part, parts->size()); + SemanticPartLabeledSibling *labeled_sibling = + static_cast( + (*parts)[index_part]); + CHECK(labeled_sibling != NULL); + int bigram_label = semantic_dictionary->GetRoleBigramLabel( + labeled_sibling->first_role(), + labeled_sibling->second_role()); + parameters->MakeLabelGradientStep(part_features, 0.0, 0, + bigram_label, 0.0); + } + } + } +} + +void SemanticPipe::MakeFeatureDifference(Parts *parts, + Features *features, + const vector &gold_output, + const vector &predicted_output, + FeatureVector *difference) { + SemanticParts *semantic_parts = static_cast(parts); + SemanticDictionary *semantic_dictionary = + static_cast(dictionary_); + SemanticFeatures *semantic_features = + static_cast(features); + + for (int r = 0; r < parts->size(); ++r) { + bool has_unlabeled_features = + (semantic_features->GetNumPartFeatures(r) > 0); + bool has_labeled_features = + (semantic_features->GetNumLabeledPartFeatures(r) > 0); + + if ((*parts)[r]->type() == SEMANTICPART_LABELEDARC) continue; + if ((*parts)[r]->type() == SEMANTICPART_LABELEDSIBLING) continue; + + // Compute feature difference for the unlabeled features. + if (has_unlabeled_features) { + if (predicted_output[r] != gold_output[r]) { + const BinaryFeatures &part_features = + semantic_features->GetPartFeatures(r); + for (int j = 0; j < part_features.size(); ++j) { + difference->mutable_weights()->Add(part_features[j], + predicted_output[r] - + gold_output[r]); + } + } + } + + // Make updates for the labeled features. 
+ if ((*parts)[r]->type() == SEMANTICPART_ARC && has_labeled_features) { + // Labeled arcs will be treated by looking at the unlabeled arcs and + // conjoining with the label. + CHECK(has_labeled_features); + const BinaryFeatures &part_features = + semantic_features->GetLabeledPartFeatures(r); + SemanticPartArc *arc = static_cast((*parts)[r]); + const vector &index_labeled_parts = + semantic_parts->FindLabeledArcs(arc->predicate(), + arc->argument(), + arc->sense()); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + int index_part = index_labeled_parts[k]; + CHECK_GE(index_part, 0); + CHECK_LT(index_part, parts->size()); + SemanticPartLabeledArc *labeled_arc = + static_cast((*parts)[index_part]); + CHECK(labeled_arc != NULL); + double value = predicted_output[index_part] - gold_output[index_part]; + if (value != 0.0) { + for (int j = 0; j < part_features.size(); ++j) { + difference->mutable_labeled_weights()->Add(part_features[j], + labeled_arc->role(), + value); + } + } + } + } else if ((*parts)[r]->type() == SEMANTICPART_SIBLING && + has_labeled_features) { + // Labeled siblings will be treated by looking at the unlabeled ones and + // conjoining with the label. 
+ CHECK(GetSemanticOptions()->labeled()); + const BinaryFeatures &part_features = + semantic_features->GetLabeledPartFeatures(r); + SemanticPartSibling *sibling = + static_cast((*parts)[r]); + const vector &index_labeled_parts = + semantic_parts->GetLabeledParts(r); + vector bigram_labels(index_labeled_parts.size()); + for (int k = 0; k < index_labeled_parts.size(); ++k) { + int index_part = index_labeled_parts[k]; + CHECK_GE(index_part, 0); + CHECK_LT(index_part, parts->size()); + SemanticPartLabeledSibling *labeled_sibling = + static_cast( + (*parts)[index_part]); + CHECK(labeled_sibling != NULL); + int bigram_label = semantic_dictionary->GetRoleBigramLabel( + labeled_sibling->first_role(), + labeled_sibling->second_role()); + double value = predicted_output[index_part] - gold_output[index_part]; + if (value != 0.0) { + for (int j = 0; j < part_features.size(); ++j) { + difference->mutable_labeled_weights()->Add(part_features[j], + bigram_label, + value); + } + } + } + } + } +} + +void SemanticPipe::MakeParts(Instance *instance, + Parts *parts, + vector *gold_outputs) { + int sentence_length = + static_cast(instance)->size(); + SemanticParts *semantic_parts = static_cast(parts); + semantic_parts->Initialize(); + bool make_gold = (gold_outputs != NULL); + if (make_gold) gold_outputs->clear(); + + if (train_pruner_) { + // For the pruner, make only unlabeled arc-factored and predicate parts and + // compute indices. + MakePartsBasic(instance, false, parts, gold_outputs); + semantic_parts->BuildOffsets(); + semantic_parts->BuildIndices(sentence_length, false); + } else { + // Make arc-factored and predicate parts and compute indices. + MakePartsBasic(instance, parts, gold_outputs); + semantic_parts->BuildOffsets(); + semantic_parts->BuildIndices(sentence_length, + GetSemanticOptions()->labeled()); + + // Make global parts. 
+ MakePartsGlobal(instance, parts, gold_outputs); + semantic_parts->BuildOffsets(); + } +} + +void SemanticPipe::MakePartsBasic(Instance *instance, + Parts *parts, + vector *gold_outputs) { + int sentence_length = + static_cast(instance)->size(); + SemanticParts *semantic_parts = static_cast(parts); + + MakePartsBasic(instance, false, parts, gold_outputs); + semantic_parts->BuildOffsets(); + semantic_parts->BuildIndices(sentence_length, false); + + // Prune using a basic first-order model. + if (GetSemanticOptions()->prune_basic()) { + if (options_->train()) { + Prune(instance, parts, gold_outputs, true); + } else { + Prune(instance, parts, gold_outputs, false); + } + semantic_parts->BuildOffsets(); + semantic_parts->BuildIndices(sentence_length, false); + } + + if (GetSemanticOptions()->labeled()) { + MakePartsBasic(instance, true, parts, gold_outputs); + } +} + +void SemanticPipe::MakePartsBasic(Instance *instance, + bool add_labeled_parts, + Parts *parts, + vector *gold_outputs) { + SemanticInstanceNumeric *sentence = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); + SemanticOptions *semantic_options = GetSemanticOptions(); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + bool prune_labels = semantic_options->prune_labels(); + bool prune_labels_with_relation_paths = + semantic_options->prune_labels_with_relation_paths(); + bool prune_labels_with_senses = semantic_options->prune_labels_with_senses(); + bool prune_distances = semantic_options->prune_distances(); + bool allow_self_loops = semantic_options->allow_self_loops(); + bool allow_root_predicate = semantic_options->allow_root_predicate(); + bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); + bool use_predicate_senses = semantic_options->use_predicate_senses(); + vector allowed_labels; + + if (add_labeled_parts && !prune_labels) { + 
allowed_labels.resize(semantic_dictionary->GetRoleAlphabet().size()); + for (int i = 0; i < allowed_labels.size(); ++i) { + allowed_labels[i] = i; + } + } + + // Add predicate parts. + int num_parts_initial = semantic_parts->size(); + if (!add_labeled_parts) { + for (int p = 0; p < sentence_length; ++p) { + if (p == 0 && !allow_root_predicate) continue; + int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + for (int s = 0; s < predicates->size(); ++s) { + Part *part = semantic_parts->CreatePartPredicate(p, s); + semantic_parts->AddPart(part); + if (make_gold) { + bool is_gold = false; + int k = sentence->FindPredicate(p); + if (k >= 0) { + int predicate_id = sentence->GetPredicateId(k); + if (!use_predicate_senses) { + CHECK_EQ((*predicates)[s]->id(), PREDICATE_UNKNOWN); + } + if (predicate_id < 0 || (*predicates)[s]->id() == predicate_id) { + is_gold = true; + } + } + if (is_gold) { + gold_outputs->push_back(1.0); + } else { + gold_outputs->push_back(0.0); + } + } + } + } + + // Compute offsets for predicate parts. + semantic_parts->SetOffsetPredicate(num_parts_initial, + semantic_parts->size() - num_parts_initial); + } + + // Add unlabeled/labeled arc parts. 
+ num_parts_initial = semantic_parts->size(); + for (int p = 0; p < sentence_length; ++p) { + if (p == 0 && !allow_root_predicate) continue; + int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + for (int a = 1; a < sentence_length; ++a) { + if (!allow_self_loops && p == a) continue; + for (int s = 0; s < predicates->size(); ++s) { + int arc_index = -1; + if (add_labeled_parts) { + // If no unlabeled arc is there, just skip it. + // This happens if that arc was pruned out. + arc_index = semantic_parts->FindArc(p, a, s); + if (0 > arc_index) { + continue; + } + } else { + if (prune_distances) { + int predicate_pos_id = sentence->GetPosId(p); + int argument_pos_id = sentence->GetPosId(a); + if (p < a) { + // Right attachment. + if (a - p > semantic_dictionary->GetMaximumRightDistance + (predicate_pos_id, argument_pos_id)) continue; + } else { + // Left attachment. 
+ if (p - a > semantic_dictionary->GetMaximumLeftDistance + (predicate_pos_id, argument_pos_id)) continue; + } + } + } + + if (prune_labels_with_relation_paths) { + int relation_path_id = sentence->GetRelationPathId(p, a); + allowed_labels.clear(); + if (relation_path_id >= 0 && + relation_path_id < semantic_dictionary-> + GetRelationPathAlphabet().size()) { + allowed_labels = semantic_dictionary-> + GetExistingRolesWithRelationPath(relation_path_id); + //LOG(INFO) << "Path: " << relation_path_id << " Roles: " << allowed_labels.size(); + } + set label_set; + for (int m = 0; m < allowed_labels.size(); ++m) { + if (!prune_labels_with_senses || + (*predicates)[s]->HasRole(allowed_labels[m])) { + label_set.insert(allowed_labels[m]); + } + } + allowed_labels.clear(); + for (set::iterator it = label_set.begin(); + it != label_set.end(); ++it) { + allowed_labels.push_back(*it); + } + if (!add_labeled_parts && allowed_labels.empty()) { + continue; + } + } else if (prune_labels) { + // TODO: allow both kinds of label pruning simultaneously? + int predicate_pos_id = sentence->GetPosId(p); + int argument_pos_id = sentence->GetPosId(a); + allowed_labels.clear(); + allowed_labels = semantic_dictionary-> + GetExistingRoles(predicate_pos_id, argument_pos_id); + set label_set; + for (int m = 0; m < allowed_labels.size(); ++m) { + if (!prune_labels_with_senses || + (*predicates)[s]->HasRole(allowed_labels[m])) { + label_set.insert(allowed_labels[m]); + } + } + allowed_labels.clear(); + for (set::iterator it = label_set.begin(); + it != label_set.end(); ++it) { + allowed_labels.push_back(*it); + } + if (!add_labeled_parts && allowed_labels.empty()) { + continue; + } + } + + // Add parts for labeled/unlabeled arcs. + if (add_labeled_parts) { + // If there is no allowed label for this arc, but the unlabeled arc was added, + // then it was forced to be present for some reason (e.g. to maintain connectivity of the + // graph). 
In that case (which should be pretty rare) consider all the + // possible labels. + if (allowed_labels.empty()) { + allowed_labels.resize(semantic_dictionary->GetRoleAlphabet().size()); + for (int role = 0; role < allowed_labels.size(); ++role) { + allowed_labels[role] = role; + } + } + + for (int m = 0; m < allowed_labels.size(); ++m) { + int role = allowed_labels[m]; + if (prune_labels && prune_labels_with_senses) { + CHECK((*predicates)[s]->HasRole(role)); + } + Part *part = semantic_parts->CreatePartLabeledArc(p, a, s, role); + CHECK_GE(arc_index, 0); + semantic_parts->AddLabeledPart(part, arc_index); + if (make_gold) { + int k = sentence->FindPredicate(p); + int l = sentence->FindArc(p, a); + bool is_gold = false; + + if (k >= 0 && l >= 0) { + int predicate_id = sentence->GetPredicateId(k); + int argument_id = sentence->GetArgumentRoleId(k, l); + if (!use_predicate_senses) { + CHECK_EQ((*predicates)[s]->id(), PREDICATE_UNKNOWN); + } + //if (use_predicate_senses) CHECK_LT(predicate_id, 0); + if ((predicate_id < 0 || + (*predicates)[s]->id() == predicate_id) && + role == argument_id) { + is_gold = true; + } + } + if (is_gold) { + gold_outputs->push_back(1.0); + } else { + gold_outputs->push_back(0.0); + } + } + } + } else { + Part *part = semantic_parts->CreatePartArc(p, a, s); + semantic_parts->AddPart(part); + if (make_gold) { + int k = sentence->FindPredicate(p); + int l = sentence->FindArc(p, a); + bool is_gold = false; + if (k >= 0 && l >= 0) { + int predicate_id = sentence->GetPredicateId(k); + if (!use_predicate_senses) { + CHECK_EQ((*predicates)[s]->id(), PREDICATE_UNKNOWN); + } + if (predicate_id < 0 || (*predicates)[s]->id() == predicate_id) { + is_gold = true; + } + } + if (is_gold) { + gold_outputs->push_back(1.0); + } else { + gold_outputs->push_back(0.0); + } + } + } + } + } + } + + // Compute offsets for labeled/unlabeled arcs. 
+ if (!add_labeled_parts) { + semantic_parts->SetOffsetArc(num_parts_initial, + semantic_parts->size() - num_parts_initial); + } else { + semantic_parts->SetOffsetLabeledArc(num_parts_initial, + semantic_parts->size() - num_parts_initial); + } +} + +void SemanticPipe::MakePartsArbitrarySiblings(Instance *instance, + Parts *parts, + vector *gold_outputs) { + SemanticInstanceNumeric *sentence = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); + SemanticOptions *semantic_options = GetSemanticOptions(); + //bool allow_self_loops = semantic_options->allow_self_loops(); + bool allow_root_predicate = semantic_options->allow_root_predicate(); + bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); + bool use_predicate_senses = semantic_options->use_predicate_senses(); + + // Siblings: (p,s,a1) and (p,s,a2). + for (int p = 0; p < sentence_length; ++p) { + if (p == 0 && !allow_root_predicate) continue; + int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + for (int s = 0; s < predicates->size(); ++s) { + for (int a1 = 1; a1 < sentence_length; ++a1) { + int r1 = semantic_parts->FindArc(p, a1, s); + if (r1 < 0) continue; + for (int a2 = a1 + 1; a2 < sentence_length; ++a2) { + int r2 = semantic_parts->FindArc(p, a2, s); + if (r2 < 0) continue; + Part *part = semantic_parts->CreatePartSibling(p, s, a1, a2); + semantic_parts->AddPart(part); + if (make_gold) { + // Logical AND of the two individual arcs. 
+ gold_outputs->push_back((*gold_outputs)[r1] * (*gold_outputs)[r2]); + } + } + } + } + } +} + +void SemanticPipe::MakePartsLabeledArbitrarySiblings(Instance *instance, + Parts *parts, + vector *gold_outputs) { + SemanticInstanceNumeric *sentence = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); + SemanticOptions *semantic_options = GetSemanticOptions(); + + int offset, size; + semantic_parts->GetOffsetSibling(&offset, &size); + for (int r = offset; r < offset + size; ++r) { + SemanticPartSibling *sibling = + static_cast((*semantic_parts)[r]); + int p = sibling->predicate(); + int s = sibling->sense(); + int a1 = sibling->first_argument(); + int a2 = sibling->second_argument(); + const vector &labeled_first_arc_indices = + semantic_parts->FindLabeledArcs(p, a1, s); + const vector &labeled_second_arc_indices = + semantic_parts->FindLabeledArcs(p, a2, s); + for (int k = 0; k < labeled_first_arc_indices.size(); ++k) { + int r1 = labeled_first_arc_indices[k]; + SemanticPartLabeledArc *first_labeled_arc = + static_cast((*semantic_parts)[r1]); + int first_role = first_labeled_arc->role(); + for (int l = 0; l < labeled_second_arc_indices.size(); ++l) { + int r2 = labeled_second_arc_indices[l]; + SemanticPartLabeledArc *second_labeled_arc = + static_cast((*semantic_parts)[r2]); + int second_role = second_labeled_arc->role(); + // To keep the number of parts manageable, only create parts for: + // - same role (a1 == a2); + // - frequent role pairs. + if (first_role != second_role && + !semantic_dictionary->IsFrequentRolePair(first_role, second_role)) { + continue; + } + Part *part = semantic_parts->CreatePartLabeledSibling(p, s, a1, a2, + first_role, + second_role); + semantic_parts->AddLabeledPart(part, r); + if (make_gold) { + // Logical AND of the two individual labeled arcs. 
+ gold_outputs->push_back((*gold_outputs)[r1] * (*gold_outputs)[r2]); + } + } + } + } +} + +void SemanticPipe::MakePartsConsecutiveSiblings(Instance *instance, + Parts *parts, + vector *gold_outputs) { + SemanticInstanceNumeric *sentence = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); + SemanticOptions *semantic_options = GetSemanticOptions(); + //bool allow_self_loops = semantic_options->allow_self_loops(); + bool allow_root_predicate = semantic_options->allow_root_predicate(); + bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); + bool use_predicate_senses = semantic_options->use_predicate_senses(); + + // Find the predicate parts (necessary to identify the gold predicate senses). + // TODO: Replace this by semantic_parts->GetPredicateSenses(p) or something? + int offset_predicate_parts, num_predicate_parts; + semantic_parts->GetOffsetPredicate(&offset_predicate_parts, + &num_predicate_parts); + vector > predicate_part_indices(sentence_length); + for (int r = 0; r < num_predicate_parts; ++r) { + SemanticPartPredicate* predicate_part = + static_cast((*parts)[offset_predicate_parts + r]); + predicate_part_indices[predicate_part->predicate()]. + push_back(offset_predicate_parts + r); + } + + // Consecutive siblings: (p,s,a1) and (p,s,a2). 
+ for (int p = 0; p < sentence_length; ++p) { + if (p == 0 && !allow_root_predicate) continue; + int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + //const vector &senses = semantic_parts->GetSenses(p); + //CHECK_EQ(senses.size(), predicates->size()); + for (int s = 0; s < predicates->size(); ++s) { + bool sense_active; + bool first_arc_active; + bool second_arc_active = false; + bool arc_between; + + // Check if this is the correct sense. + if (make_gold) { + //int r = senses[s]; + int r = -1; + for (int k = 0; k < predicate_part_indices[p].size(); ++k) { + r = predicate_part_indices[p][k]; + SemanticPartPredicate* predicate_part = + static_cast((*parts)[r]); + if (predicate_part->sense() == s) break; + } + CHECK_GE(r, 0); + if (r >= 0 && NEARLY_EQ_TOL((*gold_outputs)[r], 1.0, 1e-9)) { + sense_active = true; + } else { + sense_active = false; + } + } + + // Right side. + // Allow self loops (a1 = p). We use a1 = p-1 to denote the special case + // in which a2 is the first argument. + for (int a1 = p - 1; a1 < sentence_length; ++a1) { + int r1 = -1; + if (a1 >= p) { + r1 = semantic_parts->FindArc(p, a1, s); + if (r1 < 0) continue; + } + + if (make_gold) { + // Check if the first arc is active. + if (a1 < p || NEARLY_EQ_TOL((*gold_outputs)[r1], 1.0, 1e-9)) { + first_arc_active = true; + } else { + first_arc_active = false; + } + arc_between = false; + } + + for (int a2 = a1 + 1; a2 <= sentence_length; ++a2) { + int r2 = -1; + if (a2 < sentence_length) { + r2 = semantic_parts->FindArc(p, a2, s); + if (r2 < 0) continue; + } + if (make_gold) { + // Check if the second arc is active. 
+ if (a2 == sentence_length || + NEARLY_EQ_TOL((*gold_outputs)[r2], 1.0, 1e-9)) { + second_arc_active = true; + } else { + second_arc_active = false; + } + } + + Part *part = (a1 >= p) ? + semantic_parts->CreatePartConsecutiveSibling(p, s, a1, a2) : + semantic_parts->CreatePartConsecutiveSibling(p, s, -1, a2); + semantic_parts->AddPart(part); + + if (make_gold) { + double value = 0.0; + if (sense_active && first_arc_active && second_arc_active && + !arc_between) { + value = 1.0; + arc_between = true; + } + gold_outputs->push_back(value); + } + } + } + + // Left side. + // NOTE: Self loops (a1 = p) are disabled on the left side, to prevent + // having repeated parts. We use a1 = p+1 to denote the special case + // in which a2 is the first argument. + for (int a1 = p + 1; a1 >= 0; --a1) { + int r1 = -1; + if (a1 <= p) { + r1 = semantic_parts->FindArc(p, a1, s); + if (r1 < 0) continue; + } + if (a1 == p) continue; // See NOTE above. + + if (make_gold) { + // Check if the first arc is active. + if (a1 > p || NEARLY_EQ_TOL((*gold_outputs)[r1], 1.0, 1e-9)) { + first_arc_active = true; + } else { + first_arc_active = false; + } + arc_between = false; + } + + for (int a2 = a1 - 1; a2 >= -1; --a2) { + int r2 = -1; + if (a2 > -1) { + r2 = semantic_parts->FindArc(p, a2, s); + if (r2 < 0) continue; + } + if (a2 == p) continue; // See NOTE above. + + if (make_gold) { + // Check if the second arc is active. + if (a2 == -1 || + NEARLY_EQ_TOL((*gold_outputs)[r2], 1.0, 1e-9)) { + second_arc_active = true; + } else { + second_arc_active = false; + } + } + + Part *part = (a1 <= p) ? 
+ semantic_parts->CreatePartConsecutiveSibling(p, s, a1, a2) : + semantic_parts->CreatePartConsecutiveSibling(p, s, -1, a2); + semantic_parts->AddPart(part); + + if (make_gold) { + double value = 0.0; + if (sense_active && first_arc_active && second_arc_active && + !arc_between) { + value = 1.0; + arc_between = true; + } + gold_outputs->push_back(value); + } + } + } + } + } +} + +void SemanticPipe::MakePartsGrandparents(Instance *instance, + Parts *parts, + vector *gold_outputs) { + SemanticInstanceNumeric *sentence = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); + SemanticOptions *semantic_options = GetSemanticOptions(); + //bool allow_self_loops = semantic_options->allow_self_loops(); + bool allow_root_predicate = semantic_options->allow_root_predicate(); + bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); + bool use_predicate_senses = semantic_options->use_predicate_senses(); + + // Grandparents: (g,t,p) and (p,s,a). 
+ for (int g = 0; g < sentence_length; ++g) { + if (g == 0 && !allow_root_predicate) continue; + int lemma_id_g = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id_g = sentence->GetLemmaId(g); + CHECK_GE(lemma_id_g, 0); + } + const vector *predicates_g = + &semantic_dictionary->GetLemmaPredicates(lemma_id_g); + if (predicates_g->size() == 0 && allow_unseen_predicates) { + predicates_g = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + for (int t = 0; t < predicates_g->size(); ++t) { + for (int p = 1; p < sentence_length; ++p) { + int r1 = semantic_parts->FindArc(g, p, t); + if (r1 < 0) continue; + int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + for (int s = 0; s < predicates->size(); ++s) { + for (int a = 1; a < sentence_length; ++a) { + int r2 = semantic_parts->FindArc(p, a, s); + if (r2 < 0) continue; + Part *part = semantic_parts->CreatePartGrandparent(g, t, p, s, a); + semantic_parts->AddPart(part); + if (make_gold) { + // Logical AND of the two individual arcs. 
+ gold_outputs->push_back((*gold_outputs)[r1] * (*gold_outputs)[r2]); + } + } + } + } + } + } +} + +void SemanticPipe::MakePartsCoparents(Instance *instance, + Parts *parts, + vector *gold_outputs) { + SemanticInstanceNumeric *sentence = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); + SemanticOptions *semantic_options = GetSemanticOptions(); + //bool allow_self_loops = semantic_options->allow_self_loops(); + bool allow_root_predicate = semantic_options->allow_root_predicate(); + bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); + bool use_predicate_senses = semantic_options->use_predicate_senses(); + + // Co-parents: (p1,s1,a) and (p2,s2,a). + // First predicate. + for (int p1 = 0; p1 < sentence_length; ++p1) { + if (p1 == 0 && !allow_root_predicate) continue; + int lemma_id_p1 = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id_p1 = sentence->GetLemmaId(p1); + CHECK_GE(lemma_id_p1, 0); + } + const vector *predicates_p1 = + &semantic_dictionary->GetLemmaPredicates(lemma_id_p1); + if (predicates_p1->size() == 0 && allow_unseen_predicates) { + predicates_p1 = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + for (int s1 = 0; s1 < predicates_p1->size(); ++s1) { + // Second predicate. + for (int p2 = p1 + 1; p2 < sentence_length; ++p2) { + int lemma_id_p2 = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id_p2 = sentence->GetLemmaId(p2); + CHECK_GE(lemma_id_p2, 0); + } + const vector *predicates_p2 = + &semantic_dictionary->GetLemmaPredicates(lemma_id_p2); + if (predicates_p2->size() == 0 && allow_unseen_predicates) { + predicates_p2 = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + for (int s2 = 0; s2 < predicates_p2->size(); ++s2) { + // Common argument. 
+ for (int a = 1; a < sentence_length; ++a) { + int r1 = semantic_parts->FindArc(p1, a, s1); + if (r1 < 0) continue; + int r2 = semantic_parts->FindArc(p2, a, s2); + if (r2 < 0) continue; + Part *part = semantic_parts->CreatePartCoparent(p1, s1, p2, s2, a); + semantic_parts->AddPart(part); + if (make_gold) { + // Logical AND of the two individual arcs. + gold_outputs->push_back((*gold_outputs)[r1] * (*gold_outputs)[r2]); + } + } + } + } + } + } +} + +void SemanticPipe::MakePartsConsecutiveCoparents(Instance *instance, + Parts *parts, + vector *gold_outputs) { + SemanticInstanceNumeric *sentence = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + int sentence_length = sentence->size(); + bool make_gold = (gold_outputs != NULL); + SemanticDictionary *semantic_dictionary = GetSemanticDictionary(); + SemanticOptions *semantic_options = GetSemanticOptions(); + //bool allow_self_loops = semantic_options->allow_self_loops(); + bool allow_root_predicate = semantic_options->allow_root_predicate(); + bool allow_unseen_predicates = semantic_options->allow_unseen_predicates(); + bool use_predicate_senses = semantic_options->use_predicate_senses(); + + // Consecutive co-parents: (p1,s1,a) and (p2,s2,a). + for (int a = 1; a < sentence_length; ++a) { + bool first_arc_active; + bool second_arc_active = false; + bool arc_between; + + // Right side. + // Allow self loops (p1 = a). We use p1 = a-1 to denote the special case + // in which p2 is the first predicate. + for (int p1 = a - 1; p1 < sentence_length; ++p1) { + int num_senses1; + if (p1 < a) { + // If p1 = a-1, pretend there is a single sense (s1=0). + num_senses1 = 1; + } else { + //const vector &senses = semantic_parts->GetSenses(p); + //CHECK_EQ(senses.size(), predicates->size()); + if (p1 == 0 && !allow_root_predicate) continue; // Never happens. 
+ int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p1); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + num_senses1 = predicates->size(); + } + + for (int s1 = 0; s1 < num_senses1; ++s1) { + int r1 = -1; + if (p1 >= a) { + r1 = semantic_parts->FindArc(p1, a, s1); + if (r1 < 0) continue; + } + + if (make_gold) { + // Check if the first arc is active. + if (p1 < a || NEARLY_EQ_TOL((*gold_outputs)[r1], 1.0, 1e-9)) { + first_arc_active = true; + } else { + first_arc_active = false; + } + arc_between = false; + } + + for (int p2 = p1 + 1; p2 <= sentence_length; ++p2) { + int num_senses2; + if (p2 == sentence_length) { + // If p2 = sentence_length, pretend there is a single sense (s2=0). + num_senses2 = 1; + } else { + //const vector &senses = semantic_parts->GetSenses(p); + //CHECK_EQ(senses.size(), predicates->size()); + if (p2 == 0 && !allow_root_predicate) continue; // Never happens. + int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p2); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + num_senses2 = predicates->size(); + } + + for (int s2 = 0; s2 < num_senses2; ++s2) { + int r2 = -1; + if (p2 < sentence_length) { + r2 = semantic_parts->FindArc(p2, a, s2); + if (r2 < 0) continue; + } + if (make_gold) { + // Check if the second arc is active. + if (p2 == sentence_length || + NEARLY_EQ_TOL((*gold_outputs)[r2], 1.0, 1e-9)) { + second_arc_active = true; + } else { + second_arc_active = false; + } + } + + Part *part = (p1 >= a) ? 
+ semantic_parts->CreatePartConsecutiveCoparent(p1, s1, p2, s2, a) : + semantic_parts->CreatePartConsecutiveCoparent(-1, 0, p2, s2, a); + semantic_parts->AddPart(part); + + if (make_gold) { + double value = 0.0; + if (first_arc_active && second_arc_active && !arc_between) { + value = 1.0; + arc_between = true; + } + gold_outputs->push_back(value); + } + } + } + } + } + + // Left side. + // NOTE: Self loops (p1 = a) are disabled on the left side, to prevent + // having repeated parts. We use p1 = a+1 to denote the special case + // in which p2 is the first predicate. + for (int p1 = a + 1; p1 >= 0; --p1) { + int num_senses1; + if (p1 > a) { + // If p1 = a+1, pretend there is a single sense (s1=0). + num_senses1 = 1; + } else if (p1 == a) { // See NOTE above. + continue; + } else { + //const vector &senses = semantic_parts->GetSenses(p); + //CHECK_EQ(senses.size(), predicates->size()); + if (p1 == 0 && !allow_root_predicate) continue; + int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p1); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + num_senses1 = predicates->size(); + } + + for (int s1 = 0; s1 < num_senses1; ++s1) { + int r1 = -1; + if (p1 <= a) { + r1 = semantic_parts->FindArc(p1, a, s1); + if (r1 < 0) continue; + } + if (p1 == a) continue; // See NOTE above. + + if (make_gold) { + // Check if the first arc is active. + if (p1 > a || NEARLY_EQ_TOL((*gold_outputs)[r1], 1.0, 1e-9)) { + first_arc_active = true; + } else { + first_arc_active = false; + } + arc_between = false; + } + + for (int p2 = p1 - 1; p2 >= -1; --p2) { + int num_senses2; + if (p2 == -1) { + // If p2 = -1, pretend there is a single sense (s2=0). + num_senses2 = 1; + } else if (p2 == a) { // See NOTE above. 
+ continue; + } else { + //const vector &senses = semantic_parts->GetSenses(p); + //CHECK_EQ(senses.size(), predicates->size()); + if (p2 == 0 && !allow_root_predicate) continue; + int lemma_id = TOKEN_UNKNOWN; + if (use_predicate_senses) { + lemma_id = sentence->GetLemmaId(p2); + CHECK_GE(lemma_id, 0); + } + const vector *predicates = + &semantic_dictionary->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && allow_unseen_predicates) { + predicates = &semantic_dictionary->GetLemmaPredicates(TOKEN_UNKNOWN); + } + num_senses2 = predicates->size(); + } + + for (int s2 = 0; s2 < num_senses2; ++s2) { + int r2 = -1; + if (p2 > -1) { + r2 = semantic_parts->FindArc(p2, a, s2); + if (r2 < 0) continue; + } + if (p2 == a) continue; // See NOTE above. + + if (make_gold) { + // Check if the second arc is active. + if (p2 == -1 || + NEARLY_EQ_TOL((*gold_outputs)[r2], 1.0, 1e-9)) { + second_arc_active = true; + } else { + second_arc_active = false; + } + } + + Part *part = (p1 <= a) ? + semantic_parts->CreatePartConsecutiveCoparent(p1, s1, p2, s2, a) : + semantic_parts->CreatePartConsecutiveCoparent(-1, 0, p2, s2, a); + semantic_parts->AddPart(part); + + if (make_gold) { + double value = 0.0; + if (first_arc_active && second_arc_active && !arc_between) { + value = 1.0; + arc_between = true; + } + gold_outputs->push_back(value); + } + } + } + } + } + } +} + +void SemanticPipe::MakePartsGlobal(Instance *instance, + Parts *parts, + vector *gold_outputs) { + SemanticOptions *semantic_options = GetSemanticOptions(); + SemanticParts *semantic_parts = static_cast(parts); + + int num_parts_initial = semantic_parts->size(); + if (semantic_options->use_arbitrary_siblings()) { + MakePartsArbitrarySiblings(instance, parts, gold_outputs); + } + semantic_parts->SetOffsetSibling(num_parts_initial, + semantic_parts->size() - num_parts_initial); + //LOG(INFO) << "Num siblings: " << semantic_parts->size() - num_parts_initial; + + num_parts_initial = semantic_parts->size(); + if 
(semantic_options->use_arbitrary_siblings() && + FLAGS_use_labeled_sibling_features) { + MakePartsLabeledArbitrarySiblings(instance, parts, gold_outputs); + } + semantic_parts->SetOffsetLabeledSibling( + num_parts_initial, semantic_parts->size() - num_parts_initial); + //LOG(INFO) << "Num labeled siblings: " << semantic_parts->size() - num_parts_initial; + + num_parts_initial = semantic_parts->size(); + if (semantic_options->use_consecutive_siblings()) { + MakePartsConsecutiveSiblings(instance, parts, gold_outputs); + } + semantic_parts->SetOffsetConsecutiveSibling(num_parts_initial, + semantic_parts->size() - num_parts_initial); + + num_parts_initial = semantic_parts->size(); + if (semantic_options->use_grandparents()) { + MakePartsGrandparents(instance, parts, gold_outputs); + } + semantic_parts->SetOffsetGrandparent(num_parts_initial, + semantic_parts->size() - num_parts_initial); + + num_parts_initial = semantic_parts->size(); + if (semantic_options->use_coparents()) { + MakePartsCoparents(instance, parts, gold_outputs); + } + semantic_parts->SetOffsetCoparent(num_parts_initial, + semantic_parts->size() - num_parts_initial); + + num_parts_initial = semantic_parts->size(); + if (semantic_options->use_consecutive_coparents()) { + MakePartsConsecutiveCoparents(instance, parts, gold_outputs); + } + semantic_parts->SetOffsetConsecutiveCoparent(num_parts_initial, + semantic_parts->size() - num_parts_initial); + +#if 0 + num_parts_initial = semantic_parts->size(); + if (semantic_options->use_grandsiblings()) { + MakePartsGrandSiblings(instance, parts, gold_outputs); + } + semantic_parts->SetOffsetGrandSiblings(num_parts_initial, + semantic_parts->size() - num_parts_initial); + + num_parts_initial = semantic_parts->size(); + if (semantic_options->use_trisiblings()) { + MakePartsTriSiblings(instance, parts, gold_outputs); + } + semantic_parts->SetOffsetTriSiblings(num_parts_initial, + semantic_parts->size() - num_parts_initial); +#endif +} + +void 
SemanticPipe::MakeSelectedFeatures(Instance *instance, + Parts *parts, + bool pruner, + const vector& selected_parts, + Features *features) { + SemanticInstanceNumeric *sentence = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + SemanticFeatures *semantic_features = + static_cast(features); + int sentence_length = sentence->size(); + + semantic_features->Initialize(instance, parts); + + // Build features for predicates. + int offset, size; + semantic_parts->GetOffsetPredicate(&offset, &size); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartPredicate *predicate_part = + static_cast((*semantic_parts)[r]); + // Get the predicate id for this part. + // TODO(atm): store this somewhere, so that we don't need to recompute this + // all the time. + int lemma_id = TOKEN_UNKNOWN; + if (GetSemanticOptions()->use_predicate_senses()) { + lemma_id = sentence->GetLemmaId(predicate_part->predicate()); + } + const vector *predicates = + &GetSemanticDictionary()->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && + GetSemanticOptions()->allow_unseen_predicates()) { + predicates = &GetSemanticDictionary()->GetLemmaPredicates(TOKEN_UNKNOWN); + } + int predicate_id = (*predicates)[predicate_part->sense()]->id(); + // Add the predicate features. + semantic_features->AddPredicateFeatures(sentence, r, + predicate_part->predicate(), + predicate_id); + } + + // Even in the case of labeled parsing, build features for unlabeled arcs + // only. They will later be conjoined with the labels. + semantic_parts->GetOffsetArc(&offset, &size); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartArc *arc = + static_cast((*semantic_parts)[r]); + // Get the predicate id for this part. + // TODO(atm): store this somewhere, so that we don't need to recompute this + // all the time. Maybe store this directly in arc->sense()? 
+ int lemma_id = TOKEN_UNKNOWN; + if (GetSemanticOptions()->use_predicate_senses()) { + lemma_id = sentence->GetLemmaId(arc->predicate()); + } + const vector *predicates = + &GetSemanticDictionary()->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && + GetSemanticOptions()->allow_unseen_predicates()) { + predicates = &GetSemanticDictionary()->GetLemmaPredicates(TOKEN_UNKNOWN); + } + int predicate_id = (*predicates)[arc->sense()]->id(); + if (!pruner && GetSemanticOptions()->labeled()) { + semantic_features->AddLabeledArcFeatures(sentence, r, arc->predicate(), + arc->argument(), predicate_id); + if (!FLAGS_use_only_labeled_arc_features) { + semantic_features->AddArcFeatures(sentence, r, arc->predicate(), + arc->argument(), predicate_id); + } + } else { + semantic_features->AddArcFeatures(sentence, r, arc->predicate(), + arc->argument(), predicate_id); + } + } + + // Build features for arbitrary siblings. + semantic_parts->GetOffsetSibling(&offset, &size); + if (pruner) CHECK_EQ(size, 0); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartSibling *part = + static_cast((*semantic_parts)[r]); + CHECK_EQ(part->type(), SEMANTICPART_SIBLING); + if (FLAGS_use_labeled_sibling_features) { + semantic_features-> + AddArbitraryLabeledSiblingFeatures(sentence, r, + part->predicate(), + part->sense(), + part->first_argument(), + part->second_argument()); + if (!FLAGS_use_only_labeled_sibling_features) { + semantic_features->AddArbitrarySiblingFeatures(sentence, r, + part->predicate(), + part->sense(), + part->first_argument(), + part->second_argument()); + } + } else { + semantic_features->AddArbitrarySiblingFeatures(sentence, r, + part->predicate(), + part->sense(), + part->first_argument(), + part->second_argument()); + } + } + + // Build features for consecutive siblings. 
+ semantic_parts->GetOffsetConsecutiveSibling(&offset, &size); + if (pruner) CHECK_EQ(size, 0); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartConsecutiveSibling *part = + static_cast((*semantic_parts)[r]); + CHECK_EQ(part->type(), SEMANTICPART_CONSECUTIVESIBLING); + semantic_features->AddConsecutiveSiblingFeatures( + sentence, r, + part->predicate(), + part->sense(), + part->first_argument(), + part->second_argument()); + } + + // Build features for grandparents. + semantic_parts->GetOffsetGrandparent(&offset, &size); + if (pruner) CHECK_EQ(size, 0); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartGrandparent *part = + static_cast((*semantic_parts)[r]); + CHECK_EQ(part->type(), SEMANTICPART_GRANDPARENT); + semantic_features->AddGrandparentFeatures(sentence, r, + part->grandparent_predicate(), + part->grandparent_sense(), + part->predicate(), + part->sense(), + part->argument()); + } + + // Build features for co-parents. + semantic_parts->GetOffsetCoparent(&offset, &size); + if (pruner) CHECK_EQ(size, 0); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartCoparent *part = + static_cast((*semantic_parts)[r]); + CHECK_EQ(part->type(), SEMANTICPART_COPARENT); + semantic_features->AddCoparentFeatures(sentence, r, + part->first_predicate(), + part->first_sense(), + part->second_predicate(), + part->second_sense(), + part->argument()); + } + + // Build features for consecutive co-parents. 
+ semantic_parts->GetOffsetConsecutiveCoparent(&offset, &size); + if (pruner) CHECK_EQ(size, 0); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartConsecutiveCoparent *part = + static_cast((*semantic_parts)[r]); + CHECK_EQ(part->type(), SEMANTICPART_CONSECUTIVECOPARENT); + semantic_features->AddConsecutiveCoparentFeatures( + sentence, r, + part->first_predicate(), + part->first_sense(), + part->second_predicate(), + part->second_sense(), + part->argument()); + } + +#if 0 + // Build features for grand-siblings. + dependency_parts->GetOffsetGrandSibl(&offset, &size); + if (pruner) CHECK_EQ(size, 0); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartGrandSibl *part = + static_cast((*dependency_parts)[r]); + CHECK_EQ(part->type(), DEPENDENCYPART_GRANDSIBL); + CHECK_LE(part->modifier(), sentence_length); + CHECK_LE(part->sibling(), sentence_length); + dependency_features->AddGrandSiblingFeatures(sentence, r, + part->grandparent(), + part->head(), + part->modifier(), + part->sibling()); + } + + // Build features for tri-siblings. + dependency_parts->GetOffsetTriSibl(&offset, &size); + if (pruner) CHECK_EQ(size, 0); + for (int r = offset; r < offset + size; ++r) { + if (!selected_parts[r]) continue; + SemanticPartTriSibl *part = + static_cast((*dependency_parts)[r]); + CHECK_EQ(part->type(), DEPENDENCYPART_TRISIBL); + dependency_features->AddTriSiblingFeatures(sentence, r, + part->head(), + part->modifier(), + part->sibling(), + part->other_sibling()); + } + +#endif +} + +// Prune basic parts (arcs and labeled arcs) using a first-order model. +// The vectors of basic parts is given as input, and those elements that are +// to be pruned are deleted from the vector. +// If gold_outputs is not NULL that vector will also be pruned. 
+void SemanticPipe::Prune(Instance *instance, Parts *parts, + vector *gold_outputs, + bool preserve_gold) { + SemanticParts *semantic_parts = static_cast(parts); + Features *features = CreateFeatures(); + vector scores; + vector predicted_outputs; + + // Make sure gold parts are only preserved at training time. + CHECK(!preserve_gold || options_->train()); + + MakeFeatures(instance, parts, true, features); + ComputeScores(instance, parts, features, true, &scores); + GetSemanticDecoder()->DecodePruner(instance, parts, scores, + &predicted_outputs); + + int offset_predicate_parts, num_predicate_parts; + int offset_arcs, num_arcs; + semantic_parts->GetOffsetPredicate(&offset_predicate_parts, + &num_predicate_parts); + semantic_parts->GetOffsetArc(&offset_arcs, &num_arcs); + + double threshold = 0.5; + int r0 = offset_arcs; // Preserve all the predicate parts. + semantic_parts->ClearOffsets(); + semantic_parts->SetOffsetPredicate(offset_predicate_parts, + num_predicate_parts); + for (int r = 0; r < num_arcs; ++r) { + // Preserve gold parts (at training time). 
+ if (predicted_outputs[offset_arcs + r] >= threshold || + (preserve_gold && (*gold_outputs)[offset_arcs + r] >= threshold)) { + (*parts)[r0] = (*parts)[offset_arcs + r]; + semantic_parts-> + SetLabeledParts(r0, semantic_parts->GetLabeledParts(offset_arcs + r)); + if (gold_outputs) { + (*gold_outputs)[r0] = (*gold_outputs)[offset_arcs + r]; + } + ++r0; + } else { + delete (*parts)[offset_arcs + r]; + } + } + + if (gold_outputs) gold_outputs->resize(r0); + semantic_parts->Resize(r0); + semantic_parts->DeleteIndices(); + semantic_parts->SetOffsetArc(offset_arcs, + parts->size() - offset_arcs); + + delete features; +} + +void SemanticPipe::LabelInstance(Parts *parts, + const vector &output, + Instance *instance) { + SemanticParts *semantic_parts = static_cast(parts); + SemanticInstance *semantic_instance = + static_cast(instance); + SemanticDictionary *semantic_dictionary = + static_cast(dictionary_); + //bool allow_root_predicate = GetSemanticOptions()->allow_root_predicate(); + int instance_length = semantic_instance->size(); + double threshold = 0.5; + semantic_instance->ClearPredicates(); + for (int p = 0; p < instance_length; ++p) { + //if (p == 0 && !allow_root_predicate) continue; + const vector &senses = semantic_parts->GetSenses(p); + vector argument_indices; + vector argument_roles; + int predicted_sense = -1; + for (int k = 0; k < senses.size(); k++) { + int s = senses[k]; + for (int a = 1; a < instance_length; ++a) { + if (GetSemanticOptions()->labeled()) { + int r = semantic_parts->FindArc(p, a, s); + if (r < 0) continue; + const vector &labeled_arcs = + semantic_parts->FindLabeledArcs(p, a, s); + for (int l = 0; l < labeled_arcs.size(); ++l) { + int r = labeled_arcs[l]; + if (output[r] > threshold) { + if (predicted_sense != s) { + CHECK_LT(predicted_sense, 0); + predicted_sense = s; + } + argument_indices.push_back(a); + SemanticPartLabeledArc *labeled_arc = + static_cast((*parts)[r]); + string role = + 
semantic_dictionary->GetRoleName(labeled_arc->role()); + argument_roles.push_back(role); + } + } + } else { + int r = semantic_parts->FindArc(p, a, s); + if (r < 0) continue; + if (output[r] > threshold) { + if (predicted_sense != s) { + CHECK_LT(predicted_sense, 0); + predicted_sense = s; + } + argument_indices.push_back(a); + argument_roles.push_back("ARG"); + } + } + } + } + + if (predicted_sense >= 0) { + int s = predicted_sense; + // Get the predicate id for this part. + // TODO(atm): store this somewhere, so that we don't need to recompute this + // all the time. Maybe store this directly in arc->sense()? + int lemma_id = TOKEN_UNKNOWN; + if (GetSemanticOptions()->use_predicate_senses()) { + lemma_id = semantic_dictionary->GetTokenDictionary()-> + GetLemmaId(semantic_instance->GetLemma(p)); + if (lemma_id < 0) lemma_id = TOKEN_UNKNOWN; + } + const vector *predicates = + &GetSemanticDictionary()->GetLemmaPredicates(lemma_id); + if (predicates->size() == 0 && + GetSemanticOptions()->allow_unseen_predicates()) { + predicates = &GetSemanticDictionary()->GetLemmaPredicates(TOKEN_UNKNOWN); + } + int predicate_id = (*predicates)[s]->id(); + string predicate_name = + semantic_dictionary->GetPredicateName(predicate_id); + semantic_instance->AddPredicate(predicate_name, p, argument_roles, + argument_indices); + } + } +} diff --git a/src/semantic_parser/SemanticPipe.h b/src/semantic_parser/SemanticPipe.h index 10d8888..b5db4e5 100644 --- a/src/semantic_parser/SemanticPipe.h +++ b/src/semantic_parser/SemanticPipe.h @@ -1,410 +1,409 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. 
-// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef SEMANTICPIPE_H_ -#define SEMANTICPIPE_H_ - -#include "Pipe.h" -#include "SemanticOptions.h" -#include "SemanticReader.h" -#include "SemanticDictionary.h" -#include "TokenDictionary.h" -#include "SemanticInstanceNumeric.h" -#include "SemanticWriter.h" -#include "SemanticPart.h" -#include "SemanticFeatures.h" -#include "SemanticDecoder.h" - -class SemanticPipe : public Pipe { -public: - SemanticPipe(Options* options) : Pipe(options) { - token_dictionary_ = NULL; - dependency_dictionary_ = NULL; - pruner_parameters_ = NULL; - train_pruner_ = false; - } - virtual ~SemanticPipe() { - delete token_dictionary_; - delete dependency_dictionary_; - delete pruner_parameters_; - } - - SemanticReader *GetSemanticReader() { - return static_cast(reader_); - } - SemanticDictionary *GetSemanticDictionary() { - return static_cast(dictionary_); - } - SemanticDecoder *GetSemanticDecoder() { - return static_cast(decoder_); - } - SemanticOptions *GetSemanticOptions() { - return static_cast(options_); - } - - void Initialize() { - Pipe::Initialize(); - pruner_parameters_ = new Parameters; - } - - void SetPrunerParameters(Parameters *pruner_parameters) { - pruner_parameters_ = pruner_parameters; - } - void LoadPrunerModelFile() { - LoadPrunerModelByName(GetSemanticOptions()->GetPrunerModelFilePath()); - } - -protected: - void CreateDictionary() { - dictionary_ = new SemanticDictionary(this); - GetSemanticDictionary()->SetTokenDictionary(token_dictionary_); - GetSemanticDictionary()->SetDependencyDictionary(dependency_dictionary_); - } - void CreateReader() { - reader_ = new 
SemanticReader(options_); - } - void CreateWriter() { - writer_ = new SemanticWriter(options_); - } - void CreateDecoder() { decoder_ = new SemanticDecoder(this); } - Parts *CreateParts() { return new SemanticParts; } - Features *CreateFeatures() { return new SemanticFeatures(this); } - - void CreateTokenDictionary() { - token_dictionary_ = new TokenDictionary(this); - } - - void CreateDependencyDictionary() { - dependency_dictionary_ = new DependencyDictionary(this); - } - - Parameters *GetTrainingParameters() { - if (train_pruner_) return pruner_parameters_; - return parameters_; - } - - void PreprocessData(); - - Instance *GetFormattedInstance(Instance *instance) { - SemanticInstanceNumeric *instance_numeric = - new SemanticInstanceNumeric; - instance_numeric->Initialize(*GetSemanticDictionary(), - static_cast(instance)); - return instance_numeric; - } - - void SaveModel(FILE* fs); - void LoadModel(FILE* fs); - - void LoadPrunerModel(FILE* fs); - void LoadPrunerModelByName(const string &model_name); - - void MakeParts(Instance *instance, Parts *parts, - vector *gold_outputs); - void MakePartsBasic(Instance *instance, Parts *parts, - vector *gold_outputs); - void MakePartsBasic(Instance *instance, bool add_labeled_parts, Parts *parts, - vector *gold_outputs); - void MakePartsGlobal(Instance *instance, Parts *parts, - vector *gold_outputs); - void MakePartsArbitrarySiblings(Instance *instance, - Parts *parts, - vector *gold_outputs); - void MakePartsLabeledArbitrarySiblings(Instance *instance, - Parts *parts, - vector *gold_outputs); - void MakePartsConsecutiveSiblings(Instance *instance, - Parts *parts, - vector *gold_outputs); - void MakePartsGrandparents(Instance *instance, - Parts *parts, - vector *gold_outputs); - void MakePartsCoparents(Instance *instance, - Parts *parts, - vector *gold_outputs); - void MakePartsConsecutiveCoparents(Instance *instance, - Parts *parts, - vector *gold_outputs); - - void MakeFeatures(Instance *instance, Parts *parts, bool 
pruner, - Features *features) { - vector selected_parts(parts->size(), true); - MakeSelectedFeatures(instance, parts, pruner, selected_parts, features); - } - void MakeSelectedFeatures(Instance *instance, Parts *parts, - const vector& selected_parts, Features *features) { - // Set pruner = false unless we're training the pruner. - MakeSelectedFeatures(instance, parts, train_pruner_, selected_parts, - features); - } - void MakeSelectedFeatures(Instance *instance, - Parts *parts, - bool pruner, - const vector& selected_parts, - Features *features); - - void ComputeScores(Instance *instance, Parts *parts, Features *features, - vector *scores) { - // Set pruner = false unless we're training the pruner. - ComputeScores(instance, parts, features, train_pruner_, scores); - } - void ComputeScores(Instance *instance, Parts *parts, Features *features, - bool pruner, vector *scores); - - void RemoveUnsupportedFeatures(Instance *instance, Parts *parts, - bool pruner, - const vector &selected_parts, - Features *features); - - void RemoveUnsupportedFeatures(Instance *instance, Parts *parts, - const vector &selected_parts, - Features *features) { - // Set pruner = false unless we're training the pruner. 
- RemoveUnsupportedFeatures(instance, parts, train_pruner_, selected_parts, - features); - } - - void MakeFeatureDifference(Parts *parts, - Features *features, - const vector &gold_output, - const vector &predicted_output, - FeatureVector *difference); - - void MakeGradientStep(Parts *parts, - Features *features, - double eta, - int iteration, - const vector &gold_output, - const vector &predicted_output); - - void TouchParameters(Parts *parts, Features *features, - const vector &selected_parts); - - void LabelInstance(Parts *parts, const vector &output, - Instance *instance); - - void Prune(Instance *instance, Parts *parts, vector *gold_outputs, - bool preserve_gold); - - virtual void BeginEvaluation() { - num_predicted_unlabeled_arcs_ = 0; - num_gold_unlabeled_arcs_ = 0; - num_matched_unlabeled_arcs_ = 0; - num_tokens_ = 0; - num_unlabeled_arcs_after_pruning_ = 0; - num_pruned_gold_unlabeled_arcs_ = 0; - num_possible_unlabeled_arcs_ = 0; - num_predicted_labeled_arcs_ = 0; - num_gold_labeled_arcs_ = 0; - num_matched_labeled_arcs_ = 0; - num_labeled_arcs_after_pruning_ = 0; - num_pruned_gold_labeled_arcs_ = 0; - num_possible_labeled_arcs_ = 0; - gettimeofday(&start_clock_, NULL); - } - virtual void EvaluateInstance(Instance *instance, - Instance *output_instance, - Parts *parts, - const vector &gold_outputs, - const vector &predicted_outputs) { - int num_possible_unlabeled_arcs = 0; - int num_possible_labeled_arcs = 0; - int num_gold_unlabeled_arcs = 0; - int num_gold_labeled_arcs = 0; - SemanticInstance *semantic_instance = - static_cast(instance); - SemanticParts *semantic_parts = static_cast(parts); - for (int p = 0; p < semantic_instance->size(); ++p) { - const vector &senses = semantic_parts->GetSenses(p); - for (int a = 1; a < semantic_instance->size(); ++a) { - for (int k = 0; k < senses.size(); ++k) { - int s = senses[k]; - int r = semantic_parts->FindArc(p, a, s); - if (r < 0) continue; - ++num_possible_unlabeled_arcs; - if (gold_outputs[r] >= 0.5) { - 
CHECK_EQ(gold_outputs[r], 1.0); - if (NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { - ++num_matched_unlabeled_arcs_; - } - ++num_gold_unlabeled_arcs; - } - if (predicted_outputs[r] >= 0.5) { - CHECK_EQ(predicted_outputs[r], 1.0); - ++num_predicted_unlabeled_arcs_; - - //LOG(INFO) << semantic_instance->GetForm(a) - // << " <-- " - // << semantic_instance->GetForm(p); - } - if (GetSemanticOptions()->labeled()) { - const vector &labeled_arcs = - semantic_parts->FindLabeledArcs(p, a, s); - for (int k = 0; k < labeled_arcs.size(); ++k) { - int r = labeled_arcs[k]; - if (r < 0) continue; - ++num_possible_labeled_arcs; - if (gold_outputs[r] >= 0.5) { - CHECK_EQ(gold_outputs[r], 1.0); - if (NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { - ++num_matched_labeled_arcs_; - - //LOG(INFO) << semantic_instance->GetForm(a) - // << " <-*- " - // << semantic_instance->GetForm(p); - } - ++num_gold_labeled_arcs; - } - if (predicted_outputs[r] >= 0.5) { - CHECK_EQ(predicted_outputs[r], 1.0); - ++num_predicted_labeled_arcs_; - } - } - } - } - } - - ++num_tokens_; - num_unlabeled_arcs_after_pruning_ += num_possible_unlabeled_arcs; - num_labeled_arcs_after_pruning_ += num_possible_labeled_arcs; - } - - int num_actual_gold_arcs = 0; - for (int k = 0; k < semantic_instance->GetNumPredicates(); ++k) { - num_actual_gold_arcs += - semantic_instance->GetNumArgumentsPredicate(k); - } - num_gold_unlabeled_arcs_ += num_actual_gold_arcs; - num_gold_labeled_arcs_ += num_actual_gold_arcs; - int missed_unlabeled = num_actual_gold_arcs - num_gold_unlabeled_arcs; - int missed_labeled = num_actual_gold_arcs - num_gold_labeled_arcs; - //if (missed > 0) { - // LOG(INFO) << "Missed " << missed << " unlabeled arcs."; - //} - num_pruned_gold_unlabeled_arcs_ += missed_unlabeled; - num_possible_unlabeled_arcs_ += num_possible_unlabeled_arcs; - num_pruned_gold_labeled_arcs_ += missed_labeled; - num_possible_labeled_arcs_ += num_possible_labeled_arcs; - } - - virtual void 
EndEvaluation() { - double unlabeled_precision = - static_cast(num_matched_unlabeled_arcs_) / - static_cast(num_predicted_unlabeled_arcs_); - double unlabeled_recall = - static_cast(num_matched_unlabeled_arcs_) / - static_cast(num_gold_unlabeled_arcs_); - double unlabeled_F1 = 2.0 * unlabeled_precision * unlabeled_recall / - (unlabeled_precision + unlabeled_recall); - double pruning_unlabeled_recall = - static_cast(num_gold_unlabeled_arcs_ - - num_pruned_gold_unlabeled_arcs_) / - static_cast(num_gold_unlabeled_arcs_); - double pruning_unlabeled_efficiency = - static_cast(num_possible_unlabeled_arcs_) / - static_cast(num_tokens_); - - double labeled_precision = - static_cast(num_matched_labeled_arcs_) / - static_cast(num_predicted_labeled_arcs_); - double labeled_recall = - static_cast(num_matched_labeled_arcs_) / - static_cast(num_gold_labeled_arcs_); - double labeled_F1 = 2.0 * labeled_precision * labeled_recall / - (labeled_precision + labeled_recall); - double pruning_labeled_recall = - static_cast(num_gold_labeled_arcs_ - - num_pruned_gold_labeled_arcs_) / - static_cast(num_gold_labeled_arcs_); - double pruning_labeled_efficiency = - static_cast(num_possible_labeled_arcs_) / - static_cast(num_tokens_); - - LOG(INFO) << "Unlabeled precision: " << unlabeled_precision - << " (" << num_matched_unlabeled_arcs_ << "/" - << num_predicted_unlabeled_arcs_ << ")"; - LOG(INFO) << "Unlabeled recall: " << unlabeled_recall - << " (" << num_matched_unlabeled_arcs_ << "/" - << num_gold_unlabeled_arcs_ << ")"; - LOG(INFO) << "Unlabeled F1: " << unlabeled_F1; - LOG(INFO) << "Pruning unlabeled recall: " << pruning_unlabeled_recall - << " (" - << num_gold_unlabeled_arcs_ - num_pruned_gold_unlabeled_arcs_ - << "/" - << num_gold_unlabeled_arcs_ << ")"; - LOG(INFO) << "Pruning unlabeled efficiency: " << pruning_unlabeled_efficiency - << " possible unlabeled arcs per token" - << " (" << num_possible_unlabeled_arcs_ << "/" - << num_tokens_ << ")"; - - LOG(INFO) << "Labeled precision: " 
<< labeled_precision - << " (" << num_matched_labeled_arcs_ << "/" - << num_predicted_labeled_arcs_ << ")"; - LOG(INFO) << "Labeled recall: " << labeled_recall - << " (" << num_matched_labeled_arcs_ << "/" - << num_gold_labeled_arcs_ << ")"; - LOG(INFO) << "Labeled F1: " << labeled_F1; - LOG(INFO) << "Pruning labeled recall: " << pruning_labeled_recall - << " (" - << num_gold_labeled_arcs_ - num_pruned_gold_labeled_arcs_ - << "/" - << num_gold_labeled_arcs_ << ")"; - LOG(INFO) << "Pruning labeled efficiency: " << pruning_labeled_efficiency - << " possible labeled arcs per token" - << " (" << num_possible_labeled_arcs_ << "/" - << num_tokens_ << ")"; - - timeval end_clock; - gettimeofday(&end_clock, NULL); - double num_seconds = - static_cast(diff_ms(end_clock, start_clock_)) / 1000.0; - double tokens_per_second = static_cast(num_tokens_) / num_seconds; - LOG(INFO) << "Speed: " - << tokens_per_second << " tokens per second."; - } - -#if 0 - void GetAllAncestors(const vector &heads, - int descend, - vector* ancestors); - bool ExistsPath(const vector &heads, - int ancest, - int descend); -#endif -protected: - TokenDictionary *token_dictionary_; - DependencyDictionary *dependency_dictionary_; - bool train_pruner_; - Parameters *pruner_parameters_; - int num_predicted_unlabeled_arcs_; - int num_gold_unlabeled_arcs_; - int num_matched_unlabeled_arcs_; - int num_tokens_; - int num_unlabeled_arcs_after_pruning_; - int num_pruned_gold_unlabeled_arcs_; - int num_possible_unlabeled_arcs_; - int num_predicted_labeled_arcs_; - int num_gold_labeled_arcs_; - int num_matched_labeled_arcs_; - int num_labeled_arcs_after_pruning_; - int num_pruned_gold_labeled_arcs_; - int num_possible_labeled_arcs_; - timeval start_clock_; -}; - -#endif /* SEMANTICPIPE_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. 
+// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef SEMANTICPIPE_H_ +#define SEMANTICPIPE_H_ + +#include "Pipe.h" +#include "TimeUtils.h" +#include "SemanticOptions.h" +#include "SemanticReader.h" +#include "SemanticDictionary.h" +#include "TokenDictionary.h" +#include "SemanticInstanceNumeric.h" +#include "SemanticWriter.h" +#include "SemanticPart.h" +#include "SemanticFeatures.h" +#include "SemanticDecoder.h" + +class SemanticPipe : public Pipe { +public: + SemanticPipe(Options* options) : Pipe(options) { + token_dictionary_ = NULL; + dependency_dictionary_ = NULL; + pruner_parameters_ = NULL; + train_pruner_ = false; + } + virtual ~SemanticPipe() { + delete token_dictionary_; + delete dependency_dictionary_; + delete pruner_parameters_; + } + + SemanticReader *GetSemanticReader() { + return static_cast(reader_); + } + SemanticDictionary *GetSemanticDictionary() { + return static_cast(dictionary_); + } + SemanticDecoder *GetSemanticDecoder() { + return static_cast(decoder_); + } + SemanticOptions *GetSemanticOptions() { + return static_cast(options_); + } + + void Initialize() { + Pipe::Initialize(); + pruner_parameters_ = new Parameters; + } + + void SetPrunerParameters(Parameters *pruner_parameters) { + pruner_parameters_ = pruner_parameters; + } + void LoadPrunerModelFile() { + LoadPrunerModelByName(GetSemanticOptions()->GetPrunerModelFilePath()); + } + 
+protected: + void CreateDictionary() { + dictionary_ = new SemanticDictionary(this); + GetSemanticDictionary()->SetTokenDictionary(token_dictionary_); + GetSemanticDictionary()->SetDependencyDictionary(dependency_dictionary_); + } + void CreateReader() { + reader_ = new SemanticReader(options_); + } + void CreateWriter() { + writer_ = new SemanticWriter(options_); + } + void CreateDecoder() { decoder_ = new SemanticDecoder(this); } + Parts *CreateParts() { return new SemanticParts; } + Features *CreateFeatures() { return new SemanticFeatures(this); } + + void CreateTokenDictionary() { + token_dictionary_ = new TokenDictionary(this); + } + + void CreateDependencyDictionary() { + dependency_dictionary_ = new DependencyDictionary(this); + } + + Parameters *GetTrainingParameters() { + if (train_pruner_) return pruner_parameters_; + return parameters_; + } + + void PreprocessData(); + + Instance *GetFormattedInstance(Instance *instance) { + SemanticInstanceNumeric *instance_numeric = + new SemanticInstanceNumeric; + instance_numeric->Initialize(*GetSemanticDictionary(), + static_cast(instance)); + return instance_numeric; + } + + void SaveModel(FILE* fs); + void LoadModel(FILE* fs); + + void LoadPrunerModel(FILE* fs); + void LoadPrunerModelByName(const string &model_name); + + void MakeParts(Instance *instance, Parts *parts, + vector *gold_outputs); + void MakePartsBasic(Instance *instance, Parts *parts, + vector *gold_outputs); + void MakePartsBasic(Instance *instance, bool add_labeled_parts, Parts *parts, + vector *gold_outputs); + void MakePartsGlobal(Instance *instance, Parts *parts, + vector *gold_outputs); + void MakePartsArbitrarySiblings(Instance *instance, + Parts *parts, + vector *gold_outputs); + void MakePartsLabeledArbitrarySiblings(Instance *instance, + Parts *parts, + vector *gold_outputs); + void MakePartsConsecutiveSiblings(Instance *instance, + Parts *parts, + vector *gold_outputs); + void MakePartsGrandparents(Instance *instance, + Parts *parts, + 
vector *gold_outputs); + void MakePartsCoparents(Instance *instance, + Parts *parts, + vector *gold_outputs); + void MakePartsConsecutiveCoparents(Instance *instance, + Parts *parts, + vector *gold_outputs); + + void MakeFeatures(Instance *instance, Parts *parts, bool pruner, + Features *features) { + vector selected_parts(parts->size(), true); + MakeSelectedFeatures(instance, parts, pruner, selected_parts, features); + } + void MakeSelectedFeatures(Instance *instance, Parts *parts, + const vector& selected_parts, Features *features) { + // Set pruner = false unless we're training the pruner. + MakeSelectedFeatures(instance, parts, train_pruner_, selected_parts, + features); + } + void MakeSelectedFeatures(Instance *instance, + Parts *parts, + bool pruner, + const vector& selected_parts, + Features *features); + + void ComputeScores(Instance *instance, Parts *parts, Features *features, + vector *scores) { + // Set pruner = false unless we're training the pruner. + ComputeScores(instance, parts, features, train_pruner_, scores); + } + void ComputeScores(Instance *instance, Parts *parts, Features *features, + bool pruner, vector *scores); + + void RemoveUnsupportedFeatures(Instance *instance, Parts *parts, + bool pruner, + const vector &selected_parts, + Features *features); + + void RemoveUnsupportedFeatures(Instance *instance, Parts *parts, + const vector &selected_parts, + Features *features) { + // Set pruner = false unless we're training the pruner. 
+ RemoveUnsupportedFeatures(instance, parts, train_pruner_, selected_parts, + features); + } + + void MakeFeatureDifference(Parts *parts, + Features *features, + const vector &gold_output, + const vector &predicted_output, + FeatureVector *difference); + + void MakeGradientStep(Parts *parts, + Features *features, + double eta, + int iteration, + const vector &gold_output, + const vector &predicted_output); + + void TouchParameters(Parts *parts, Features *features, + const vector &selected_parts); + + void LabelInstance(Parts *parts, const vector &output, + Instance *instance); + + void Prune(Instance *instance, Parts *parts, vector *gold_outputs, + bool preserve_gold); + + virtual void BeginEvaluation() { + num_predicted_unlabeled_arcs_ = 0; + num_gold_unlabeled_arcs_ = 0; + num_matched_unlabeled_arcs_ = 0; + num_tokens_ = 0; + num_unlabeled_arcs_after_pruning_ = 0; + num_pruned_gold_unlabeled_arcs_ = 0; + num_possible_unlabeled_arcs_ = 0; + num_predicted_labeled_arcs_ = 0; + num_gold_labeled_arcs_ = 0; + num_matched_labeled_arcs_ = 0; + num_labeled_arcs_after_pruning_ = 0; + num_pruned_gold_labeled_arcs_ = 0; + num_possible_labeled_arcs_ = 0; + chrono.GetTime(); + } + virtual void EvaluateInstance(Instance *instance, + Instance *output_instance, + Parts *parts, + const vector &gold_outputs, + const vector &predicted_outputs) { + int num_possible_unlabeled_arcs = 0; + int num_possible_labeled_arcs = 0; + int num_gold_unlabeled_arcs = 0; + int num_gold_labeled_arcs = 0; + SemanticInstance *semantic_instance = + static_cast(instance); + SemanticParts *semantic_parts = static_cast(parts); + for (int p = 0; p < semantic_instance->size(); ++p) { + const vector &senses = semantic_parts->GetSenses(p); + for (int a = 1; a < semantic_instance->size(); ++a) { + for (int k = 0; k < senses.size(); ++k) { + int s = senses[k]; + int r = semantic_parts->FindArc(p, a, s); + if (r < 0) continue; + ++num_possible_unlabeled_arcs; + if (gold_outputs[r] >= 0.5) { + 
CHECK_EQ(gold_outputs[r], 1.0); + if (NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { + ++num_matched_unlabeled_arcs_; + } + ++num_gold_unlabeled_arcs; + } + if (predicted_outputs[r] >= 0.5) { + CHECK_EQ(predicted_outputs[r], 1.0); + ++num_predicted_unlabeled_arcs_; + + //LOG(INFO) << semantic_instance->GetForm(a) + // << " <-- " + // << semantic_instance->GetForm(p); + } + if (GetSemanticOptions()->labeled()) { + const vector &labeled_arcs = + semantic_parts->FindLabeledArcs(p, a, s); + for (int k = 0; k < labeled_arcs.size(); ++k) { + int r = labeled_arcs[k]; + if (r < 0) continue; + ++num_possible_labeled_arcs; + if (gold_outputs[r] >= 0.5) { + CHECK_EQ(gold_outputs[r], 1.0); + if (NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) { + ++num_matched_labeled_arcs_; + + //LOG(INFO) << semantic_instance->GetForm(a) + // << " <-*- " + // << semantic_instance->GetForm(p); + } + ++num_gold_labeled_arcs; + } + if (predicted_outputs[r] >= 0.5) { + CHECK_EQ(predicted_outputs[r], 1.0); + ++num_predicted_labeled_arcs_; + } + } + } + } + } + + ++num_tokens_; + num_unlabeled_arcs_after_pruning_ += num_possible_unlabeled_arcs; + num_labeled_arcs_after_pruning_ += num_possible_labeled_arcs; + } + + int num_actual_gold_arcs = 0; + for (int k = 0; k < semantic_instance->GetNumPredicates(); ++k) { + num_actual_gold_arcs += + semantic_instance->GetNumArgumentsPredicate(k); + } + num_gold_unlabeled_arcs_ += num_actual_gold_arcs; + num_gold_labeled_arcs_ += num_actual_gold_arcs; + int missed_unlabeled = num_actual_gold_arcs - num_gold_unlabeled_arcs; + int missed_labeled = num_actual_gold_arcs - num_gold_labeled_arcs; + //if (missed > 0) { + // LOG(INFO) << "Missed " << missed << " unlabeled arcs."; + //} + num_pruned_gold_unlabeled_arcs_ += missed_unlabeled; + num_possible_unlabeled_arcs_ += num_possible_unlabeled_arcs; + num_pruned_gold_labeled_arcs_ += missed_labeled; + num_possible_labeled_arcs_ += num_possible_labeled_arcs; + } + + virtual void 
EndEvaluation() { + double unlabeled_precision = + static_cast(num_matched_unlabeled_arcs_) / + static_cast(num_predicted_unlabeled_arcs_); + double unlabeled_recall = + static_cast(num_matched_unlabeled_arcs_) / + static_cast(num_gold_unlabeled_arcs_); + double unlabeled_F1 = 2.0 * unlabeled_precision * unlabeled_recall / + (unlabeled_precision + unlabeled_recall); + double pruning_unlabeled_recall = + static_cast(num_gold_unlabeled_arcs_ - + num_pruned_gold_unlabeled_arcs_) / + static_cast(num_gold_unlabeled_arcs_); + double pruning_unlabeled_efficiency = + static_cast(num_possible_unlabeled_arcs_) / + static_cast(num_tokens_); + + double labeled_precision = + static_cast(num_matched_labeled_arcs_) / + static_cast(num_predicted_labeled_arcs_); + double labeled_recall = + static_cast(num_matched_labeled_arcs_) / + static_cast(num_gold_labeled_arcs_); + double labeled_F1 = 2.0 * labeled_precision * labeled_recall / + (labeled_precision + labeled_recall); + double pruning_labeled_recall = + static_cast(num_gold_labeled_arcs_ - + num_pruned_gold_labeled_arcs_) / + static_cast(num_gold_labeled_arcs_); + double pruning_labeled_efficiency = + static_cast(num_possible_labeled_arcs_) / + static_cast(num_tokens_); + + LOG(INFO) << "Unlabeled precision: " << unlabeled_precision + << " (" << num_matched_unlabeled_arcs_ << "/" + << num_predicted_unlabeled_arcs_ << ")"; + LOG(INFO) << "Unlabeled recall: " << unlabeled_recall + << " (" << num_matched_unlabeled_arcs_ << "/" + << num_gold_unlabeled_arcs_ << ")"; + LOG(INFO) << "Unlabeled F1: " << unlabeled_F1; + LOG(INFO) << "Pruning unlabeled recall: " << pruning_unlabeled_recall + << " (" + << num_gold_unlabeled_arcs_ - num_pruned_gold_unlabeled_arcs_ + << "/" + << num_gold_unlabeled_arcs_ << ")"; + LOG(INFO) << "Pruning unlabeled efficiency: " << pruning_unlabeled_efficiency + << " possible unlabeled arcs per token" + << " (" << num_possible_unlabeled_arcs_ << "/" + << num_tokens_ << ")"; + + LOG(INFO) << "Labeled precision: " 
<< labeled_precision + << " (" << num_matched_labeled_arcs_ << "/" + << num_predicted_labeled_arcs_ << ")"; + LOG(INFO) << "Labeled recall: " << labeled_recall + << " (" << num_matched_labeled_arcs_ << "/" + << num_gold_labeled_arcs_ << ")"; + LOG(INFO) << "Labeled F1: " << labeled_F1; + LOG(INFO) << "Pruning labeled recall: " << pruning_labeled_recall + << " (" + << num_gold_labeled_arcs_ - num_pruned_gold_labeled_arcs_ + << "/" + << num_gold_labeled_arcs_ << ")"; + LOG(INFO) << "Pruning labeled efficiency: " << pruning_labeled_efficiency + << " possible labeled arcs per token" + << " (" << num_possible_labeled_arcs_ << "/" + << num_tokens_ << ")"; + + chrono.StopTime(); + double num_seconds = chrono.GetElapsedTime(); + double tokens_per_second = static_cast(num_tokens_) / num_seconds; + LOG(INFO) << "Speed: " + << tokens_per_second << " tokens per second."; + } + +#if 0 + void GetAllAncestors(const vector &heads, + int descend, + vector* ancestors); + bool ExistsPath(const vector &heads, + int ancest, + int descend); +#endif +protected: + TokenDictionary *token_dictionary_; + DependencyDictionary *dependency_dictionary_; + bool train_pruner_; + Parameters *pruner_parameters_; + int num_predicted_unlabeled_arcs_; + int num_gold_unlabeled_arcs_; + int num_matched_unlabeled_arcs_; + int num_tokens_; + int num_unlabeled_arcs_after_pruning_; + int num_pruned_gold_unlabeled_arcs_; + int num_possible_unlabeled_arcs_; + int num_predicted_labeled_arcs_; + int num_gold_labeled_arcs_; + int num_matched_labeled_arcs_; + int num_labeled_arcs_after_pruning_; + int num_pruned_gold_labeled_arcs_; + int num_possible_labeled_arcs_; + chronowrap::Chronometer chrono; +}; + +#endif /* SEMANTICPIPE_H_ */ diff --git a/src/semantic_parser/SemanticPredicate.h b/src/semantic_parser/SemanticPredicate.h index 403ab50..c2a62fa 100644 --- a/src/semantic_parser/SemanticPredicate.h +++ b/src/semantic_parser/SemanticPredicate.h @@ -1,82 +1,82 @@ -// Copyright (c) 2012-2015 Andre Martins -// All 
Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef SEMANTICPREDICATE_H_ -#define SEMANTICPREDICATE_H_ - -#include "SerializationUtils.h" -#include - -class SemanticPredicate { -public: - SemanticPredicate() {} - SemanticPredicate(int id) { id_ = id; } - virtual ~SemanticPredicate() { roles_.clear(); } - -public: - int id() const { return id_; } - const std::set &GetRoles() const { - return roles_; - } - - bool HasRole(int role) const { - std::set::iterator it = roles_.find(role); - return (it != roles_.end()); - } - - void InsertRole(int role) { - CHECK(!HasRole(role)) << "Role existed already."; - roles_.insert(role); - } - - void Save(FILE *fs) { - bool success; - int length = roles_.size(); - success = WriteInteger(fs, id_); - CHECK(success); - success = WriteInteger(fs, length); - CHECK(success); - for (std::set::iterator it = roles_.begin(); - it != roles_.end(); ++it) { - int label = *it; - success = WriteInteger(fs, label); - CHECK(success); - } - } - - void Load(FILE *fs) { - bool success; - int length; - success = ReadInteger(fs, &id_); - CHECK(success); - success = ReadInteger(fs, &length); - CHECK(success); - for (int i = 0; i < length; ++i) { - int label; - success = ReadInteger(fs, &label); - CHECK(success); - InsertRole(label); - } - } - -protected: - int id_; - std::set roles_; 
-}; - -#endif /* SEMANTICPREDICATE_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef SEMANTICPREDICATE_H_ +#define SEMANTICPREDICATE_H_ + +#include "SerializationUtils.h" +#include + +class SemanticPredicate { +public: + SemanticPredicate() {} + SemanticPredicate(int id) { id_ = id; } + virtual ~SemanticPredicate() { roles_.clear(); } + +public: + int id() const { return id_; } + const std::set &GetRoles() const { + return roles_; + } + + bool HasRole(int role) const { + std::set::iterator it = roles_.find(role); + return (it != roles_.end()); + } + + void InsertRole(int role) { + CHECK(!HasRole(role)) << "Role existed already."; + roles_.insert(role); + } + + void Save(FILE *fs) { + bool success; + int length = (int)roles_.size(); + success = WriteInteger(fs, id_); + CHECK(success); + success = WriteInteger(fs, length); + CHECK(success); + for (std::set::iterator it = roles_.begin(); + it != roles_.end(); ++it) { + int label = *it; + success = WriteInteger(fs, label); + CHECK(success); + } + } + + void Load(FILE *fs) { + bool success; + int length; + success = ReadInteger(fs, &id_); + CHECK(success); + success = ReadInteger(fs, &length); + CHECK(success); + for (int i = 0; i < length; ++i) { + int label; + success = ReadInteger(fs, 
&label); + CHECK(success); + InsertRole(label); + } + } + +protected: + int id_; + std::set roles_; +}; + +#endif /* SEMANTICPREDICATE_H_ */ diff --git a/src/semantic_parser/SemanticReader.cpp b/src/semantic_parser/SemanticReader.cpp index d3d67a7..ab61347 100644 --- a/src/semantic_parser/SemanticReader.cpp +++ b/src/semantic_parser/SemanticReader.cpp @@ -135,17 +135,20 @@ Instance *SemanticReader::GetNext() { string top_name = info[offset]; ++offset; CHECK(0 == top_name.compare("-") || 0 == top_name.compare("+")); - if (0 == top_name.compare("+")) is_top = true; + if (0 == top_name.compare("+")) + is_top = true; string predicate_flag = info[offset]; ++offset; CHECK(0 == predicate_flag.compare("-") || 0 == predicate_flag.compare("+")); - if (0 == predicate_flag.compare("+")) is_predicate = true; + if (0 == predicate_flag.compare("+")) + is_predicate = true; } string predicate_name = info[offset]; ++offset; if (!use_sdp_format_) { - if (0 != predicate_name.compare("_")) is_predicate = true; + if (0 != predicate_name.compare("_")) + is_predicate = true; } if (!use_sdp_format_) CHECK_EQ(offset, 11); if (i == 0) { diff --git a/src/semantic_parser/SemanticWriter.cpp b/src/semantic_parser/SemanticWriter.cpp index 51e692d..39581d0 100644 --- a/src/semantic_parser/SemanticWriter.cpp +++ b/src/semantic_parser/SemanticWriter.cpp @@ -127,3 +127,5 @@ void SemanticWriter::Write(Instance *instance) { } os_ << endl; } + +void SemanticWriter::WriteFormatted(Pipe * pipe, Instance *instance) {} \ No newline at end of file diff --git a/src/semantic_parser/SemanticWriter.h b/src/semantic_parser/SemanticWriter.h index 54381bc..cf3d83a 100644 --- a/src/semantic_parser/SemanticWriter.h +++ b/src/semantic_parser/SemanticWriter.h @@ -49,6 +49,7 @@ class SemanticWriter : public DependencyWriter { } } void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); protected: Options *options_; diff --git a/src/semantic_parser/TurboSemanticParser.cpp 
b/src/semantic_parser/TurboSemanticParser.cpp index b4baaed..1755df5 100644 --- a/src/semantic_parser/TurboSemanticParser.cpp +++ b/src/semantic_parser/TurboSemanticParser.cpp @@ -1,129 +1,127 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "SemanticPipe.h" - -using namespace std; - -void TrainSemanticParser(); -void TestSemanticParser(); - -int main(int argc, char** argv) { - // Initialize Google's logging library. - google::InitGoogleLogging(argv[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_train) { - LOG(INFO) << "Training semantic parser..." << endl; - TrainSemanticParser(); - } else if (FLAGS_test) { - LOG(INFO) << "Running semantic parser..." << endl; - TestSemanticParser(); - } - - // Destroy allocated memory regarding line flags. 
- google::ShutDownCommandLineFlags(); - google::ShutdownGoogleLogging(); - return 0; -} - -void TrainSemanticParser() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - SemanticOptions *options = new SemanticOptions; - options->Initialize(); - - SemanticPipe *pipe = new SemanticPipe(options); - pipe->Initialize(); - - if (options->prune_basic()) { - if (options->use_pretrained_pruner()) { - pipe->LoadPrunerModelFile(); - } else { - // Train the pruner. - LOG(INFO) << "Training the pruner..."; - SemanticOptions *pruner_options = new SemanticOptions; - *pruner_options = *options; - // Transform things such as pruner_train_algorithm - // in train_algorithm. - pruner_options->CopyPrunerFlags(); - pruner_options->Initialize(); - SemanticPipe *pruner_pipe = new SemanticPipe(pruner_options); - pruner_pipe->Initialize(); - - pruner_pipe->Train(); - pipe->SetPrunerParameters(pruner_pipe->GetParameters()); - // This is necessary so that the pruner parameters are not - // destroyed when deleting the pruner pipe. - pruner_pipe->SetParameters(NULL); - - delete pruner_pipe; - delete pruner_options; - } - } - - LOG(INFO) << "Training the semantic parser..."; - pipe->Train(); - pipe->SaveModelFile(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Training took " << static_cast(time) / 1000.0 - << " sec." << endl; -} - -void TestSemanticParser() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - SemanticOptions *options = new SemanticOptions; - options->Initialize(); - - SemanticPipe *pipe = new SemanticPipe(options); - pipe->Initialize(); - pipe->LoadModelFile(); - pipe->Run(); - - delete pipe; - delete options; - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Testing took " << static_cast(time) / 1000.0 - << " sec." << endl; -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. 
+// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "SemanticPipe.h" + +using namespace std; + +void TrainSemanticParser(); +void TestSemanticParser(); + +int main(int argc, char** argv) { + // Initialize Google's logging library. + google::InitGoogleLogging(argv[0]); + + // Parse command line flags. + google::ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_train) { + LOG(INFO) << "Training semantic parser..." << endl; + TrainSemanticParser(); + } else if (FLAGS_test) { + LOG(INFO) << "Running semantic parser..." << endl; + TestSemanticParser(); + } + + // Destroy allocated memory regarding line flags. + google::ShutDownCommandLineFlags(); + google::ShutdownGoogleLogging(); + return 0; +} + +void TrainSemanticParser() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + SemanticOptions *options = new SemanticOptions; + options->Initialize(); + + SemanticPipe *pipe = new SemanticPipe(options); + pipe->Initialize(); + + if (options->prune_basic()) { + if (options->use_pretrained_pruner()) { + pipe->LoadPrunerModelFile(); + } else { + // Train the pruner. 
+ LOG(INFO) << "Training the pruner..."; + SemanticOptions *pruner_options = new SemanticOptions; + *pruner_options = *options; + // Transform things such as pruner_train_algorithm + // in train_algorithm. + pruner_options->CopyPrunerFlags(); + pruner_options->Initialize(); + SemanticPipe *pruner_pipe = new SemanticPipe(pruner_options); + pruner_pipe->Initialize(); + + pruner_pipe->Train(); + pipe->SetPrunerParameters(pruner_pipe->GetParameters()); + // This is necessary so that the pruner parameters are not + // destroyed when deleting the pruner pipe. + pruner_pipe->SetParameters(NULL); + + delete pruner_pipe; + delete pruner_options; + } + } + + LOG(INFO) << "Training the semantic parser..."; + pipe->Train(); + pipe->SaveModelFile(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Training took " << time << " sec." << endl; +} + +void TestSemanticParser() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + SemanticOptions *options = new SemanticOptions; + options->Initialize(); + + SemanticPipe *pipe = new SemanticPipe(options); + pipe->Initialize(); + pipe->LoadModelFile(); + pipe->Run(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Testing took " << time << " sec." << endl; +} diff --git a/src/sequence/SequenceDecoder.cpp b/src/sequence/SequenceDecoder.cpp index dc02224..61ab51b 100644 --- a/src/sequence/SequenceDecoder.cpp +++ b/src/sequence/SequenceDecoder.cpp @@ -21,6 +21,11 @@ #include "SequencePipe.h" #include // Remove this. 
+//DEFINE_double(ner_train_cost_false_positives, 0.5, +// "Cost for 'false positives' -- penalises recall and favours precision in BIO tagging."); +//DEFINE_double(ner_train_cost_false_negatives, 0.5, +// "Cost for 'false negatives' -- penalises precision and favours recall in BIO tagging."); + void SequenceDecoder::DecodeCostAugmented(Instance *instance, Parts *parts, const vector &scores, const vector &gold_output, @@ -32,6 +37,21 @@ void SequenceDecoder::DecodeCostAugmented(Instance *instance, Parts *parts, sequence_parts->GetOffsetUnigram(&offset_unigrams, &num_unigrams); + //////////////////////////////////////////////////// + // F1: a = 0.5, b = 0.5. + // Recall: a = 0, b = 1. + // In general: + // p = a - (a+b)*z0 + // q = b*sum(z0) + // p'*z + q = a*sum(z) - (a+b)*z0'*z + b*sum(z0) + // = a*(1-z0)'*z + b*(1-z)'*z0. + //////////////////////////////////////////////////// + + // Penalty for predicting 1 when it is 0 (FP). + // double a = FLAGS_ner_train_cost_false_positives; + // Penalty for predicting 0 when it is 1 (FN). + // double b = FLAGS_ner_train_cost_false_negatives; + // p = 0.5-z0, q = 0.5'*z0, loss = p'*z + q double q = 0.0; vector p(num_unigrams, 0.0); @@ -67,8 +87,11 @@ void SequenceDecoder::Decode(Instance *instance, Parts *parts, SequenceParts *sequence_parts = static_cast(parts); int offset, size; - vector node_scores(sentence->size()); - vector edge_scores(sentence->size() - 1); + vector & node_scores = sentence->node_scores_; + vector & edge_scores = sentence->edge_scores_; + node_scores.resize(sentence->size()); + edge_scores.resize(sentence->size() - 1); + // The triplets are represented as if they were edges connecting // nodes with bigram states. 
vector triplet_scores; diff --git a/src/sequence/SequenceDecoder.h b/src/sequence/SequenceDecoder.h index d1d88eb..68bc96c 100644 --- a/src/sequence/SequenceDecoder.h +++ b/src/sequence/SequenceDecoder.h @@ -29,7 +29,7 @@ class SequenceDecoderNodeScores { virtual ~SequenceDecoderNodeScores() {} // Get the number of states. - int GetNumStates() const { return scores_.size(); } + int GetNumStates() const { return (int) scores_.size(); } // Set the number of states. void SetNumStates(int num_states) { scores_.resize(num_states); } @@ -76,14 +76,14 @@ class SequenceDecoderEdgeScores { // Get/Set the number of states for the current node. The states must be // numbered 0, 1, 2, ... - int GetNumCurrentStates() const { return scores_.size(); } + int GetNumCurrentStates() const { return (int) scores_.size(); } void SetNumCurrentStates(int num_current_states) { scores_.resize(num_current_states); } // Get/set the number of previous states compatible with the current node. int GetNumPreviousStates(int current_state) const { - return scores_[current_state].size(); + return (int) scores_[current_state].size(); } void SetNumPreviousStates(int current_state, int num_previous_states) { scores_[current_state].resize(num_previous_states); @@ -141,13 +141,13 @@ class SequenceDecoderEdgeScores { int GetStatePairIndex(int current_state, int k) const { int bigram_index = k; for (int state = 0; state < current_state; ++state) { - bigram_index += scores_[state].size(); + bigram_index += (int)scores_[state].size(); } return bigram_index; } // Compute number of bigram indices. 
- int GetNumStatePairs() const { return GetStatePairIndex(scores_.size(), 0); } + int GetNumStatePairs() const { return GetStatePairIndex((int)scores_.size(), 0); } private: std::vector > > scores_; @@ -159,11 +159,11 @@ class SequenceDecoder : public Decoder { SequenceDecoder(SequencePipe *pipe) : pipe_(pipe) {}; virtual ~SequenceDecoder() {}; - void Decode(Instance *instance, Parts *parts, + virtual void Decode(Instance *instance, Parts *parts, const vector &scores, vector *predicted_output); - void DecodeCostAugmented(Instance *instance, Parts *parts, + virtual void DecodeCostAugmented(Instance *instance, Parts *parts, const vector &scores, const vector &gold_output, vector *predicted_output, diff --git a/src/sequence/SequenceInstance.h b/src/sequence/SequenceInstance.h index c8298b9..6ea5734 100644 --- a/src/sequence/SequenceInstance.h +++ b/src/sequence/SequenceInstance.h @@ -1,54 +1,54 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef SEQUENCEINSTANCE_H_ -#define SEQUENCEINSTANCE_H_ - -#include -#include -#include "Instance.h" - -class SequenceInstance : public Instance { -public: - SequenceInstance() {} - virtual ~SequenceInstance() {} - - virtual Instance* Copy() { - SequenceInstance* instance = new SequenceInstance(); - instance->Initialize(forms_, tags_); - return static_cast(instance); - } - - virtual void Initialize(const std::vector &forms, - const std::vector &tags); - - int size() const { return forms_.size(); }; - - const std::string &GetForm(int i) const { return forms_[i]; } - const std::string &GetTag(int i) const { return tags_[i]; } - const std::vector &forms() const { return forms_; } - const std::vector &tags() const { return tags_; } - - void SetTag(int i, const std::string &tag) { tags_[i] = tag; } - -protected: - std::vector forms_; - std::vector tags_; -}; - -#endif /* SEQUENCEINSTANCE_H_*/ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef SEQUENCEINSTANCE_H_ +#define SEQUENCEINSTANCE_H_ + +#include +#include +#include "Instance.h" + +class SequenceInstance : public Instance { +public: + SequenceInstance() {} + virtual ~SequenceInstance() {} + + virtual Instance* Copy() { + SequenceInstance* instance = new SequenceInstance(); + instance->Initialize(forms_, tags_); + return static_cast(instance); + } + + virtual void Initialize(const std::vector &forms, + const std::vector &tags); + + int size() const { return (int) forms_.size(); }; + + const std::string &GetForm(int i) const { return forms_[i]; } + const std::string &GetTag(int i) const { return tags_[i]; } + const std::vector &forms() const { return forms_; } + const std::vector &tags() const { return tags_; } + + void SetTag(int i, const std::string &tag) { tags_[i] = tag; } + +protected: + std::vector forms_; + std::vector tags_; +}; + +#endif /* SEQUENCEINSTANCE_H_*/ diff --git a/src/sequence/SequenceInstanceNumeric.cpp b/src/sequence/SequenceInstanceNumeric.cpp index 6817424..2334b7b 100644 --- a/src/sequence/SequenceInstanceNumeric.cpp +++ b/src/sequence/SequenceInstanceNumeric.cpp @@ -52,7 +52,7 @@ void SequenceInstanceNumeric::Initialize(const SequenceDictionary &dictionary, for (i = 0; i < length; i++) { std::string form = instance->GetForm(i); if (!form_case_sensitive) { - transform(form.begin(), form.end(), form.begin(), ::tolower); + std::transform(form.begin(), form.end(), form.begin(), ::tolower); } id = token_dictionary->GetFormId(form); CHECK_LT(id, 0xffff); @@ -106,7 +106,7 @@ void SequenceInstanceNumeric::Initialize(const SequenceDictionary &dictionary, //CHECK_GE(id, 0); if (id < 0) { id = TOKEN_UNKNOWN; - LOG(INFO) << "Unknown tag: " << instance->GetTag(i); + //LOG(INFO) << "Unknown tag: " << instance->GetTag(i); } tag_ids_[i] = id; } @@ -120,4 +120,4 @@ bool SequenceInstanceNumeric::IsPunctuation(char c) { } return false; //return (c == '\'') || (c == '-') || (c == '&') || (c == '/'); -} +} diff --git 
a/src/sequence/SequenceInstanceNumeric.h b/src/sequence/SequenceInstanceNumeric.h index ddaf257..0ba63c0 100644 --- a/src/sequence/SequenceInstanceNumeric.h +++ b/src/sequence/SequenceInstanceNumeric.h @@ -1,192 +1,199 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef SEQUENCEINSTANCENUMERIC_H_ -#define SEQUENCEINSTANCENUMERIC_H_ - +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef SEQUENCEINSTANCENUMERIC_H_ +#define SEQUENCEINSTANCENUMERIC_H_ + #include "SequenceInstance.h" -#include "SequenceDictionary.h" -#include -#include - -class SequenceInstanceNumeric : public Instance { -public: - SequenceInstanceNumeric() {}; - virtual ~SequenceInstanceNumeric() { Clear(); }; - - Instance* Copy() { - CHECK(false) << "Not implemented."; - return NULL; - } - - int size() { return form_ids_.size(); }; - - virtual void Clear() { - form_ids_.clear(); - prefix_ids_.clear(); - suffix_ids_.clear(); - shape_ids_.clear(); - has_digit_.clear(); - has_upper_.clear(); - has_hyphen_.clear(); - all_digits_.clear(); - all_digits_with_punctuation_.clear(); - all_upper_.clear(); - first_upper_.clear(); - tag_ids_.clear(); - } - - void Initialize(const SequenceDictionary &dictionary, - SequenceInstance *instance); - - const std::vector &GetFormIds() const { return form_ids_; } - const std::vector &GetTagIds() const { return tag_ids_; } - - int GetFormId(int i) { return form_ids_[i]; } - int GetMaxPrefixLength(int i) { return prefix_ids_[i].size(); } - int GetMaxSuffixLength(int i) { return suffix_ids_[i].size(); } - int GetPrefixId(int i, int length) { return prefix_ids_[i][length - 1]; } - int GetSuffixId(int i, int length) { return suffix_ids_[i][length - 1]; } - int GetShapeId(int i) { return shape_ids_[i]; } - bool HasDigit(int i) { return has_digit_[i]; } - bool HasUpper(int i) { return has_upper_[i]; } - bool HasHyphen(int i) { return has_hyphen_[i]; } - bool AllDigits(int i) { return all_digits_[i]; } - bool AllDigitsWithPunctuation(int i) { - return all_digits_with_punctuation_[i]; - } - bool AllUpper(int i) { return all_upper_[i]; } - bool FirstUpper(int i) { return first_upper_[i]; } - int GetTagId(int i) { return tag_ids_[i]; } - -protected: - bool IsUpperCase(char c) { return (c >= 'A' && c <= 'Z'); } - bool IsLowerCase(char c) { return (c >= 'a' && c <= 'z'); } - bool IsDigit(char c) { return (c >= '0' && c <= '9'); } - bool IsPeriod(char c) { 
return (c == '.'); } - bool IsPunctuation(char c); - - bool AllUpperCase(const char* word, int len) { - for (int i = 0; i < len; ++i) { - if (!IsUpperCase(word[i])) return false; - } - return true; - } - - bool AllLowerCase(const char* word, int len) { - for (int i = 0; i < len; ++i) { - if (!IsLowerCase(word[i])) return false; - } - return true; - } - - bool IsCapitalized(const char* word, int len) { - if (len <= 0) return false; - return IsUpperCase(word[0]); - } - - bool IsMixedCase(const char* word, int len) { - if (len <= 0) return false; - if (!IsLowerCase(word[0])) return false; - for (int i = 1; i < len; ++i) { - if (IsUpperCase(word[i])) return true; - } - return false; - } - - bool EndsWithPeriod(const char* word, int len) { - if (len <= 0) return false; - return IsPeriod(word[len - 1]); - } - - bool HasInternalPeriod(const char* word, int len) { - if (len <= 0) return false; - for (int i = 0; i < len - 1; ++i) { - if (IsPeriod(word[i])) return true; - } - return false; - } - - bool HasInternalPunctuation(const char* word, int len) { - if (len <= 0) return false; - for (int i = 0; i < len - 1; ++i) { - if (IsPunctuation(word[i])) return true; - } - return false; - } - - int CountDigits(const char* word, int len) { - int num_digits = 0; - for (int i = 0; i < len; ++i) { - if (IsDigit(word[i])) ++num_digits; - } - return num_digits; - } - - bool HasUpperCaseLetters(const char* word, int len) { - for (int i = 0; i < len; ++i) { - if (IsUpperCase(word[i])) return true; - } - return false; - } - - bool HasHyphen(const char* word, int len) { - for (int i = 0; i < len; ++i) { - if ('-' == word[i]) return true; - } - return false; - } - - bool AllDigits(const char* word, int len) { - for (int i = 0; i < len; ++i) { - if (!IsDigit(word[i])) return false; - } - return true; - } - - bool AllDigitsWithPunctuation(const char* word, int len) { - bool has_digits = false; - bool has_punctuation = false; - for (int i = 0; i < len; ++i) { - if (IsDigit(word[i])) { - 
has_digits = true; - } else if (IsPunctuation(word[i])) { - has_punctuation = true; - } else { - return false; - } - } - return has_digits && has_punctuation; - } - -private: - std::vector form_ids_; - std::vector > prefix_ids_; - std::vector > suffix_ids_; - std::vector shape_ids_; - std::vector has_digit_; - std::vector has_upper_; - std::vector has_hyphen_; - std::vector all_digits_; - std::vector all_digits_with_punctuation_; - std::vector all_upper_; - std::vector first_upper_; - std::vector tag_ids_; -}; - -#endif /* SEQUENCEINSTANCENUMERIC_H_ */ +#include "SequenceDecoder.h" +#include "SequenceDictionary.h" +#include +#include + +class SequenceInstanceNumeric : public Instance { +public: + SequenceInstanceNumeric() {}; + virtual ~SequenceInstanceNumeric() { Clear(); }; + + Instance* Copy() { + CHECK(false) << "Not implemented."; + return NULL; + } + + int size() { return (int) form_ids_.size(); }; + + virtual void Clear() { + form_ids_.clear(); + prefix_ids_.clear(); + suffix_ids_.clear(); + shape_ids_.clear(); + has_digit_.clear(); + has_upper_.clear(); + has_hyphen_.clear(); + all_digits_.clear(); + all_digits_with_punctuation_.clear(); + all_upper_.clear(); + first_upper_.clear(); + tag_ids_.clear(); + } + + void Initialize(const SequenceDictionary &dictionary, + SequenceInstance *instance); + + const std::vector &GetFormIds() const { return form_ids_; } + const std::vector &GetTagIds() const { return tag_ids_; } + + int GetFormId(int i) { return form_ids_[i]; } + int GetMaxPrefixLength(int i) { return (int) prefix_ids_[i].size(); } + int GetMaxSuffixLength(int i) { return (int) suffix_ids_[i].size(); } + int GetPrefixId(int i, int length) { return prefix_ids_[i][length - 1]; } + int GetSuffixId(int i, int length) { return suffix_ids_[i][length - 1]; } + int GetShapeId(int i) { return shape_ids_[i]; } + bool HasDigit(int i) { return has_digit_[i]; } + bool HasUpper(int i) { return has_upper_[i]; } + bool HasHyphen(int i) { return has_hyphen_[i]; } + bool 
AllDigits(int i) { return all_digits_[i]; } + bool AllDigitsWithPunctuation(int i) { + return all_digits_with_punctuation_[i]; + } + bool AllUpper(int i) { return all_upper_[i]; } + bool FirstUpper(int i) { return first_upper_[i]; } + int GetTagId(int i) { return tag_ids_[i]; } + +protected: + bool IsUpperCase(char c) { return (c >= 'A' && c <= 'Z'); } + bool IsLowerCase(char c) { return (c >= 'a' && c <= 'z'); } + bool IsDigit(char c) { return (c >= '0' && c <= '9'); } + bool IsPeriod(char c) { return (c == '.'); } + bool IsPunctuation(char c); + + bool AllUpperCase(const char* word, int len) { + for (int i = 0; i < len; ++i) { + if (!IsUpperCase(word[i])) return false; + } + return true; + } + + bool AllLowerCase(const char* word, int len) { + for (int i = 0; i < len; ++i) { + if (!IsLowerCase(word[i])) return false; + } + return true; + } + + bool IsCapitalized(const char* word, int len) { + if (len <= 0) return false; + return IsUpperCase(word[0]); + } + + bool IsMixedCase(const char* word, int len) { + if (len <= 0) return false; + if (!IsLowerCase(word[0])) return false; + for (int i = 1; i < len; ++i) { + if (IsUpperCase(word[i])) return true; + } + return false; + } + + bool EndsWithPeriod(const char* word, int len) { + if (len <= 0) return false; + return IsPeriod(word[len - 1]); + } + + bool HasInternalPeriod(const char* word, int len) { + if (len <= 0) return false; + for (int i = 0; i < len - 1; ++i) { + if (IsPeriod(word[i])) return true; + } + return false; + } + + bool HasInternalPunctuation(const char* word, int len) { + if (len <= 0) return false; + for (int i = 0; i < len - 1; ++i) { + if (IsPunctuation(word[i])) return true; + } + return false; + } + + int CountDigits(const char* word, int len) { + int num_digits = 0; + for (int i = 0; i < len; ++i) { + if (IsDigit(word[i])) ++num_digits; + } + return num_digits; + } + + bool HasUpperCaseLetters(const char* word, int len) { + for (int i = 0; i < len; ++i) { + if (IsUpperCase(word[i])) return 
true; + } + return false; + } + + bool HasHyphen(const char* word, int len) { + for (int i = 0; i < len; ++i) { + if ('-' == word[i]) return true; + } + return false; + } + + bool AllDigits(const char* word, int len) { + for (int i = 0; i < len; ++i) { + if (!IsDigit(word[i])) return false; + } + return true; + } + + bool AllDigitsWithPunctuation(const char* word, int len) { + bool has_digits = false; + bool has_punctuation = false; + for (int i = 0; i < len; ++i) { + if (IsDigit(word[i])) { + has_digits = true; + } else if (IsPunctuation(word[i])) { + has_punctuation = true; + } else { + return false; + } + } + return has_digits && has_punctuation; + } + +private: + std::vector form_ids_; + std::vector > prefix_ids_; + std::vector > suffix_ids_; + std::vector shape_ids_; + std::vector has_digit_; + std::vector has_upper_; + std::vector has_hyphen_; + std::vector all_digits_; + std::vector all_digits_with_punctuation_; + std::vector all_upper_; + std::vector first_upper_; + std::vector tag_ids_; + + public: + //expose Viterbi matrixes (node & edge scores) + vector node_scores_; + vector edge_scores_; + +}; + +#endif /* SEQUENCEINSTANCENUMERIC_H_ */ diff --git a/src/sequence/SequenceOptions.cpp b/src/sequence/SequenceOptions.cpp index 21959f2..17b514c 100644 --- a/src/sequence/SequenceOptions.cpp +++ b/src/sequence/SequenceOptions.cpp @@ -30,6 +30,9 @@ DEFINE_int32(sequence_model_type, 2, //DEFINE_bool(sequence_prune_tags, true, // "True for pruning the set of possible tags by using a dictionary."); +DEFINE_bool(expose_node_edge_viterbi_scores, false, + "Output to another file the node and edge scores of the decode step."); + // Save current option flags to the model file. 
void SequenceOptions::Save(FILE* fs) { Options::Save(fs); @@ -77,4 +80,5 @@ void SequenceOptions::Initialize() { //large_feature_set_ = FLAGS_tagger_large_feature_set; //prune_tags_ = FLAGS_sequence_prune_tags; //file_unknown_word_tags_ = FLAGS_file_unknown_word_tags; + expose_node_edge_viterbi_scores_ = FLAGS_expose_node_edge_viterbi_scores; } diff --git a/src/sequence/SequenceOptions.h b/src/sequence/SequenceOptions.h index b013b52..97fc8e1 100644 --- a/src/sequence/SequenceOptions.h +++ b/src/sequence/SequenceOptions.h @@ -35,9 +35,11 @@ class SequenceOptions : public Options { // Get option flags. int markov_order() { return model_type_; } + bool expose_node_edge_viterbi_scores() { return expose_node_edge_viterbi_scores_; } protected: int model_type_; + bool expose_node_edge_viterbi_scores_; }; #endif // SEQUENCE_OPTIONS_H_ diff --git a/src/sequence/SequencePart.h b/src/sequence/SequencePart.h index d57e273..81f5c9b 100644 --- a/src/sequence/SequencePart.h +++ b/src/sequence/SequencePart.h @@ -147,7 +147,7 @@ class SequenceParts : public Parts { void BuildOffsets() { for (int i = NUM_SEQUENCEPARTS - 1; i >= 0; --i) { if (offsets_[i] < 0) { - offsets_[i] = (i == NUM_SEQUENCEPARTS - 1) ? size() : offsets_[i + 1]; + offsets_[i] = (i == NUM_SEQUENCEPARTS - 1) ? (int)size() : offsets_[i + 1]; } } }; @@ -175,7 +175,7 @@ class SequenceParts : public Parts { void GetOffset(int i, int *offset, int *size) const { *offset = offsets_[i]; *size = (i < NUM_SEQUENCEPARTS - 1) ? offsets_[i + 1] - (*offset) : - SequenceParts::size() - (*offset); + (int)SequenceParts::size() - (*offset); } // Set offset from part index. diff --git a/src/sequence/SequencePipe.cpp b/src/sequence/SequencePipe.cpp index 0bdba2a..8638c20 100644 --- a/src/sequence/SequencePipe.cpp +++ b/src/sequence/SequencePipe.cpp @@ -30,7 +30,7 @@ using namespace std; // Define the current model version and the oldest back-compatible version. // The format is AAAA.BBBB.CCCC, e.g., 2 0003 0000 means "2.3.0". 
-const uint64_t kSequenceModelVersion = 200030000; +const uint64_t kSequenceModelVersion = 200030001; const uint64_t kOldestCompatibleSequenceModelVersion = 200030000; const uint64_t kSequenceModelCheck = 1234567890; @@ -46,15 +46,13 @@ void SequencePipe::SaveModel(FILE* fs) { void SequencePipe::LoadModel(FILE* fs) { bool success; - uint64_t model_check; - uint64_t model_version; - success = ReadUINT64(fs, &model_check); + success = ReadUINT64(fs, &model_check_); CHECK(success); - CHECK_EQ(model_check, kSequenceModelCheck) + CHECK_EQ(model_check_, kSequenceModelCheck) << "The model file is too old and not supported anymore."; - success = ReadUINT64(fs, &model_version); + success = ReadUINT64(fs, &model_version_); CHECK(success); - CHECK_GE(model_version, kOldestCompatibleSequenceModelVersion) + CHECK_GE(model_version_, kOldestCompatibleSequenceModelVersion) << "The model file is too old and not supported anymore."; delete token_dictionary_; CreateTokenDictionary(); @@ -74,7 +72,8 @@ void SequencePipe::PreprocessData() { CreateTagDictionary(GetSequenceReader()); } -void SequencePipe::ComputeScores(Instance *instance, Parts *parts, +void SequencePipe::ComputeScores(Instance *instance, + Parts *parts, Features *features, vector *scores) { SequenceInstanceNumeric *sentence = @@ -100,15 +99,9 @@ void SequencePipe::ComputeScores(Instance *instance, Parts *parts, allowed_tags[k] = unigram->tag(); } vector tag_scores; -#if USE_WEIGHT_CACHING == 1 - parameters_->ComputeLabelScoresWithCache(unigram_features, - allowed_tags, - &tag_scores); -#else parameters_->ComputeLabelScores(unigram_features, allowed_tags, &tag_scores); -#endif for (int k = 0; k < index_unigram_parts.size(); ++k) { (*scores)[index_unigram_parts[k]] = tag_scores[k]; } @@ -131,15 +124,9 @@ void SequencePipe::ComputeScores(Instance *instance, Parts *parts, } vector tag_scores; -#if USE_WEIGHT_CACHING == 1 - parameters_->ComputeLabelScoresWithCache(bigram_features, - bigram_tags, - &tag_scores); -#else 
parameters_->ComputeLabelScores(bigram_features, bigram_tags, &tag_scores); -#endif for (int k = 0; k < index_bigram_parts.size(); ++k) { (*scores)[index_bigram_parts[k]] = tag_scores[k]; } @@ -165,15 +152,9 @@ void SequencePipe::ComputeScores(Instance *instance, Parts *parts, } vector tag_scores; -#if USE_WEIGHT_CACHING == 1 - parameters_->ComputeLabelScoresWithCache(trigram_features, - trigram_tags, - &tag_scores); -#else parameters_->ComputeLabelScores(trigram_features, trigram_tags, &tag_scores); -#endif for (int k = 0; k < index_trigram_parts.size(); ++k) { (*scores)[index_trigram_parts[k]] = tag_scores[k]; } @@ -293,6 +274,8 @@ void SequencePipe::MakeParts(Instance *instance, vector *gold_outputs) { int sentence_length = static_cast(instance)->size(); + if (sentence_length == 0) + return; SequenceParts *sequence_parts = static_cast(parts); sequence_parts->Initialize(); bool make_gold = (gold_outputs != NULL); @@ -615,4 +598,4 @@ void SequencePipe::LabelInstance(Parts *parts, const vector &output, for (int i = 0; i < instance_length; ++i) { CHECK(sequence_instance->GetTag(i) != "NULL"); } -} +} diff --git a/src/sequence/SequencePipe.h b/src/sequence/SequencePipe.h index 72a0d58..2ee5d6e 100644 --- a/src/sequence/SequencePipe.h +++ b/src/sequence/SequencePipe.h @@ -20,6 +20,7 @@ #define SEQUENCEPIPE_H_ #include "Pipe.h" +#include "TimeUtils.h" #include "SequenceOptions.h" #include "SequenceReader.h" #include "SequenceDictionary.h" @@ -117,7 +118,7 @@ class SequencePipe : public Pipe { virtual void BeginEvaluation() { num_tag_mistakes_ = 0; num_tokens_ = 0; - gettimeofday(&start_clock_, NULL); + chrono.GetTime(); } virtual void EvaluateInstance(Instance *instance, Instance *output_instance, @@ -145,10 +146,8 @@ class SequencePipe : public Pipe { LOG(INFO) << "Tagging accuracy: " << static_cast(num_tokens_ - num_tag_mistakes_) / static_cast(num_tokens_); - timeval end_clock; - gettimeofday(&end_clock, NULL); - double num_seconds = - static_cast(diff_ms(end_clock, 
start_clock_)) / 1000.0; + chrono.StopTime(); + double num_seconds = chrono.GetElapsedTime(); double tokens_per_second = static_cast(num_tokens_) / num_seconds; LOG(INFO) << "Tagging speed: " << tokens_per_second << " tokens per second."; @@ -158,7 +157,7 @@ class SequencePipe : public Pipe { TokenDictionary *token_dictionary_; int num_tag_mistakes_; int num_tokens_; - timeval start_clock_; + chronowrap::Chronometer chrono; }; #endif /* SEQUENCEPIPE_H_ */ diff --git a/src/sequence/SequenceReader.cpp b/src/sequence/SequenceReader.cpp index badf78a..c342202 100644 --- a/src/sequence/SequenceReader.cpp +++ b/src/sequence/SequenceReader.cpp @@ -1,61 +1,61 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#include "SequenceReader.h" -#include "Utils.h" -#include -#include - -using namespace std; - -Instance *SequenceReader::GetNext() { - // Fill all fields for the entire sentence. - vector > sentence_fields; - string line; - if (is_.is_open()) { - while (!is_.eof()) { - getline(is_, line); - if (line.length() <= 0) break; - vector fields; - StringSplit(line, "\t", &fields, true); - sentence_fields.push_back(fields); - } - } - - // Sentence length. - int length = sentence_fields.size(); - - // Convert to array of words and tags. 
- vector forms(length); - vector tags(length); - - for (int i = 0; i < length; ++i) { - const vector &info = sentence_fields[i]; - CHECK_EQ(info.size(), 2); - forms[i] = info[0]; - tags[i] = info[1]; - } - - SequenceInstance *instance = NULL; - if (length > 0) { - instance = new SequenceInstance; - instance->Initialize(forms, tags); - } - - return static_cast(instance); -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#include "SequenceReader.h" +#include "Utils.h" +#include +#include + +using namespace std; + +Instance *SequenceReader::GetNext() { + // Fill all fields for the entire sentence. + vector > sentence_fields; + string line; + if (is_.is_open()) { + while (!is_.eof()) { + getline(is_, line); + if (line.length() <= 0) break; + vector fields; + StringSplit(line, "\t", &fields, true); + sentence_fields.push_back(fields); + } + } + + // Sentence length. + int length = sentence_fields.size(); + + // Convert to array of words and tags. 
+ vector forms(length); + vector tags(length); + + for (int i = 0; i < length; ++i) { + const vector &info = sentence_fields[i]; + CHECK_EQ(info.size(), 2); + forms[i] = info[0]; + tags[i] = info[1]; + } + + SequenceInstance *instance = NULL; + if (length > 0) { + instance = new SequenceInstance; + instance->Initialize(forms, tags); + } + + return static_cast(instance); +} diff --git a/src/sequence/SequenceWriter.cpp b/src/sequence/SequenceWriter.cpp index c33a0da..5fcd3df 100644 --- a/src/sequence/SequenceWriter.cpp +++ b/src/sequence/SequenceWriter.cpp @@ -31,3 +31,6 @@ void SequenceWriter::Write(Instance *instance) { } os_ << endl; } + +void SequenceWriter::WriteFormatted(Pipe * pipe, Instance *instance) {} + diff --git a/src/sequence/SequenceWriter.h b/src/sequence/SequenceWriter.h index ff6c42e..2091fd4 100644 --- a/src/sequence/SequenceWriter.h +++ b/src/sequence/SequenceWriter.h @@ -28,6 +28,7 @@ class SequenceWriter : public Writer { public: void Write(Instance *instance); + void WriteFormatted(Pipe * pipe, Instance *instance); }; #endif /* SEQUENCEWRITER_H_ */ diff --git a/src/sequence/TokenDictionary.cpp b/src/sequence/TokenDictionary.cpp index 0def530..aa86241 100644 --- a/src/sequence/TokenDictionary.cpp +++ b/src/sequence/TokenDictionary.cpp @@ -62,8 +62,9 @@ void TokenDictionary::Load(FILE* fs) { if (0 > cpos_alphabet_.Load(fs)) CHECK(false); if (0 > shape_alphabet_.Load(fs)) CHECK(false); - // TODO: Remove this (only for debugging purposes) - //BuildNames(); +#ifndef NDEBUG + BuildNames(); +#endif } void TokenDictionary::Save(FILE* fs) { @@ -140,8 +141,8 @@ void TokenDictionary::Initialize(SequenceReader *reader) { // Add form to alphabet. 
std::string form = instance->GetForm(i); std::string form_lower(form); - transform(form_lower.begin(), form_lower.end(), - form_lower.begin(), ::tolower); + std::transform(form_lower.begin(), form_lower.end(), + form_lower.begin(), ::tolower); if (!form_case_sensitive) form = form_lower; id = form_alphabet.Insert(form); if (id >= form_freqs.size()) { @@ -261,4 +262,4 @@ void TokenDictionary::Initialize(SequenceReader *reader) { CHECK_LT(feats_alphabet_.size(), 0xffff); CHECK_LT(pos_alphabet_.size(), 0xff); CHECK_LT(cpos_alphabet_.size(), 0xff); -} +} diff --git a/src/tagger/TaggerDictionary.cpp b/src/tagger/TaggerDictionary.cpp index b69a5b1..f4c163b 100644 --- a/src/tagger/TaggerDictionary.cpp +++ b/src/tagger/TaggerDictionary.cpp @@ -40,7 +40,7 @@ void TaggerDictionary::CreateTagDictionary(SequenceReader *reader) { int id; string form = instance->GetForm(i); if (!form_case_sensitive) { - transform(form.begin(), form.end(), form.begin(), ::tolower); + std::transform(form.begin(), form.end(), form.begin(), ::tolower); } int word_id = token_dictionary_->GetFormId(form); //CHECK_GE(word_id, 0); @@ -92,4 +92,4 @@ void TaggerDictionary::CreateTagDictionary(SequenceReader *reader) { } } LOG(INFO) << "Number of unknown word tags: " << unknown_word_tags_.size(); -} +} diff --git a/src/tagger/TaggerDictionary.h b/src/tagger/TaggerDictionary.h index eae1b4a..475a1ad 100644 --- a/src/tagger/TaggerDictionary.h +++ b/src/tagger/TaggerDictionary.h @@ -1,110 +1,110 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. 
-// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . - -#ifndef TAGGERDICTIONARY_H_ -#define TAGGERDICTIONARY_H_ - -#include "SequenceDictionary.h" - -class TaggerDictionary : public SequenceDictionary { -public: - TaggerDictionary() {} - TaggerDictionary(Pipe* pipe) : SequenceDictionary(pipe) {} - virtual ~TaggerDictionary() {} - - void Clear() { - SequenceDictionary::Clear(); - word_tags_.clear(); - } - - void Save(FILE *fs) { - SequenceDictionary::Save(fs); - bool success; - int length = unknown_word_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int j = 0; j < unknown_word_tags_.size(); ++j) { - int tag = unknown_word_tags_[j]; - success = WriteInteger(fs, tag); - CHECK(success); - } - - length = word_tags_.size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int i = 0; i < word_tags_.size(); ++i) { - length = word_tags_[i].size(); - success = WriteInteger(fs, length); - CHECK(success); - for (int j = 0; j < word_tags_[i].size(); ++j) { - int tag = word_tags_[i][j]; - success = WriteInteger(fs, tag); - CHECK(success); - } - } - } - - void Load(FILE *fs) { - SequenceDictionary::Load(fs); - bool success; - int length; - success = ReadInteger(fs, &length); - CHECK(success); - unknown_word_tags_.resize(length); - for (int j = 0; j < unknown_word_tags_.size(); ++j) { - int tag; - success = ReadInteger(fs, &tag); - CHECK(success); - unknown_word_tags_[j] = tag; - } - success = ReadInteger(fs, &length); - CHECK(success); - word_tags_.resize(length); - for (int i = 0; i < word_tags_.size(); ++i) { - success = ReadInteger(fs, &length); - CHECK(success); - word_tags_[i].resize(length); - for 
(int j = 0; j < word_tags_[i].size(); ++j) { - int tag; - success = ReadInteger(fs, &tag); - CHECK(success); - word_tags_[i][j] = tag; - } - } - } - - void CreateTagDictionary(SequenceReader *reader); - - const vector &GetWordTags(int word) { - // return word_tags_[word]; - // TODO: Not sure is this should be done here... - // It may be cleaner to return an empty vector here and - // fill it with the unknown tags elsewhere. - if (!word_tags_[word].empty()) { - return word_tags_[word]; - } else { - return unknown_word_tags_; - } - } - -protected: - vector > word_tags_; - vector unknown_word_tags_; -}; - -#endif /* TAGGERDICTIONARY_H_ */ +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#ifndef TAGGERDICTIONARY_H_ +#define TAGGERDICTIONARY_H_ + +#include "SequenceDictionary.h" + +class TaggerDictionary : public SequenceDictionary { +public: + TaggerDictionary() {} + TaggerDictionary(Pipe* pipe) : SequenceDictionary(pipe) {} + virtual ~TaggerDictionary() {} + + void Clear() { + SequenceDictionary::Clear(); + word_tags_.clear(); + } + + void Save(FILE *fs) { + SequenceDictionary::Save(fs); + bool success; + int length = (int)unknown_word_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < unknown_word_tags_.size(); ++j) { + int tag = unknown_word_tags_[j]; + success = WriteInteger(fs, tag); + CHECK(success); + } + + length = (int)word_tags_.size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int i = 0; i < word_tags_.size(); ++i) { + length = (int)word_tags_[i].size(); + success = WriteInteger(fs, length); + CHECK(success); + for (int j = 0; j < word_tags_[i].size(); ++j) { + int tag = word_tags_[i][j]; + success = WriteInteger(fs, tag); + CHECK(success); + } + } + } + + void Load(FILE *fs) { + SequenceDictionary::Load(fs); + bool success; + int length; + success = ReadInteger(fs, &length); + CHECK(success); + unknown_word_tags_.resize(length); + for (int j = 0; j < unknown_word_tags_.size(); ++j) { + int tag; + success = ReadInteger(fs, &tag); + CHECK(success); + unknown_word_tags_[j] = tag; + } + success = ReadInteger(fs, &length); + CHECK(success); + word_tags_.resize(length); + for (int i = 0; i < word_tags_.size(); ++i) { + success = ReadInteger(fs, &length); + CHECK(success); + word_tags_[i].resize(length); + for (int j = 0; j < word_tags_[i].size(); ++j) { + int tag; + success = ReadInteger(fs, &tag); + CHECK(success); + word_tags_[i][j] = tag; + } + } + } + + void CreateTagDictionary(SequenceReader *reader); + + const vector &GetWordTags(int word) { + // return word_tags_[word]; + // TODO: Not sure is this should be done here... 
+ // It may be cleaner to return an empty vector here and + // fill it with the unknown tags elsewhere. + if (!word_tags_[word].empty()) { + return word_tags_[word]; + } else { + return unknown_word_tags_; + } + } + +protected: + vector > word_tags_; + vector unknown_word_tags_; +}; + +#endif /* TAGGERDICTIONARY_H_ */ diff --git a/src/tagger/TurboTagger.cpp b/src/tagger/TurboTagger.cpp index 7b2068a..a308b92 100644 --- a/src/tagger/TurboTagger.cpp +++ b/src/tagger/TurboTagger.cpp @@ -1,87 +1,85 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "Utils.h" -#include "TaggerPipe.h" -//#include "StringUtils.h" - -using namespace std; - -void TrainTagger(); -void TestTagger(); - -int main(int argc, char** argv) { - // Initialize Google's logging library. - google::InitGoogleLogging(argv[0]); - - // Parse command line flags. - google::ParseCommandLineFlags(&argc, &argv, true); - -#ifdef _WIN32 - google::LogToStderr(); -#endif - if (FLAGS_train) { - LOG(INFO) << "Training tagger..." << endl; - TrainTagger(); - } else if (FLAGS_test) { - LOG(INFO) << "Running tagger..." << endl; - TestTagger(); - } - - // Destroy allocated memory regarding line flags. - google::ShutDownCommandLineFlags(); - google::ShutdownGoogleLogging(); - return 0; -} - -void TrainTagger() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - TaggerOptions *options = new TaggerOptions; - options->Initialize(); - - TaggerPipe *pipe = new TaggerPipe(options); - pipe->Initialize(); - pipe->Train(); - pipe->SaveModelFile(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Training took " << static_cast(time) / 1000.0 - << " sec." 
<< endl; - - delete pipe; - delete options; -} - -void TestTagger() { - int time; - timeval start, end; - gettimeofday(&start, NULL); - - TaggerOptions *options = new TaggerOptions; - options->Initialize(); - - TaggerPipe *pipe = new TaggerPipe(options); - pipe->Initialize(); - pipe->LoadModelFile(); - pipe->Run(); - - gettimeofday(&end, NULL); - time = diff_ms(end, start); - - LOG(INFO) << "Testing took " << static_cast(time) / 1000.0 - << " sec." << endl; - - delete pipe; - delete options; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Utils.h" +#include "TaggerPipe.h" +//#include "StringUtils.h" + +using namespace std; + +void TrainTagger(); +void TestTagger(); + +int main(int argc, char** argv) { + // Initialize Google's logging library. + google::InitGoogleLogging(argv[0]); + + // Parse command line flags. + google::ParseCommandLineFlags(&argc, &argv, true); + +#ifdef _WIN32 + google::LogToStderr(); +#endif + if (FLAGS_train) { + LOG(INFO) << "Training tagger..." << endl; + TrainTagger(); + } else if (FLAGS_test) { + LOG(INFO) << "Running tagger..." << endl; + TestTagger(); + } + + // Destroy allocated memory regarding line flags. + google::ShutDownCommandLineFlags(); + google::ShutdownGoogleLogging(); + return 0; +} + +void TrainTagger() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + TaggerOptions *options = new TaggerOptions; + options->Initialize(); + + TaggerPipe *pipe = new TaggerPipe(options); + pipe->Initialize(); + pipe->Train(); + pipe->SaveModelFile(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Training took " << time << " sec." 
<< endl; +} + +void TestTagger() { + double time; + chronowrap::Chronometer chrono; + chrono.GetTime(); + + TaggerOptions *options = new TaggerOptions; + options->Initialize(); + + TaggerPipe *pipe = new TaggerPipe(options); + pipe->Initialize(); + pipe->LoadModelFile(); + pipe->Run(); + + delete pipe; + delete options; + + chrono.StopTime(); + time = chrono.GetElapsedTime(); + + LOG(INFO) << "Testing took " << time << " sec." << endl; +} diff --git a/src/util/SerializationUtils.cpp b/src/util/SerializationUtils.cpp index 0e45a0a..ec050ad 100644 --- a/src/util/SerializationUtils.cpp +++ b/src/util/SerializationUtils.cpp @@ -1,111 +1,129 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#include "SerializationUtils.h" -#include - -bool WriteString(FILE *fs, const std::string& data) { - const char *buffer = data.c_str(); - int length = strlen(buffer); - if (1 != fwrite(&length, sizeof(int), 1, fs)) return false; - if (length != fwrite(buffer, sizeof(char), length, fs)) return false; - return true; -} - -bool WriteBool(FILE *fs, bool value) { - if (1 != fwrite(&value, sizeof(bool), 1, fs)) return false; - return true; -} - -bool WriteInteger(FILE *fs, int value) { - if (1 != fwrite(&value, sizeof(int), 1, fs)) return false; - return true; -} - -bool WriteUINT8(FILE *fs, uint8_t value) { - if (1 != fwrite(&value, sizeof(uint8_t), 1, fs)) return false; - return true; -} - -bool WriteUINT64(FILE *fs, uint64_t value) { - if (1 != fwrite(&value, sizeof(uint64_t), 1, fs)) return false; - return true; -} - -bool WriteDouble(FILE *fs, double value) { - if (1 != fwrite(&value, sizeof(double), 1, fs)) return false; - return true; -} - -bool WriteIntegerVector(FILE *fs, const std::vector &values) { - int length = values.size(); - if (!WriteInteger(fs, length)) return false; - for (int i = 0; i < length; ++i) { - int value = values[i]; - if (!WriteInteger(fs, value)) return false; - } - return true; -} - -bool ReadString(FILE *fs, std::string *data) { - int length; - if (1 != fread(&length, sizeof(int), 1, fs)) return false; - char *buffer = new char[length + 1]; - if (length != fread(buffer, sizeof(char), length, fs)) return false; - buffer[length] = '\0'; - (*data).assign(buffer, length); //*data = buffer; - delete[] buffer; - return true; -} - -bool ReadBool(FILE *fs, bool *value) { - if (1 != fread(value, sizeof(bool), 1, fs)) return false; - return true; -} - -bool ReadInteger(FILE *fs, int *value) { - if (1 != fread(value, sizeof(int), 1, fs)) return false; - return true; -} - -bool ReadUINT8(FILE *fs, uint8_t *value) { - if (1 != fread(value, sizeof(uint8_t), 1, fs)) return false; - return true; -} - -bool ReadUINT64(FILE *fs, uint64_t *value) { - if 
(1 != fread(value, sizeof(uint64_t), 1, fs)) return false; - return true; -} - -bool ReadDouble(FILE *fs, double *value) { - if (1 != fread(value, sizeof(double), 1, fs)) return false; - return true; -} - -bool ReadIntegerVector(FILE *fs, std::vector *values) { - int length; - if (!ReadInteger(fs, &length)) return false; - values->resize(length); - for (int i = 0; i < length; ++i) { - int value; - if (!ReadInteger(fs, &value)) return false; - (*values)[i] = value; - } - return true; -} +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . 
+ +#include "SerializationUtils.h" +#include + +bool WriteString(FILE *fs, const std::string& data) { + const char *buffer = data.c_str(); + int length = strlen(buffer); + if (1 != fwrite(&length, sizeof(int), 1, fs)) + return false; + if (length != fwrite(buffer, sizeof(char), length, fs)) + return false; + return true; +} + +bool WriteBool(FILE *fs, bool value) { + if (1 != fwrite(&value, sizeof(bool), 1, fs)) + return false; + return true; +} + +bool WriteInteger(FILE *fs, int value) { + if (1 != fwrite(&value, sizeof(int), 1, fs)) + return false; + return true; +} + +bool WriteUINT8(FILE *fs, uint8_t value) { + if (1 != fwrite(&value, sizeof(uint8_t), 1, fs)) + return false; + return true; +} + +bool WriteUINT64(FILE *fs, uint64_t value) { + if (1 != fwrite(&value, sizeof(uint64_t), 1, fs)) + return false; + return true; +} + +bool WriteDouble(FILE *fs, double value) { + if (1 != fwrite(&value, sizeof(double), 1, fs)) + return false; + return true; +} + +bool WriteIntegerVector(FILE *fs, const std::vector &values) { + int length = values.size(); + if (!WriteInteger(fs, length)) + return false; + for (int i = 0; i < length; ++i) { + int value = values[i]; + if (!WriteInteger(fs, value)) + return false; + } + return true; +} + +bool ReadString(FILE *fs, std::string *data) { + int length; + if (1 != fread(&length, sizeof(int), 1, fs)) + return false; + char *buffer = new char[length + 1]; + if (length != fread(buffer, sizeof(char), length, fs)) + return false; + buffer[length] = '\0'; + (*data).assign(buffer, length); //*data = buffer; + delete[] buffer; + return true; +} + +bool ReadBool(FILE *fs, bool *value) { + if (1 != fread(value, sizeof(bool), 1, fs)) + return false; + return true; +} + +bool ReadInteger(FILE *fs, int *value) { + if (1 != fread(value, sizeof(int), 1, fs)) + return false; + return true; +} + +bool ReadUINT8(FILE *fs, uint8_t *value) { + if (1 != fread(value, sizeof(uint8_t), 1, fs)) + return false; + return true; +} + +bool ReadUINT64(FILE 
*fs, uint64_t *value) { + if (1 != fread(value, sizeof(uint64_t), 1, fs)) + return false; + return true; +} + +bool ReadDouble(FILE *fs, double *value) { + if (1 != fread(value, sizeof(double), 1, fs)) + return false; + return true; +} + +bool ReadIntegerVector(FILE *fs, std::vector *values) { + int length; + if (!ReadInteger(fs, &length)) + return false; + values->resize(length); + for (int i = 0; i < length; ++i) { + int value; + if (!ReadInteger(fs, &value)) + return false; + (*values)[i] = value; + } + return true; +} diff --git a/src/util/TimeUtils.cpp b/src/util/TimeUtils.cpp index 5be6315..4aa53b7 100644 --- a/src/util/TimeUtils.cpp +++ b/src/util/TimeUtils.cpp @@ -15,64 +15,5 @@ // // You should have received a copy of the GNU Lesser General Public License // along with TurboParser 2.3. If not, see . -#ifdef _WIN32 -#include -#endif -#include "TimeUtils.h" -#ifdef _WIN32 - -#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 -#else -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -struct timezone { - int tz_minuteswest; /* minutes W of Greenwich */ - int tz_dsttime; /* type of dst correction */ -}; -#if 0 -int gettimeofday(struct timeval *tv, struct timezone *tz) { - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tmpres /= 10; /*convert into microseconds*/ - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - if (NULL != tz) { - if (!tzflag) { - _tzset(); - tzflag++; - } - tz->tz_minuteswest = _timezone / 60; - tz->tz_dsttime = _daylight; - } - - return 0; -} -#endif -#endif -// Time difference in milliseconds. 
-int diff_ms(timeval t1, timeval t2) { - return (((t1.tv_sec - t2.tv_sec) * 1000000) + - (t1.tv_usec - t2.tv_usec)) / 1000; -} - -// Time difference in microseconds. -int diff_us(timeval t1, timeval t2) { - return (((t1.tv_sec - t2.tv_sec) * 1000000000) + - (t1.tv_usec - t2.tv_usec)); -} +#include "TimeUtils.h" diff --git a/src/util/TimeUtils.h b/src/util/TimeUtils.h index 974a719..955e9c2 100644 --- a/src/util/TimeUtils.h +++ b/src/util/TimeUtils.h @@ -19,26 +19,6 @@ #ifndef TIMEUTILS_H #define TIMEUTILS_H -#ifdef _WIN32 -#include -#else -#include -#endif - -#ifdef _WIN32 -//#include //I've ommited this line. -#ifndef _WINSOCKAPI_ -struct timeval { - long tv_sec; /* seconds */ - long tv_usec; /* and microseconds */ -}; -#endif -extern int gettimeofday(struct timeval *tv, struct timezone *tz); -#endif -using namespace std; - -extern int diff_ms(timeval t1, timeval t2); - -extern int diff_us(timeval t1, timeval t2); +#include "chrono.h" #endif // TIME_UTILS_H diff --git a/src/util/Utils.h b/src/util/Utils.h index cd2ae11..88b5fe5 100644 --- a/src/util/Utils.h +++ b/src/util/Utils.h @@ -1,27 +1,61 @@ -// Copyright (c) 2012-2015 Andre Martins -// All Rights Reserved. -// -// This file is part of TurboParser 2.3. -// -// TurboParser 2.3 is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// TurboParser 2.3 is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with TurboParser 2.3. If not, see . 
- -#ifndef UTILS_H -#define UTILS_H - -#include -#include -#include "TimeUtils.h" -#include "StringUtils.h" - -#endif // UTILS_H +// Copyright (c) 2012-2015 Andre Martins +// All Rights Reserved. +// +// This file is part of TurboParser 2.3. +// +// TurboParser 2.3 is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// TurboParser 2.3 is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with TurboParser 2.3. If not, see . + +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include "TimeUtils.h" +#include "StringUtils.h" + +class GlogIsInit { +public: + static GlogIsInit & Instance() { + static GlogIsInit instance; + return instance; + } + + static bool Value() { + return Instance().value_; + } + + static void Set(bool value) { + Instance().value_.store(value); + } +private: + GlogIsInit() {}; + std::atomic_bool value_{ false }; +}; + +static void InitGlog(const char * logging_name) { + + + if (!GlogIsInit::Value()) { + //fLB::FLAGS_colorlogtostderr = true; + fLB::FLAGS_alsologtostderr = true; + fLI::FLAGS_stderrthreshold = 0; + fLI::FLAGS_minloglevel = 0; + fLI::FLAGS_v = 0; + google::InitGoogleLogging(logging_name); + GlogIsInit::Set(true); + } +} + +#endif // UTILS_H diff --git a/src/util/chrono.h b/src/util/chrono.h new file mode 100644 index 0000000..eaeec74 --- /dev/null +++ b/src/util/chrono.h @@ -0,0 +1,106 @@ +/** +* @file chrono.h +* @author David Alberto Nogueira (dan) +* @brief std::chrono wrapper. 
+* +* USAGE: +* @code{.cpp} +* chronowrap::Chronometer chrono; //Declare a Chronometer +* chrono.GetTime(); //Start timer +* { +* ... //do your code +* } +* chrono.StopTime(); //Stop timer +* std::cout << "Time: " << chrono.GetElapsedTime() +* << " sec." << std::endl; //Print duration + +* @endcode +* +* @copyright Copyright (c) 2016, David Alberto Nogueira. +* All rights reserved. See licence below. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* (1) Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* (2) Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* +* (3) The name of the author may not be used to +* endorse or promote products derived from this software without +* specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef CHRONO_H +#define CHRONO_H +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif + +namespace chronowrap { +class Chronometer { +public: + Chronometer() { + time_span = std::chrono::steady_clock::duration::zero(); + }; + virtual ~Chronometer() {}; + + void GetTime() { + clock_begin = std::chrono::steady_clock::now(); + } + void StopTime() { + std::chrono::steady_clock::time_point clock_end = + std::chrono::steady_clock::now(); + time_span += clock_end - clock_begin; + } + //Return elapsed time in seconds + double GetElapsedTime() { + return double(time_span.count()) * resolution; + } + void Reset() { + time_span = std::chrono::steady_clock::duration::zero(); + } + //in us + double GetClockResolutionUS() { + return resolution*1e6; + } + void PrintClockResolution() { + std::cout << "clock::period: " << GetClockResolutionUS() << " us.\n"; + } + bool IsClockSteady() { + return std::chrono::steady_clock::is_steady; + } + void PrintClockSteady() { + printf("clock::is_steady: %s\n", IsClockSteady() ? 
"yes" : "no"); + } + +protected: + std::chrono::steady_clock::time_point clock_begin; + std::chrono::steady_clock::duration time_span; + const double resolution = double(std::chrono::steady_clock::period::num) / + double(std::chrono::steady_clock::period::den); +}; +} + +#endif // CHRONO_H diff --git a/vsprojects/classifier/classifier.vcxproj b/vsprojects/classifier/classifier.vcxproj index 130e3ec..28c72db 100644 --- a/vsprojects/classifier/classifier.vcxproj +++ b/vsprojects/classifier/classifier.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -42,40 +42,40 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -129,7 +129,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) /D GFLAGS_DLL_DECL= /D GFLAGS_DLL_DECLARE_FLAG= /D GFLAGS_DLL_DEFINE_FLAG= %(AdditionalOptions) Disabled @@ -185,7 +185,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) /D GFLAGS_DLL_DECL= /D GFLAGS_DLL_DECLARE_FLAG= /D GFLAGS_DLL_DEFINE_FLAG= %(AdditionalOptions) $(SolutionDir)..\src\util;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable @@ -208,7 +208,7 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) /D GFLAGS_DLL_DECL= /D 
GFLAGS_DLL_DECLARE_FLAG= /D GFLAGS_DLL_DEFINE_FLAG= %(AdditionalOptions) $(SolutionDir)..\src\util;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled diff --git a/vsprojects/constituency_labeler/constituency_labeler.vcxproj b/vsprojects/constituency_labeler/constituency_labeler.vcxproj index 3dfd1d5..204fc3c 100644 --- a/vsprojects/constituency_labeler/constituency_labeler.vcxproj +++ b/vsprojects/constituency_labeler/constituency_labeler.vcxproj @@ -56,20 +56,20 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -134,7 +134,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -187,7 +187,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) @@ -205,7 +205,7 @@ Disabled 
true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled diff --git a/vsprojects/coreference_resolver/coreference_resolver.vcxproj b/vsprojects/coreference_resolver/coreference_resolver.vcxproj index 84376b7..4c9a22a 100644 --- a/vsprojects/coreference_resolver/coreference_resolver.vcxproj +++ b/vsprojects/coreference_resolver/coreference_resolver.vcxproj @@ -56,20 +56,20 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -134,7 +134,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -185,7 +185,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) @@ -203,7 +203,7 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled diff --git a/vsprojects/dependency_labeler/dependency_labeler.vcxproj b/vsprojects/dependency_labeler/dependency_labeler.vcxproj index 4dacdb3..3dd7851 100644 --- a/vsprojects/dependency_labeler/dependency_labeler.vcxproj +++ b/vsprojects/dependency_labeler/dependency_labeler.vcxproj @@ -56,20 +56,20 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -134,7 +134,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled @@ -185,7 +185,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) @@ -203,7 +203,7 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled diff --git a/vsprojects/entity_recognizer/entity_recognizer.vcxproj b/vsprojects/entity_recognizer/entity_recognizer.vcxproj index e60c0df..29494c6 100644 --- a/vsprojects/entity_recognizer/entity_recognizer.vcxproj +++ b/vsprojects/entity_recognizer/entity_recognizer.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -35,40 +35,40 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode 
StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -132,7 +132,7 @@ NotUsing Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled @@ -187,7 +187,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable Speed @@ -209,7 +209,7 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -226,6 +226,7 @@ + @@ -236,6 +237,7 @@ + diff --git a/vsprojects/libturboparser/libturboparser.vcxproj b/vsprojects/libturboparser/libturboparser.vcxproj 
index 6e99a18..3944d96 100644 --- a/vsprojects/libturboparser/libturboparser.vcxproj +++ b/vsprojects/libturboparser/libturboparser.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -35,40 +35,40 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -115,7 +115,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\morphological_tagger;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\tagger;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\src\coreference_resolver;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -168,7 +168,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\morphological_tagger;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\tagger;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\src\coreference_resolver;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable Speed @@ -190,11 
+190,10 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\morphological_tagger;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\tagger;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\src\coreference_resolver;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled - - + Neither false false false @@ -225,6 +224,23 @@ + + + + + + + + + + + + + + + + + @@ -235,6 +251,15 @@ + + + + + + + + + @@ -298,6 +323,20 @@ + + + + + + + + + + + + + + @@ -306,6 +345,14 @@ + + + + + + + + @@ -345,6 +392,9 @@ + + + diff --git a/vsprojects/morphological_tagger/morphological_tagger.vcxproj b/vsprojects/morphological_tagger/morphological_tagger.vcxproj index a33c698..623c319 100644 --- a/vsprojects/morphological_tagger/morphological_tagger.vcxproj +++ b/vsprojects/morphological_tagger/morphological_tagger.vcxproj @@ -1,252 +1,252 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Release O0 - Win32 - - - Release O0 - x64 - - - Release - Win32 - - - Release - x64 - - - - {ECAF13E4-5AB5-4F92-9EFE-565E79F0CD21} - Win32Proj - morphological_tagger - - - - StaticLibrary - true - v120 - Unicode - - - StaticLibrary - true - v120 - Unicode - - - StaticLibrary - false - v120 - true - Unicode - - - StaticLibrary - false - v120 - true - Unicode - - - StaticLibrary - false - v120 - true - Unicode - - - StaticLibrary - false - v120 - true - Unicode - - - - - - - - - - - - - - - - - - - - - - - - - true - - - true - - - false - - - false - - - false - - - false - - - - NotUsing - Level3 - Disabled - 
WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) - $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) - - - Console - true - ..\..\deps\glog-0.3.2\Debug;..\..\deps\gflags-2.0\Debug;..\..\deps\AD3-2.0.2\vsprojects\Debug - AD3.lib;libgflags-debug.lib;libglog.lib;%(AdditionalDependencies) - - - - - NotUsing - Level3 - Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) - $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) - Disabled - - - Console - true - ..\..\deps\glog-0.3.2\Debug;..\..\deps\gflags-2.0\Debug;..\..\deps\AD3-2.0.2\vsprojects\Debug - AD3.lib;libgflags-debug.lib;libglog.lib;%(AdditionalDependencies) - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) - $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) - - - Windows - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) - 
$(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) - - - Windows - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) - $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) - AnySuitable - Speed - true - true - - - Windows - true - true - true - - - - - Level3 - - - Disabled - true - false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) - $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) - Disabled - - - false - false - false - - - Windows - true - true - true - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Release O0 + Win32 + + + Release O0 + x64 + + + Release + Win32 + + + Release + x64 + + + + {ECAF13E4-5AB5-4F92-9EFE-565E79F0CD21} + Win32Proj + morphological_tagger + + + + StaticLibrary + true + v140 + Unicode + + + StaticLibrary + true + v140 + Unicode + + + StaticLibrary + false + v140 + true + Unicode + + + StaticLibrary + false + v140 + true + Unicode + + + StaticLibrary + false + 
v140 + true + Unicode + + + StaticLibrary + false + v140 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + false + + + false + + + + NotUsing + Level3 + Disabled + WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) + $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) + + + Console + true + ..\..\deps\glog-0.3.2\Debug;..\..\deps\gflags-2.0\Debug;..\..\deps\AD3-2.0.2\vsprojects\Debug + AD3.lib;libgflags-debug.lib;libglog.lib;%(AdditionalDependencies) + + + + + NotUsing + Level3 + Disabled + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) + $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) + Disabled + + + Console + true + ..\..\deps\glog-0.3.2\Debug;..\..\deps\gflags-2.0\Debug;..\..\deps\AD3-2.0.2\vsprojects\Debug + AD3.lib;libgflags-debug.lib;libglog.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) + $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) + + + Windows + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) + 
$(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) + + + Windows + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) + $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) + AnySuitable + Speed + true + true + + + Windows + true + true + true + + + + + Level3 + + + Disabled + true + false + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) + $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) + Disabled + + + false + false + false + + + Windows + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/vsprojects/parser/parser.vcxproj b/vsprojects/parser/parser.vcxproj index 08f9961..4cead66 100644 --- a/vsprojects/parser/parser.vcxproj +++ b/vsprojects/parser/parser.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -33,35 +33,35 @@ StaticLibrary true - v120 + v140 StaticLibrary true - v120 + v140 Unicode StaticLibrary false - 
v120 + v140 StaticLibrary false - v120 + v140 StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -100,7 +100,7 @@ $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) Level3 Disabled @@ -137,7 +137,7 @@ true true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) @@ -153,7 +153,7 @@ false false false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) Disabled diff --git a/vsprojects/semantic_parser/semantic_parser.vcxproj b/vsprojects/semantic_parser/semantic_parser.vcxproj index 00e2e51..5dedbc6 100644 --- a/vsprojects/semantic_parser/semantic_parser.vcxproj +++ b/vsprojects/semantic_parser/semantic_parser.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -35,40 +35,40 @@ Application true - v120 + v140 Unicode StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -134,7 +134,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled @@ -189,7 +189,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable Speed @@ -211,7 +211,7 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled diff --git a/vsprojects/sequence/sequence.vcxproj b/vsprojects/sequence/sequence.vcxproj index 786f51e..dc8075d 100644 --- a/vsprojects/sequence/sequence.vcxproj +++ b/vsprojects/sequence/sequence.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -35,40 +35,40 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + 
v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -115,7 +115,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -168,7 +168,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable Speed @@ -190,7 +190,7 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled diff --git a/vsprojects/tagger/tagger.vcxproj b/vsprojects/tagger/tagger.vcxproj index f836eb7..65a731b 100644 --- a/vsprojects/tagger/tagger.vcxproj +++ b/vsprojects/tagger/tagger.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -41,40 +41,40 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 
true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -141,7 +141,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) /wd4018 /D GFLAGS_DLL_DECL= /D GFLAGS_DLL_DECLARE_FLAG= /D GFLAGS_DLL_DEFINE_FLAG= %(AdditionalOptions) Disabled @@ -205,7 +205,7 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) /D GFLAGS_DLL_DECL= /D GFLAGS_DLL_DECLARE_FLAG= /D GFLAGS_DLL_DEFINE_FLAG= %(AdditionalOptions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable @@ -231,7 +231,7 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) /D GFLAGS_DLL_DECL= /D GFLAGS_DLL_DECLARE_FLAG= /D GFLAGS_DLL_DEFINE_FLAG= %(AdditionalOptions) 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled diff --git a/vsprojects/turbo_constituency_labeler/turbo_constituency_labeler.vcxproj b/vsprojects/turbo_constituency_labeler/turbo_constituency_labeler.vcxproj index e893a79..700a21d 100644 --- a/vsprojects/turbo_constituency_labeler/turbo_constituency_labeler.vcxproj +++ b/vsprojects/turbo_constituency_labeler/turbo_constituency_labeler.vcxproj @@ -56,20 +56,20 @@ Application true - v120 + v140 Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode @@ -134,14 +134,14 @@ Level3 Disabled - WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled Console true - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mddx64.lib;libgflags_140mddx64.lib;libglog_static_140mddx64.lib;gtest-md_140mddx64.lib;%(AdditionalDependencies) ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug @@ -187,7 +187,7 @@ MaxSpeed true true - 
WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) @@ -196,7 +196,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) @@ -207,7 +207,7 @@ Disabled true false - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -220,7 +220,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) diff 
--git a/vsprojects/turbo_coreference_resolver/turbo_coreference_resolver.vcxproj b/vsprojects/turbo_coreference_resolver/turbo_coreference_resolver.vcxproj index ecd9a3c..1c28bca 100644 --- a/vsprojects/turbo_coreference_resolver/turbo_coreference_resolver.vcxproj +++ b/vsprojects/turbo_coreference_resolver/turbo_coreference_resolver.vcxproj @@ -56,20 +56,20 @@ Application true - v120 + v140 Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode @@ -134,14 +134,14 @@ Level3 Disabled - WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled Console true - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mddx64.lib;libgflags_140mddx64.lib;libglog_static_140mddx64.lib;gtest-md_140mddx64.lib;%(AdditionalDependencies) ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug @@ -187,7 +187,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) @@ -196,7 +196,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) @@ -207,7 +207,7 @@ Disabled true false - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\src\semantic_parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -220,7 +220,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) diff --git a/vsprojects/turbo_dependency_labeler/turbo_dependency_labeler.vcxproj 
b/vsprojects/turbo_dependency_labeler/turbo_dependency_labeler.vcxproj index f096830..3902bcb 100644 --- a/vsprojects/turbo_dependency_labeler/turbo_dependency_labeler.vcxproj +++ b/vsprojects/turbo_dependency_labeler/turbo_dependency_labeler.vcxproj @@ -56,20 +56,20 @@ Application true - v120 + v140 Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode @@ -134,14 +134,14 @@ Level3 Disabled - WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled Console true - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mddx64.lib;libgflags_140mddx64.lib;libglog_static_140mddx64.lib;gtest-md_140mddx64.lib;%(AdditionalDependencies) ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug @@ -187,7 +187,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) @@ 
-196,7 +196,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) @@ -207,7 +207,7 @@ Disabled true false - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -220,7 +220,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) diff --git a/vsprojects/turbo_entity_recognizer/turbo_entity_recognizer.vcxproj b/vsprojects/turbo_entity_recognizer/turbo_entity_recognizer.vcxproj index 770b1ec..4337ecb 100644 --- a/vsprojects/turbo_entity_recognizer/turbo_entity_recognizer.vcxproj +++ b/vsprojects/turbo_entity_recognizer/turbo_entity_recognizer.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -36,40 +36,40 @@ Application true - v120 + v140 Unicode Application true - v120 + v140 Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode Application false 
- v120 + v140 true Unicode @@ -135,7 +135,7 @@ Level3 Disabled - WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled @@ -143,7 +143,7 @@ Console true ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mddx64.lib;libgflags_140mddx64.lib;libglog_static_140mddx64.lib;gtest-md_140mddx64.lib;%(AdditionalDependencies) @@ -194,7 +194,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable Speed @@ -207,7 +207,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) @@ -218,7 +218,7 @@ Disabled true false - 
WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -233,7 +233,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) diff --git a/vsprojects/turbo_morphological_tagger/turbo_morphological_tagger.vcxproj b/vsprojects/turbo_morphological_tagger/turbo_morphological_tagger.vcxproj index 8e09301..6e72ca8 100644 --- a/vsprojects/turbo_morphological_tagger/turbo_morphological_tagger.vcxproj +++ b/vsprojects/turbo_morphological_tagger/turbo_morphological_tagger.vcxproj @@ -1,265 +1,265 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Release O0 - Win32 - - - Release O0 - x64 - - - Release - Win32 - - - Release - x64 - - - - {B9C929EE-DAA3-443C-BE47-4B7CEF435164} - Win32Proj - turbo_morphological_tagger - turbo_morphological_tagger - - - - Application - true - v120 - Unicode - - - Application - true - v120 - Unicode - - - Application - false - v120 - true - Unicode - - - Application - false - v120 - true - Unicode - - - Application - false - v120 - true - Unicode - - - Application - false - v120 - true - Unicode - - - - - - - - - - - - - - - - - - - - - - - - - true - - - true - - - false - - - false - - - false - - - false - - - - - - Level3 - Disabled - 
WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) - $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) - - - Console - true - ..\..\deps\glog-0.3.2\Debug;..\..\deps\gflags-2.0\Debug;..\..\deps\AD3-2.0.2\vsprojects\Debug - AD3.lib;libgflags-debug.lib;libglog.lib;%(AdditionalDependencies) - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) - $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) - Disabled - - - Console - true - ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) - $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) - - - Console - true - true - true - ..\..\deps\glog-0.3.2\Release;..\..\deps\gflags-2.0\Release;..\..\deps\AD3-2.0.2\vsprojects\Release - AD3.lib;libgflags.lib;libglog.lib;%(AdditionalDependencies) - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) - 
$(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) - - - Console - true - true - true - ..\..\deps\glog-0.3.2\Release;..\..\deps\gflags-2.0\Release;..\..\deps\AD3-2.0.2\vsprojects\Release - AD3.lib;libgflags.lib;libglog.lib;%(AdditionalDependencies) - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) - $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) - AnySuitable - Speed - true - true - - - Console - true - true - true - ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) - - - - - Level3 - - - Disabled - true - false - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) - $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) - Disabled - - - false - false - false - - - Console - true - true - true - 
..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) - - - - - - - - {572f3882-6014-4fb1-ae04-01a17d4eb041} - - - {26e7dfe3-2fc3-43c0-8499-b6a59669f20a} - - - {ecaf13e4-5ab5-4f92-9efe-565e79f0cd21} - - - {a17d3ac6-4c67-4615-8d6b-ddeabbb7faa5} - - - {6ec765b9-c3cd-4a5d-94c8-851333497db1} - - - {9811eff4-9d77-4000-b2b3-29155a8d722a} - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Release O0 + Win32 + + + Release O0 + x64 + + + Release + Win32 + + + Release + x64 + + + + {B9C929EE-DAA3-443C-BE47-4B7CEF435164} + Win32Proj + turbo_morphological_tagger + turbo_morphological_tagger + + + + Application + true + v140 + Unicode + + + Application + true + v140 + Unicode + + + Application + false + v140 + true + Unicode + + + Application + false + v140 + true + Unicode + + + Application + false + v140 + true + Unicode + + + Application + false + v140 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) + $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) + + + Console + true + ..\..\deps\glog-0.3.2\Debug;..\..\deps\gflags-2.0\Debug;..\..\deps\AD3-2.0.2\vsprojects\Debug + AD3.lib;libgflags-debug.lib;libglog.lib;%(AdditionalDependencies) + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) + 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) + Disabled + + + Console + true + ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug + AD3_140mddx64.lib;libgflags_140mddx64.lib;libglog_static_140mddx64.lib;gtest-md_140mddx64.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) + $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) + + + Console + true + true + true + ..\..\deps\glog-0.3.2\Release;..\..\deps\gflags-2.0\Release;..\..\deps\AD3-2.0.2\vsprojects\Release + AD3.lib;libgflags.lib;libglog.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) + $(SolutionDir)..\src\parser;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\util;$(SolutionDir)..\deps\\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;%(AdditionalIncludeDirectories) + + + Console + true + true + true + ..\..\deps\glog-0.3.2\Release;..\..\deps\gflags-2.0\Release;..\..\deps\AD3-2.0.2\vsprojects\Release + AD3.lib;libgflags.lib;libglog.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + 
WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) + $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) + AnySuitable + Speed + true + true + + + Console + true + true + true + ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) + + + + + Level3 + + + Disabled + true + false + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) + $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) + Disabled + + + false + false + false + + + Console + true + true + true + ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) + + + + + + + + {572f3882-6014-4fb1-ae04-01a17d4eb041} + + + {26e7dfe3-2fc3-43c0-8499-b6a59669f20a} + + + {ecaf13e4-5ab5-4f92-9efe-565e79f0cd21} + + + {a17d3ac6-4c67-4615-8d6b-ddeabbb7faa5} + + + 
{6ec765b9-c3cd-4a5d-94c8-851333497db1} + + + {9811eff4-9d77-4000-b2b3-29155a8d722a} + + + + + \ No newline at end of file diff --git a/vsprojects/turbo_parser/turbo_parser.vcxproj b/vsprojects/turbo_parser/turbo_parser.vcxproj index 1047438..71d1310 100644 --- a/vsprojects/turbo_parser/turbo_parser.vcxproj +++ b/vsprojects/turbo_parser/turbo_parser.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -36,40 +36,40 @@ Application true - v120 + v140 Unicode Application true - v120 + v140 Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode @@ -135,14 +135,14 @@ Level3 Disabled - WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled Console true - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mddx64.lib;libgflags_140mddx64.lib;libglog_static_140mddx64.lib;gtest-md_140mddx64.lib;%(AdditionalDependencies) ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug @@ -194,7 +194,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable Speed @@ -207,7 +207,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) @@ -218,7 +218,7 @@ Disabled true false - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -233,7 +233,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) diff --git a/vsprojects/turbo_semantic_parser/turbo_semantic_parser.vcxproj b/vsprojects/turbo_semantic_parser/turbo_semantic_parser.vcxproj index 769b66a..6ea0d91 100644 --- 
a/vsprojects/turbo_semantic_parser/turbo_semantic_parser.vcxproj +++ b/vsprojects/turbo_semantic_parser/turbo_semantic_parser.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -59,40 +59,40 @@ Application true - v120 + v140 Unicode Application true - v120 + v140 Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode @@ -158,7 +158,7 @@ Level3 Disabled - WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled @@ -166,7 +166,7 @@ Console true ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mddx64.lib;libgflags_140mddx64.lib;libglog_static_140mddx64.lib;gtest-md_140mddx64.lib;%(AdditionalDependencies) @@ -217,7 +217,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) @@ -225,7 +225,7 @@ true true true - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release @@ -237,7 +237,7 @@ Disabled true false - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -249,7 +249,7 @@ true true true - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release diff --git a/vsprojects/turbo_tagger/turbo_tagger.vcxproj b/vsprojects/turbo_tagger/turbo_tagger.vcxproj index 6f163ba..15c9f55 100644 --- 
a/vsprojects/turbo_tagger/turbo_tagger.vcxproj +++ b/vsprojects/turbo_tagger/turbo_tagger.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -36,40 +36,40 @@ Application true - v120 + v140 Unicode Application true - v120 + v140 Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode Application false - v120 + v140 true Unicode @@ -135,7 +135,7 @@ Level3 Disabled - WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;%(AdditionalIncludeDirectories) Disabled @@ -143,7 +143,7 @@ Console true ..\..\deps\glog-0.3.2\x64\Debug;..\..\deps\gflags-2.0\x64\Debug;..\..\deps\AD3-2.0.2\vsprojects\x64\Debug;..\..\deps\googletest\msvc\x64\Debug - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mddx64.lib;libgflags_140mddx64.lib;libglog_static_140mddx64.lib;gtest-md_140mddx64.lib;%(AdditionalDependencies) @@ -194,7 +194,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) 
$(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable Speed @@ -207,7 +207,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) @@ -218,7 +218,7 @@ Disabled true false - WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\src\util;$(SolutionDir)..\src\classifier;$(SolutionDir)..\src\sequence;$(SolutionDir)..\src\entity_recognizer;$(SolutionDir)..\src\parser;$(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\AD3-2.0.2\;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\eigen-eigen-c58038c56923\;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -233,7 +233,7 @@ true true ..\..\deps\glog-0.3.2\x64\Release;..\..\deps\gflags-2.0\x64\Release;..\..\deps\AD3-2.0.2\vsprojects\x64\Release;..\..\deps\googletest\msvc\x64\Release - AD3.lib;libgflags.lib;libglog_static.lib;gtest.lib;%(AdditionalDependencies) + AD3_140mdx64.lib;libgflags_140mdx64.lib;libglog_static_140mdx64.lib;gtest-md_140mdx64.lib;%(AdditionalDependencies) diff --git a/vsprojects/turboparser.sln b/vsprojects/turboparser.sln index 6d0fd80..fd0999c 100644 --- a/vsprojects/turboparser.sln +++ 
b/vsprojects/turboparser.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 14 -VisualStudioVersion = 14.0.23107.0 +VisualStudioVersion = 14.0.25420.1 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "classifier", "classifier\classifier.vcxproj", "{572F3882-6014-4FB1-AE04-01A17D4EB041}" EndProject diff --git a/vsprojects/util/util.vcxproj b/vsprojects/util/util.vcxproj index 842f2ba..537ec66 100644 --- a/vsprojects/util/util.vcxproj +++ b/vsprojects/util/util.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -41,40 +41,40 @@ StaticLibrary true - v120 + v140 Unicode StaticLibrary true - v120 + v140 Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode StaticLibrary false - v120 + v140 true Unicode @@ -121,7 +121,7 @@ Level3 Disabled - WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + WIN32;_DEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) $(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -174,8 +174,8 @@ MaxSpeed true true - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) - $(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) + WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) + $(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) AnySuitable Speed true @@ -196,8 +196,8 @@ Disabled true false - WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) - $(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) + 
WIN32;NDEBUG;_LIB;GOOGLE_GLOG_DLL_DECL=;GFLAGS_DLL_DECL=;%(PreprocessorDefinitions) + $(SolutionDir)..\deps\AD3-2.0.2\ad3;$(SolutionDir)..\deps\glog-0.3.2\src\windows;$(SolutionDir)..\deps\gflags-2.0\src\windows;$(SolutionDir)..\deps\googletest\src;%(AdditionalIncludeDirectories) Disabled @@ -223,6 +223,7 @@ + diff --git a/vsprojects/util/util.vcxproj.filters b/vsprojects/util/util.vcxproj.filters index 546471a..1de28f4 100644 --- a/vsprojects/util/util.vcxproj.filters +++ b/vsprojects/util/util.vcxproj.filters @@ -1,54 +1,57 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + \ No newline at end of file