From 1ff57a39bd2e255fcbfce9d1d803cedd92b54f0e Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Thu, 5 Feb 2015 16:35:11 -0500 Subject: [PATCH 1/9] improved CLI for loop_thresholds script --- cpp/loop_thresholds.sh | 97 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/cpp/loop_thresholds.sh b/cpp/loop_thresholds.sh index 6268190..bf79735 100755 --- a/cpp/loop_thresholds.sh +++ b/cpp/loop_thresholds.sh @@ -21,17 +21,100 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. +usage() { + echo 1>&2 "USAGE: $0 [options] +Loop over many thresholds, recording the clusters at each. Must pass arguments +for -p and -j. -# these need to be updated by you: -EXEC=./clusterJaccards -NET=karate.pairs -JACC=karate.jaccs -OUTDIR=clusters +OPTIONS: + -h Show this message only + -p Pairs file. + -j Jaccard similarity file for given pairs file. + -s Scripts directory, where jaccard calculation and cluster scripts are + -c Alternate name for clustering script; default is clusterJaccards + -o Output directory; defualts to \"clusters\" in cwd; created if absent" +} + +# Error codes +BAD_USAGE=600 +NO_PAIRS_FILE=601 +NO_JACCS_FILE=602 +UNABLE_TO_MAKE_OUTPUT_DIR=603 + +# Parse command line arguments +PAIRS_FILE= +JACCS_FILE= +SCRIPTS_DIR=$(pwd) +OUTPUT_DIR=$(pwd)/clusters +CLUST_SCRIPT="clusterJaccards" + +while (( "$#" )) +do + # For flags with values, shift is used in-line to move past the value. + case $1 in + -h|--help) + usage; + exit 0 + ;; + -p|--pairs-file) + PAIRS_FILE="$2" + shift + ;; + -j|--jacc-sim-file) + JACCS_FILE="$2" + shift + ;; + -s|--scripts-dir) + SCRIPTS_DIR=$2 + shift + ;; + -c|--cluster-script) + CLUST_SCRIPT=$2 + shift + ;; + -o|--output-dir) + OUTPUT_DIR=$2 + shift + ;; + esac + shift # decrement all arglist indices +done + +# Ensure we have the necessary files +if [[ -z "$PAIRS_FILE" ]]; then + echo "Pairs file is required to run. None given." 
+ exit $BAD_USAGE +fi + +if [[ -z "$JACCS_FILE" ]]; then + echo "Jaccard similarity file is required to run. None given." + exit $BAD_USAGE +fi + +# Inform the user of our progress. +echo "Using link community detection scripts from: ${SCRIPTS_DIR}" +echo "Writing to output directory: ${OUTPUT_DIR}" + +# Make output directory if it does not exist +if ! [[ -d $OUTPUT_DIR ]]; then + echo -n "Output directory does not exist. Attempting to create... " + mkdir $OUTPUT_DIR + if [[ $? != 0 ]]; then + echo "failed." + exit $UNABLE_TO_MAKE_OUTPUT_DIR; + fi +fi + +# Set up variables for running the clustering script. This is an example: # $EXEC network.pairs network.jaccs network.clusters network.cluster_stats threshold +EXEC="${SCRIPTS_DIR}/${CLUST_SCRIPT}" for thr in 0.9 0.8 0.7 0.6 0.5 0.4 0.3 0.2 0.1 do - echo $thr - $EXEC $NET $JACC $OUTDIR/network_$thr.cluster $OUTDIR/network_$thr.cluster_stats $thr + echo "Threshold: ${thr}" + $EXEC $PAIRS_FILE $JACCS_FILE \ + $OUTPUT_DIR/network_$thr.cluster \ + $OUTPUT_DIR/network_$thr.cluster_stats \ + $thr done From 2d353efbc2c0f5ca68c5e88278ad05cc06cddfdd Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Thu, 5 Feb 2015 16:51:11 -0500 Subject: [PATCH 2/9] added a simple makefile --- cpp/Makefile | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 cpp/Makefile diff --git a/cpp/Makefile b/cpp/Makefile new file mode 100644 index 0000000..cba6b28 --- /dev/null +++ b/cpp/Makefile @@ -0,0 +1,22 @@ +SHELL = /bin/sh +CC = g++ +FLAGS = -O5 +THRESH= 0.5 + +all: calc clust + +calc: calcAndWrite_Jaccards.cpp + $(CC) $(FLAGS) -o calcJaccards calcAndWrite_Jaccards.cpp + +clust: clusterJaccsFile.cpp + $(CC) $(FLAGS) -o clusterJaccards clusterJaccsFile.cpp + +karate: calc clust + ./calcJaccards karate.pairs karate.jaccs + ./clusterJaccards karate.pairs karate.jaccs \ + karate.clusters_$(THRESH) karate.cluster_stats_$(THRESH) $(THRESH) + +lesmis: calc clust + ./calcJaccards lesmis.pairs lesmis.jaccs + ./clusterJaccards 
lesmis.pairs lesmis.jaccs \ + lesmis.clusters_$(THRESH) lesmis.cluster_stats_$(THRESH) $(THRESH) From d3adee6892eae666e19764a178c956042b1a84c4 Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Thu, 5 Feb 2015 19:04:45 -0500 Subject: [PATCH 3/9] updated readme for new makefile and improved looping script --- cpp/README | 63 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/cpp/README b/cpp/README index 2049017..b5b5d05 100644 --- a/cpp/README +++ b/cpp/README @@ -4,59 +4,62 @@ This directory contains C++ code and helper files to perform link clustering for large networks (millions of edges). Since this code is optimized for size and speed, it is unable to store the full -dendrogram, nor compute the maximum partition density. To avoid memory issues, a two-step -process is used: a (possibly very very large) file containing all the link similarities -is created, then scanned over and all edge pairs beyond a given similarity threshold are -clustered (single linkage). You can estimate the maximum partition density by providing -many thresholds (the similarity file need only be computed once). A shell script is -provided to help perform this loop. +dendrogram, nor compute the maximum partition density. To avoid memory issues, a +two-step process is used: a (possibly very very large) file containing all the +link similarities is created, then scanned over, and all edge pairs beyond a +given similarity threshold are clustered (single linkage). You can estimate the +maximum partition density by providing many thresholds (the similarity file need +only be computed once). A shell script is provided to help perform this loop. -The given network must be connected and undirected. Node IDs must be sequentially ordered -integers starting from zero. The network must be stored as an 'edgelist' where each edge -is on its own line of the input file and consists of two integers (the edge's nodes) -separated by a space. 
This format (edgelist with sequential integer nodes) is referred to -as a '.pairs' file. +The given network must be connected and undirected. Node IDs must be +sequentially ordered integers starting from zero. The network must be stored as +an 'edgelist' where each edge is on its own line of the input file and consists +of two integers (the edge's nodes) separated by a space. This format (edgelist +with sequential integer nodes) is referred to as a '.pairs' file. -For convenience, the file "edgelist2pairs.py" can convert an arbitrary edgelist with any -node ids to the .pairs format. To see how to use it, run - $ python edgelist2pairs.py -h +For convenience, the file "edgelist2pairs.py" can convert an arbitrary edgelist +with any node ids to the .pairs format. To see how to use it, run + $ python edgelist2pairs.py -h -As mentioned, the code consists of two steps: calculate the similarities and then cluster -the edges using a given similarity threshold. +As mentioned, the code consists of two steps: calculate the similarities and +then cluster the edges using a given similarity threshold. You can run `make` to +build the binaries for both steps, or build them separately using the following +instructions. To perform the first step: $ g++ -O5 -o calcJaccards calcAndWrite_Jaccards.cpp $ ./calcJaccards net.pairs net.jaccs -This reads the provided net.pairs and create a (possibly large) net.jaccs file, +This reads the provided net.pairs and creates a (possibly large) net.jaccs file, containing all the link similarities. To record the clusters for a given THRESHOLD: $ g++ -O5 -o clusterJaccards clusterJaccsFile.cpp $ ./clusterJaccards net.pairs net.jaccs net.clusters net.mc_nc THRESHOLD -will scan the net.jaccs file, record all the clusters at THRESHOLD in net.clusters, and -the sizes of each cluster (number of edges and number of induced nodes) to net.mc_nc. The -latter is useful for quickly computing the partition density. 
+This will scan the net.jaccs file, record all the clusters at THRESHOLD in +net.clusters, and the sizes of each cluster (number of edges and number of +induced nodes) to net.mc_nc. The latter is useful for quickly computing the +partition density. partition_density.py can be used to compute the partition density for a cut, using the .mc_nc file. - Finally, two BASH scripts are provided for convenience: - link_clustering.sh - compiles and performs the full calculation (both steps), good for -single runs. Try it with the included example .pairs file. For help, run -'./link_clustering' (with no arguments) from a terminal. - - loop_thresholds.sh - if the link similarity (.jaccs) file has already been created, this -will loop over many thresholds, recording the clusters at each. This script will need -editing to change some variables, and possibly replace the list of thresholds to be -looped over. + link_clustering.sh + compiles and performs the full calculation (both steps), good for single + runs. Try it with the included example .pairs file. For help, run + './link_clustering' (with no arguments) from a terminal. + loop_thresholds.sh + If the link similarity (.jaccs) file has already been created, this will + loop over many thresholds, recording the clusters at each. The default + threshold list combs from 0.1 to 0.9 in increments of 0.1. This can be + changed in the file. Run with the -h flag for additional customization + options. Good luck! 
-- Jim Bagrow, bagrowjp [at] gmail [dot] com - From 6c7168f75dd693a1ae8dcd048ed1836dfdd6a80c Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Thu, 5 Feb 2015 19:07:07 -0500 Subject: [PATCH 4/9] changed readme to markdown for better viewing in github --- cpp/{README => README.md} | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) rename cpp/{README => README.md} (92%) diff --git a/cpp/README b/cpp/README.md similarity index 92% rename from cpp/README rename to cpp/README.md index b5b5d05..7d32500 100644 --- a/cpp/README +++ b/cpp/README.md @@ -17,7 +17,7 @@ an 'edgelist' where each edge is on its own line of the input file and consists of two integers (the edge's nodes) separated by a space. This format (edgelist with sequential integer nodes) is referred to as a '.pairs' file. -For convenience, the file "edgelist2pairs.py" can convert an arbitrary edgelist +For convenience, the file `edgelist2pairs.py` can convert an arbitrary edgelist with any node ids to the .pairs format. To see how to use it, run $ python edgelist2pairs.py -h @@ -43,17 +43,17 @@ net.clusters, and the sizes of each cluster (number of edges and number of induced nodes) to net.mc_nc. The latter is useful for quickly computing the partition density. -partition_density.py can be used to compute the partition density for a cut, using the -.mc_nc file. +`partition_density.py` can be used to compute the partition density for a cut, +using the .mc_nc file. Finally, two BASH scripts are provided for convenience: - link_clustering.sh + `link_clustering.sh` compiles and performs the full calculation (both steps), good for single runs. Try it with the included example .pairs file. For help, run './link_clustering' (with no arguments) from a terminal. - loop_thresholds.sh + `loop_thresholds.sh` If the link similarity (.jaccs) file has already been created, this will loop over many thresholds, recording the clusters at each. The default threshold list combs from 0.1 to 0.9 in increments of 0.1. 
This can be From 66e550f280998296c792ffdf92eeda4be48374f4 Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Thu, 5 Feb 2015 19:10:05 -0500 Subject: [PATCH 5/9] cleaned up readme markdown for better viewing --- cpp/README.md | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index 7d32500..e4ee716 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -1,4 +1,5 @@ Link Communities C++ +-------------------- This directory contains C++ code and helper files to perform link clustering for very large networks (millions of edges). @@ -28,6 +29,7 @@ build the binaries for both steps, or build them separately using the following instructions. To perform the first step: + $ g++ -O5 -o calcJaccards calcAndWrite_Jaccards.cpp $ ./calcJaccards net.pairs net.jaccs @@ -35,30 +37,31 @@ This reads the provided net.pairs and create a (possibly large) net.jaccs file, containing all the link similarities. To record the clusters for a given THRESHOLD: + $ g++ -O5 -o clusterJaccards clusterJaccsFile.cpp $ ./clusterJaccards net.pairs net.jaccs net.clusters net.mc_nc THRESHOLD This will scan the net.jaccs file, record all the clusters at THRESHOLD in net.clusters, and the sizes of each cluster (number of edges and number of induced nodes) to net.mc_nc. The latter is useful for quickly computing the -partition density. - -`partition_density.py` can be used to compute the partition density for a cut, -using the .mc_nc file. +partition density. `partition_density.py` can be used to compute the partition +density for a cut, using the .mc_nc file. Finally, two BASH scripts are provided for convenience: `link_clustering.sh` - compiles and performs the full calculation (both steps), good for single - runs. Try it with the included example .pairs file. For help, run - './link_clustering' (with no arguments) from a terminal. + + Compiles and performs the full calculation (both steps), good for single + runs. 
Try it with the included example .pairs file. For help, run + './link_clustering.sh' (with no arguments) from a terminal. `loop_thresholds.sh` - If the link similarity (.jaccs) file has already been created, this will - loop over many thresholds, recording the clusters at each. The default - threshold list combs from 0.1 to 0.9 in increments of 0.1. This can be - changed in the file. Run with the -h flag for additional customization - options. + + If the link similarity (.jaccs) file has already been created, this will + loop over many thresholds, recording the clusters at each. The default + threshold list combs from 0.1 to 0.9 in increments of 0.1. This can be + changed in the file. Run with the -h flag for additional customization + options. Good luck! From fce973bcb1157d22f1929267dcea63e46c6878ba Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Fri, 6 Feb 2015 09:02:32 -0500 Subject: [PATCH 6/9] made -j flag (jaccs file) optional --- cpp/loop_thresholds.sh | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/cpp/loop_thresholds.sh b/cpp/loop_thresholds.sh index bf79735..d5759bc 100755 --- a/cpp/loop_thresholds.sh +++ b/cpp/loop_thresholds.sh @@ -31,9 +31,9 @@ OPTIONS: -h Show this message only -p Pairs file. -j Jaccard similarity file for given pairs file. 
- -s Scripts directory, where jaccard calculation and cluster scripts are -c Alternate name for clustering script; default is clusterJaccards - -o Output directory; defualts to \"clusters\" in cwd; created if absent" + -o Output directory; defaults to \"clusters\" in cwd; created if absent + -s Scripts directory, where jaccard calculation and cluster scripts are" } # Error codes @@ -45,7 +45,7 @@ UNABLE_TO_MAKE_OUTPUT_DIR=603 # Parse command line arguments PAIRS_FILE= JACCS_FILE= -SCRIPTS_DIR=$(pwd) +SCRIPTS_DIR=$(dirname $0) OUTPUT_DIR=$(pwd)/clusters CLUST_SCRIPT="clusterJaccards" @@ -89,7 +89,34 @@ fi if [[ -z "$JACCS_FILE" ]]; then echo "Jaccard similarity file is required to run. None given." - exit $BAD_USAGE + echo "Attempting to calculate jaccard similarities." + CALC_JACCS_SCRIPT=$SCRIPTS_DIR/calcJaccards + if [[ ! -f "$CALC_JACCS_SCRIPT" ]]; then + echo "calcJaccards not found in ${SCRIPTS_DIR}" + echo -n "Attempting to compile from source..." + + CWD=$(pwd) + cd $SCRIPTS_DIR + make calc + cd $CWD + + if [[ ! -f "$CALC_JACCS_SCRIPT" ]] + then + echo " failed." + exit $NO_JACCS_FILE + . + else + echo " success." + fi + fi + echo "Pairs file: ${PAIRS_FILE}" + JACCS_FILE="${PAIRS_FILE%.*}.jaccs" + echo "Writing jaccard similarity file to: ${JACCS_FILE}." + $CALC_JACCS_SCRIPT $PAIRS_FILE $JACCS_FILE + if [[ ! "$?" -eq 0 ]]; then + echo "Jaccard similarity file failed to write. Exiting." + exit $NO_JACCS_FILE + fi fi # Inform the user of our progress. 
From 91cca288bc76e5957269a29246a147ab10d433c9 Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Fri, 6 Feb 2015 09:15:15 -0500 Subject: [PATCH 7/9] improved error handling --- cpp/loop_thresholds.sh | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/cpp/loop_thresholds.sh b/cpp/loop_thresholds.sh index d5759bc..216a4d3 100755 --- a/cpp/loop_thresholds.sh +++ b/cpp/loop_thresholds.sh @@ -82,11 +82,20 @@ do done # Ensure we have the necessary files -if [[ -z "$PAIRS_FILE" ]]; then +if [[ -z "$PAIRS_FILE" ]] +then echo "Pairs file is required to run. None given." exit $BAD_USAGE + . +else + if [[ ! -f "$PAIRS_FILE" ]]; then + echo "Pairs file not found: ${PAIRS_FILE}" + exit $NO_PAIRS_FILE + fi fi +# If the user didn't pass the jaccs file, we need to calculate it. +# Try a variety of different things before failing. if [[ -z "$JACCS_FILE" ]]; then echo "Jaccard similarity file is required to run. None given." echo "Attempting to calculate jaccard similarities." @@ -97,9 +106,10 @@ if [[ -z "$JACCS_FILE" ]]; then CWD=$(pwd) cd $SCRIPTS_DIR - make calc + make calc > /dev/null 2>&1 cd $CWD + # Make failed for some reason. if [[ ! -f "$CALC_JACCS_SCRIPT" ]] then echo " failed." @@ -107,8 +117,11 @@ if [[ -z "$JACCS_FILE" ]]; then . else echo " success." + . fi fi + + # We should now have the script, so let's try to calculate jaccs file. echo "Pairs file: ${PAIRS_FILE}" JACCS_FILE="${PAIRS_FILE%.*}.jaccs" echo "Writing jaccard similarity file to: ${JACCS_FILE}." @@ -119,6 +132,12 @@ if [[ -z "$JACCS_FILE" ]]; then fi fi +# Now what if the user passed the file, but it doesn't exist? +if [[ ! -f "$JACCS_FILE" ]]; then + echo "Jaccard similarity file not found: ${JACCS_FILE}" + exit $NO_JACCS_FILE +fi + # Inform the user of our progress. 
echo "Using link community detection scripts from: ${SCRIPTS_DIR}" echo "Writing to output directory: ${OUTPUT_DIR}" From 4c540bb534fe7980ba233f38e492fe5c65d2a15a Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Fri, 6 Feb 2015 10:34:46 -0500 Subject: [PATCH 8/9] clean up dir creation output --- cpp/loop_thresholds.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/loop_thresholds.sh b/cpp/loop_thresholds.sh index 216a4d3..d2a3893 100755 --- a/cpp/loop_thresholds.sh +++ b/cpp/loop_thresholds.sh @@ -144,11 +144,15 @@ echo "Writing to output directory: ${OUTPUT_DIR}" # Make output directory if it does not exist if ! [[ -d $OUTPUT_DIR ]]; then - echo -n "Output directory does not exist. Attempting to create... " + echo -n "Output directory does not exist. Attempting to create..." mkdir $OUTPUT_DIR - if [[ $? != 0 ]]; then - echo "failed." + if [[ $? != 0 ]] + then + echo " failed." exit $UNABLE_TO_MAKE_OUTPUT_DIR; + . + else + echo " success." fi fi From da5009421fe68d2fed96d8fc2ac115d931a1b170 Mon Sep 17 00:00:00 2001 From: Mack Sweeney Date: Wed, 11 Feb 2015 16:18:26 -0500 Subject: [PATCH 9/9] added a script to convert link comm files to lines of nodes --- cpp/links_to_nodes.py | 77 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 cpp/links_to_nodes.py diff --git a/cpp/links_to_nodes.py b/cpp/links_to_nodes.py new file mode 100644 index 0000000..6bb3de5 --- /dev/null +++ b/cpp/links_to_nodes.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +This is a script intended to translate clusters of links (edges) to clusters of +nodes. Specifically, it takes a file of clusters, one per line, where each +cluster (line) is a list of edge pairs separated by spaces. Each edge pair +consists of the two endpoint node IDs, separated by a comma. The output file +contains the same clusters, but broken down into nodes instead of edges. 
This is +done by extracting the node IDs from all edges for a cluster and putting them +into a set, then writing that set of unique node IDs to the line in the output +file to represent the cluster. + +""" +import os +import sys + + +def convert_to_nodes(link_clusters_file, outfile=''): + # If no output file is given; modify input filename to produce one. + if not outfile: + outfile = convert_path(link_clusters_file) + + with open(link_clusters_file) as f: + lines = (line.strip() for line in f) + + # The clusters consist of a string of links; split into a list. + link_clusters = (line.split() for line in lines) + + # Now split the links into nodes; we have a list of lists. + nested_nodes = (map(lambda link: link.split(','), link_cluster) + for link_cluster in link_clusters) + + # Reduce the nested lists to 1-D: elements are nodes. + node_lists = (reduce(lambda x,y: x+y, node_lists) + for node_lists in nested_nodes) + + # Remove duplicate nodes from clusters, and we're done. + node_clusters = (set(node_list) for node_list in node_lists) + + # Finally, write the new clusters to the output file. + with open(outfile, 'w') as out: + for cluster in node_clusters: + out.write('%s\n' % ' '.join(cluster)) + + +def convert_path(path): + """Convert input file path to a suitable output file name.""" + if '-link-' in path: # case for standard output naming + return path.replace('-link-', '-node-') + else: + base, ext = os.path.splitext(path) + return '%s-by-node%s' % (base, ext) + + +if __name__ == "__main__": + usage = "%s <link_clusters_file> [<outfile>]" % sys.argv[0] + if len(sys.argv) < 2: + print usage + sys.exit(1) + elif len(sys.argv) > 2: + arg = sys.argv[2] + if arg.startswith('-'): + if arg == '-h' or arg == '--help': + print usage + sys.exit(0) + else: + print 'unknown flag: %s' % arg + sys.exit(3) + + outfile = sys.argv[2] if len(sys.argv) > 2 else '' + + try: + convert_to_nodes(sys.argv[1], outfile) + except IOError: + print "Unable to open input file: %s" % sys.argv[1] + sys.exit(2)