bagrow · macks22 · Feb 5, 2015 · Feb 5, 2015 · Feb 6, 2015 · Feb 6, 2015
diff --git a/cpp/Makefile b/cpp/Makefile
@@ -0,0 +1,22 @@
+SHELL = /bin/sh
+CC    = g++
+FLAGS = -O5
+THRESH= 0.5
+.PHONY: all calc clust karate lesmis  # command names; none of them is a file
+all: calc clust
+
+calc: calcAndWrite_Jaccards.cpp
+	$(CC) $(FLAGS) -o calcJaccards calcAndWrite_Jaccards.cpp
+
+clust: clusterJaccsFile.cpp
+	$(CC) $(FLAGS) -o clusterJaccards clusterJaccsFile.cpp
+
+karate: calc clust
+	./calcJaccards karate.pairs karate.jaccs
+	./clusterJaccards karate.pairs karate.jaccs \
+		karate.clusters_$(THRESH) karate.cluster_stats_$(THRESH) $(THRESH)
+
+lesmis: calc clust
+	./calcJaccards lesmis.pairs lesmis.jaccs
+	./clusterJaccards lesmis.pairs lesmis.jaccs \
+		lesmis.clusters_$(THRESH) lesmis.cluster_stats_$(THRESH) $(THRESH)
diff --git a/cpp/README b/cpp/README
diff --git a/cpp/README.md b/cpp/README.md
@@ -0,0 +1,68 @@
+Link Communities C++
+--------------------
+
+This directory contains C++ code and helper files to perform link clustering for very
+large networks (millions of edges).
+
+Since this code is optimized for size and speed, it is unable to store the full
+dendrogram, nor compute the maximum partition density. To avoid memory issues, a
+two-step process is used: a (possibly very very large) file containing all the
+link similarities is created, then scanned over, and all edge pairs beyond a
+given similarity threshold are clustered (single linkage). You can estimate the
+maximum partition density by providing many thresholds (the similarity file need
+only be computed once). A shell script is provided to help perform this loop.
+
+The given network must be connected and undirected. Node IDs must be
+sequentially ordered integers starting from zero. The network must be stored as
+an 'edgelist' where each edge is on its own line of the input file and consists
+of two integers (the edge's nodes) separated by a space. This format (edgelist
+with sequential integer nodes) is referred to as a '.pairs' file.
+
+For convenience, the file `edgelist2pairs.py` can convert an arbitrary edgelist
+with any node ids to the .pairs format. To see how to use it, run
+
+    $ python edgelist2pairs.py -h
+
+As mentioned, the code consists of two steps: calculate the similarities and
+then cluster the edges using a given similarity threshold. You can run `make` to
+build the binaries for both steps, or build them separately using the following
+instructions.
+
+To perform the first step:
+
+	$ g++ -O5 -o calcJaccards calcAndWrite_Jaccards.cpp
+	$ ./calcJaccards net.pairs net.jaccs
+
+This reads the provided net.pairs and creates a (possibly large) net.jaccs file,
+containing all the link similarities.
+
+To record the clusters for a given THRESHOLD:
+
+	$ g++ -O5 -o clusterJaccards clusterJaccsFile.cpp
+	$ ./clusterJaccards net.pairs net.jaccs net.clusters net.mc_nc THRESHOLD
+
+This will scan the net.jaccs file, record all the clusters at THRESHOLD in
+net.clusters, and the sizes of each cluster (number of edges and number of
+induced nodes) to net.mc_nc. The latter is useful for quickly computing the
+partition density. `partition_density.py` can be used to compute the partition
+density for a cut, using the .mc_nc file.
+
+Finally, two BASH scripts are provided for convenience:
+
+ `link_clustering.sh`
+
+  Compiles and performs the full calculation (both steps), good for single
+  runs. Try it with the included example .pairs file. For help, run
+  './link_clustering.sh' (with no arguments) from a terminal.
+
+ `loop_thresholds.sh`
+
+  If the link similarity (.jaccs) file has already been created, this will
+  loop over many thresholds, recording the clusters at each. The default
+  threshold list combs from 0.1 to 0.9 in increments of 0.1. This can be
+  changed in the file. Run with the -h flag for additional customization
+  options.
+
+Good luck!
+
+-- Jim Bagrow, bagrowjp [at] gmail [dot] com
diff --git a/cpp/links_to_nodes.py b/cpp/links_to_nodes.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+"""
+This is a script intended to translate clusters of links (edges) to clusters of
+nodes. Specifically, it takes a file of clusters, one per line, where each
+cluster (line) is a list of edge pairs separated by spaces. Each edge pair
+consists of the two endpoint node IDs, separated by a comma. The output file
+contains the same clusters, but broken down into nodes instead of edges. This is
+done by extracting the node IDs from all edges for a cluster and putting them
+into a set, then writing that set of unique node IDs to the line in the output
+file to represent the cluster.
+
+"""
+import os
+import sys
+
+
+def convert_to_nodes(link_clusters_file, outfile=''):
+    """Rewrite a file of link clusters as a file of node clusters.
+
+    link_clusters_file -- path to the input; each line is one cluster given
+        as space-separated links, each link being comma-separated node IDs.
+    outfile -- path to write to; when empty, one is derived from the input
+        path with convert_path().
+
+    Each output line holds the unique node IDs of the matching input line,
+    separated by spaces (in arbitrary set order). A blank input line yields a
+    blank output line instead of raising, so line numbers stay aligned.
+    Errors from open() are not caught here and propagate to the caller.
+    """
+    if not outfile:
+        outfile = convert_path(link_clusters_file)
+
+    # The input is opened first, so a missing input never creates an output
+    # file. Clusters are streamed through one line at a time.
+    with open(link_clusters_file) as f:
+        with open(outfile, 'w') as out:
+            for line in f:
+                # split() with no argument also discards the trailing newline.
+                nodes = set(node
+                            for link in line.split()
+                            for node in link.split(','))
+                out.write('%s\n' % ' '.join(nodes))
+
+
+def convert_path(path):
+    """Derive an output file name from the input path; always returns a str."""
+    if '-link-' in path:  # case for standard output naming
+        return path.replace('-link-', '-node-')
+    else:  # otherwise tag the base name and keep the extension
+        base, ext = os.path.splitext(path)
+        return '%s-by-node%s' % (base, ext)
+
+
+if __name__ == "__main__":
+    usage = "%s <link-cluster-file> [<output-file>]" % sys.argv[0]
+    args = sys.argv[1:]
+    # Scan every argument for flags so that "-h" also works in first position
+    # instead of being mistaken for the input file name.
+    for arg in args:
+        if arg in ('-h', '--help'):
+            print(usage)  # one-argument print() behaves alike on Python 2 and 3
+            sys.exit(0)
+        elif arg.startswith('-'):
+            print('unknown flag: %s' % arg)
+            sys.exit(3)
+    if not args:
+        print(usage)
+        sys.exit(1)
+
+    outfile = args[1] if len(args) > 1 else ''
+    try:
+        convert_to_nodes(args[0], outfile)
+    except IOError as err:
+        print("Unable to open file: %s" % (err.filename or args[0]))
+        sys.exit(2)
diff --git a/cpp/loop_thresholds.sh b/cpp/loop_thresholds.sh
@@ -21,17 +21,150 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
+usage() {
+    echo 1>&2 "USAGE: $0 [options]
 
+Loop over many thresholds, recording the clusters at each. -p is required; if
+-j is omitted, the similarity file is computed from the pairs file first.
 
-# these need to be updated by you:
-EXEC=./clusterJaccards
-NET=karate.pairs
-JACC=karate.jaccs
-OUTDIR=clusters
+OPTIONS:
+   -h    Show this message only
+   -p    Pairs file.
+   -j    Jaccard similarity file for given pairs file.
+   -c    Alternate name for clustering script; default is clusterJaccards
+   -o    Output directory; defaults to \"clusters\" in cwd; created if absent
+   -s    Scripts directory, where jaccard calculation and cluster scripts are"
+}
+
+# Error codes. Exit statuses are 8-bit, so the former 600-603 surfaced as 88-91.
+BAD_USAGE=88
+NO_PAIRS_FILE=89
+NO_JACCS_FILE=90
+UNABLE_TO_MAKE_OUTPUT_DIR=91
+
+# Parse command line arguments
+PAIRS_FILE=
+JACCS_FILE=
+SCRIPTS_DIR=$(dirname "$0")  # quoted so a script path with spaces survives
+OUTPUT_DIR=$(pwd)/clusters
+CLUST_SCRIPT="clusterJaccards"
+
+while (( "$#" )); do
+    case $1 in  # flags that take a value shift in-line to move past it
+        -h|--help)
+            usage; exit 0
+            ;;
+        -p|--pairs-file)
+            PAIRS_FILE="$2"
+            shift
+            ;;
+        -j|--jacc-sim-file)
+            JACCS_FILE="$2"
+            shift
+            ;;
+        -s|--scripts-dir)
+            SCRIPTS_DIR="$2"
+            shift
+            ;;
+        -c|--cluster-script)
+            CLUST_SCRIPT="$2"
+            shift
+            ;;
+        -o|--output-dir)
+            OUTPUT_DIR="$2"
+            shift
+            ;;
+        *)  # anything else is still skipped, but no longer silently
+            echo 1>&2 "Ignoring unrecognised argument: $1"
+            ;;
+    esac
+    shift # decrement all arglist indices
+done
+
+# Ensure we have the necessary files
+if [[ -z "$PAIRS_FILE" ]]
+then
+    # No -p value at all: a usage error rather than a missing file.
+    echo "Pairs file is required to run. None given."
+    exit $BAD_USAGE
+else
+    if [[ ! -f "$PAIRS_FILE" ]]; then
+        echo "Pairs file not found: ${PAIRS_FILE}"
+        exit $NO_PAIRS_FILE
+    fi
+fi
+
+# If the user didn't pass the jaccs file, we need to calculate it.
+# Try a variety of different things before failing.
+if [[ -z "$JACCS_FILE" ]]; then
+    echo "Jaccard similarity file is required to run. None given."
+    echo "Attempting to calculate jaccard similarities."
+    CALC_JACCS_SCRIPT=$SCRIPTS_DIR/calcJaccards
+    if [[ ! -f "$CALC_JACCS_SCRIPT" ]]; then
+        echo "calcJaccards not found in ${SCRIPTS_DIR}"
+        echo -n "Attempting to compile from source..."
+
+        # Build in a subshell so this script's working directory is never
+        # changed, and so make is skipped if the cd itself fails. The
+        # directory is quoted so a path containing spaces stays one argument.
+        (cd "$SCRIPTS_DIR" && make calc) > /dev/null 2>&1
+
+        # Make failed for some reason.
+        if [[ ! -f "$CALC_JACCS_SCRIPT" ]]
+        then
+            echo " failed."
+            exit $NO_JACCS_FILE
+        fi
+        # Only reached when the binary exists after the build.
+        echo " success."
+    fi
+
+    # We should now have the script, so let's try to calculate jaccs file.
+    echo "Pairs file: ${PAIRS_FILE}"
+    # Drop everything from the last "." onward, then append ".jaccs".
+    JACCS_FILE="${PAIRS_FILE%.*}.jaccs"
+    echo "Writing jaccard similarity file to: ${JACCS_FILE}."
+    # Arguments are quoted so paths containing spaces stay intact; the
+    # command's own exit status decides success.
+    if ! "$CALC_JACCS_SCRIPT" "$PAIRS_FILE" "$JACCS_FILE"; then
+        echo "Jaccard similarity file failed to write. Exiting."
+        exit $NO_JACCS_FILE
+    fi
+fi
+
+# Now what if the user passed the file, but it doesn't exist?
+if [[ ! -f "$JACCS_FILE" ]]; then
+    echo "Jaccard similarity file not found: ${JACCS_FILE}"
+    exit $NO_JACCS_FILE
+fi
+
+# Inform the user of our progress.
+echo "Using link community detection scripts from: ${SCRIPTS_DIR}"
+echo "Writing to output directory: ${OUTPUT_DIR}"
+
+# Make output directory if it does not exist
+if ! [[ -d "$OUTPUT_DIR" ]]; then
+    echo -n "Output directory does not exist. Attempting to create..."
+    # -p also creates missing parent directories; the path is quoted so that
+    # names containing spaces stay one argument.
+    if ! mkdir -p "$OUTPUT_DIR"
+    then
+        echo " failed."
+        exit $UNABLE_TO_MAKE_OUTPUT_DIR;
+    else
+        echo " success."
+    fi
+fi
+
+# Set up variables for running the clustering script. This is an example:
 # $EXEC network.pairs network.jaccs network.clusters network.cluster_stats threshold
+EXEC="${SCRIPTS_DIR}/${CLUST_SCRIPT}"
 
 for thr in 0.9 0.8 0.7 0.6 0.5 0.4 0.3 0.2 0.1
 do
-    echo $thr
-    $EXEC $NET $JACC $OUTDIR/network_$thr.cluster $OUTDIR/network_$thr.cluster_stats $thr
+    echo "Threshold: ${thr}"
+    "$EXEC" "$PAIRS_FILE" "$JACCS_FILE" \
+        "$OUTPUT_DIR/network_$thr.cluster" \
+        "$OUTPUT_DIR/network_$thr.cluster_stats" \
+        "$thr"
 done