diff --git a/cpp/Makefile b/cpp/Makefile
new file mode 100644
index 0000000..cba6b28
--- /dev/null
+++ b/cpp/Makefile
@@ -0,0 +1,30 @@
+SHELL = /bin/sh
+CC = g++
+FLAGS = -O5
+# Similarity threshold for the example runs; override with e.g. `make karate THRESH=0.3`.
+THRESH= 0.5
+
+# None of the targets below is named after the file its recipe writes, so mark
+# them phony: a stray file called `calc` or `all` must not short-circuit them.
+.PHONY: all calc clust karate lesmis
+
+all: calc clust
+
+# Step 1: compile the similarity calculator into ./calcJaccards.
+calc: calcAndWrite_Jaccards.cpp
+	$(CC) $(FLAGS) -o calcJaccards calcAndWrite_Jaccards.cpp
+
+# Step 2: compile the clustering tool into ./clusterJaccards.
+clust: clusterJaccsFile.cpp
+	$(CC) $(FLAGS) -o clusterJaccards clusterJaccsFile.cpp
+
+# Example runs: write <net>.jaccs, then cluster it at $(THRESH).
+karate: calc clust
+	./calcJaccards karate.pairs karate.jaccs
+	./clusterJaccards karate.pairs karate.jaccs \
+		karate.clusters_$(THRESH) karate.cluster_stats_$(THRESH) $(THRESH)
+
+lesmis: calc clust
+	./calcJaccards lesmis.pairs lesmis.jaccs
+	./clusterJaccards lesmis.pairs lesmis.jaccs \
+		lesmis.clusters_$(THRESH) lesmis.cluster_stats_$(THRESH) $(THRESH)
diff --git a/cpp/README b/cpp/README
deleted file mode 100644
index 2049017..0000000
--- a/cpp/README
+++ /dev/null
@@ -1,62 +0,0 @@
-Link Communities C++
-
-This directory contains C++ code and helper files to perform link clustering for very
-large networks (millions of edges).
-
-Since this code is optimized for size and speed, it is unable to store the full
-dendrogram, nor compute the maximum partition density. To avoid memory issues, a two-step
-process is used: a (possibly very very large) file containing all the link similarities
-is created, then scanned over and all edge pairs beyond a given similarity threshold are
-clustered (single linkage). You can estimate the maximum partition density by providing
-many thresholds (the similarity file need only be computed once). A shell script is
-provided to help perform this loop.
-
-The given network must be connected and undirected. Node IDs must be sequentially ordered
-integers starting from zero. The network must be stored as an 'edgelist' where each edge
-is on its own line of the input file and consists of two integers (the edge's nodes)
-separated by a space. This format (edgelist with sequential integer nodes) is referred to
-as a '.pairs' file.
- -For convenience, the file "edgelist2pairs.py" can convert an arbitrary edgelist with any -node ids to the .pairs format. To see how to use it, run - $ python edgelist2pairs.py -h - - -As mentioned, the code consists of two steps: calculate the similarities and then cluster -the edges using a given similarity threshold. - -To perform the first step: - $ g++ -O5 -o calcJaccards calcAndWrite_Jaccards.cpp - $ ./calcJaccards net.pairs net.jaccs - -This reads the provided net.pairs and create a (possibly large) net.jaccs file, -containing all the link similarities. - -To record the clusters for a given THRESHOLD: - $ g++ -O5 -o clusterJaccards clusterJaccsFile.cpp - $ ./clusterJaccards net.pairs net.jaccs net.clusters net.mc_nc THRESHOLD - -will scan the net.jaccs file, record all the clusters at THRESHOLD in net.clusters, and -the sizes of each cluster (number of edges and number of induced nodes) to net.mc_nc. The -latter is useful for quickly computing the partition density. - -partition_density.py can be used to compute the partition density for a cut, using the -.mc_nc file. - - -Finally, two BASH scripts are provided for convenience: - - link_clustering.sh - compiles and performs the full calculation (both steps), good for -single runs. Try it with the included example .pairs file. For help, run -'./link_clustering' (with no arguments) from a terminal. - - loop_thresholds.sh - if the link similarity (.jaccs) file has already been created, this -will loop over many thresholds, recording the clusters at each. This script will need -editing to change some variables, and possibly replace the list of thresholds to be -looped over. - - -Good luck! 
- --- Jim Bagrow, bagrowjp [at] gmail [dot] com - diff --git a/cpp/README.md b/cpp/README.md new file mode 100644 index 0000000..e4ee716 --- /dev/null +++ b/cpp/README.md @@ -0,0 +1,68 @@ +Link Communities C++ +-------------------- + +This directory contains C++ code and helper files to perform link clustering for very +large networks (millions of edges). + +Since this code is optimized for size and speed, it is unable to store the full +dendrogram, nor compute the maximum partition density. To avoid memory issues, a +two-step process is used: a (possibly very very large) file containing all the +link similarities is created, then scanned over, and all edge pairs beyond a +given similarity threshold are clustered (single linkage). You can estimate the +maximum partition density by providing many thresholds (the similarity file need +only be computed once). A shell script is provided to help perform this loop. + +The given network must be connected and undirected. Node IDs must be +sequentially ordered integers starting from zero. The network must be stored as +an 'edgelist' where each edge is on its own line of the input file and consists +of two integers (the edge's nodes) separated by a space. This format (edgelist +with sequential integer nodes) is referred to as a '.pairs' file. + +For convenience, the file `edgelist2pairs.py` can convert an arbitrary edgelist +with any node ids to the .pairs format. To see how to use it, run + + $ python edgelist2pairs.py -h + +As mentioned, the code consists of two steps: calculate the similarities and +then cluster the edges using a given similarity threshold. You can run `make` to +build the binaries for both steps, or build them separately using the following +instructions. + +To perform the first step: + + $ g++ -O5 -o calcJaccards calcAndWrite_Jaccards.cpp + $ ./calcJaccards net.pairs net.jaccs + +This reads the provided net.pairs and creates a (possibly large) net.jaccs file, +containing all the link similarities. 
+ +To record the clusters for a given THRESHOLD: + + $ g++ -O5 -o clusterJaccards clusterJaccsFile.cpp + $ ./clusterJaccards net.pairs net.jaccs net.clusters net.mc_nc THRESHOLD + +This will scan the net.jaccs file, record all the clusters at THRESHOLD in +net.clusters, and the sizes of each cluster (number of edges and number of +induced nodes) to net.mc_nc. The latter is useful for quickly computing the +partition density. `partition_density.py` can be used to compute the partition +density for a cut, using the .mc_nc file. + +Finally, two BASH scripts are provided for convenience: + + `link_clustering.sh` + + Compiles and performs the full calculation (both steps), good for single + runs. Try it with the included example .pairs file. For help, run + './link_clustering.sh' (with no arguments) from a terminal. + + `loop_thresholds.sh` + + If the link similarity (.jaccs) file has already been created, this will + loop over many thresholds, recording the clusters at each. The default + threshold list runs from 0.9 down to 0.1 in steps of 0.1. This can be + changed in the file. Run with the -h flag for additional customization + options. + +Good luck! + +-- Jim Bagrow, bagrowjp [at] gmail [dot] com diff --git a/cpp/links_to_nodes.py b/cpp/links_to_nodes.py new file mode 100644 index 0000000..6bb3de5 --- /dev/null +++ b/cpp/links_to_nodes.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +This is a script intended to translate clusters of links (edges) to clusters of +nodes. Specifically, it takes a file of clusters, one per line, where each +cluster (line) is a list of edge pairs separated by spaces. Each edge pair +consists of the two endpoint node IDs, separated by a comma. The output file +contains the same clusters, but broken down into nodes instead of edges. 
This is
+done by extracting the node IDs from all edges for a cluster and putting them
+into a set, then writing that set of unique node IDs to the line in the output
+file to represent the cluster.
+
+"""
+import os
+import sys
+
+
+def convert_to_nodes(link_clusters_file, outfile=''):
+    # If no output file is given, derive one from the input filename.
+    if not outfile:
+        outfile = convert_path(link_clusters_file)
+
+    with open(link_clusters_file) as f:
+        lines = (line.strip() for line in f)
+
+        # The clusters consist of a string of links; split into a list.
+        link_clusters = (line.split() for line in lines)
+
+        # Now split the links into nodes; we have a list of lists.
+        nested_nodes = (map(lambda link: link.split(','), link_cluster)
+                        for link_cluster in link_clusters)
+
+        # Flatten to 1-D (elements are nodes); the [] seed keeps blank lines legal.
+        node_lists = (reduce(lambda x,y: x+y, node_lists, [])
+                      for node_lists in nested_nodes)
+
+        # Remove duplicate nodes from clusters, and we're done.
+        node_clusters = (set(node_list) for node_list in node_lists)
+
+        # The generators above are lazy, so write while the input is still open.
+        with open(outfile, 'w') as out:
+            for cluster in node_clusters:
+                out.write('%s\n' % ' '.join(cluster))
+
+
+def convert_path(path):
+    """Convert input file path to a suitable output file name."""
+    if '-link-' in path:  # case for standard output naming
+        return path.replace('-link-', '-node-')
+    else:
+        base, ext = os.path.splitext(path)
+        return '%s-by-node%s' % (base, ext)
+
+
+if __name__ == "__main__":
+    usage = "usage: %s LINK_CLUSTERS_FILE [OUTFILE]" % sys.argv[0]
+    if len(sys.argv) < 2:
+        print usage
+        sys.exit(1)
+    elif len(sys.argv) > 2:
+        arg = sys.argv[2]
+        if arg.startswith('-'):
+            if arg == '-h' or arg == '--help':
+                print usage
+                sys.exit(0)
+            else:
+                print 'unknown flag: %s' % arg
+                sys.exit(3)
+
+    outfile = sys.argv[2] if len(sys.argv) > 2 else ''
+
+    try:
+        convert_to_nodes(sys.argv[1], outfile)
+    except IOError:
+        print "Unable to open input file: %s" % sys.argv[1]
+        sys.exit(2)
diff --git a/cpp/loop_thresholds.sh b/cpp/loop_thresholds.sh
index 6268190..d2a3893 100755
--- a/cpp/loop_thresholds.sh
+++ b/cpp/loop_thresholds.sh
@@ -21,17 +21,150 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
+usage() {
+    echo 1>&2 "USAGE: $0 [options]
+Loop over many thresholds, recording the clusters at each. Must pass arguments
+for -p and -j.
 
-# these need to be updated by you:
-EXEC=./clusterJaccards
-NET=karate.pairs
-JACC=karate.jaccs
-OUTDIR=clusters
+OPTIONS:
+  -h  Show this message only
+  -p  Pairs file.
+  -j  Jaccard similarity file for given pairs file.
+  -c  Alternate name for clustering script; default is clusterJaccards
+  -o  Output directory; defaults to \"clusters\" in cwd; created if absent
+  -s  Scripts directory, where jaccard calculation and cluster scripts are"
+}
+
+# Exit statuses; kept within 0-255 because larger values wrap modulo 256.
+BAD_USAGE=64
+NO_PAIRS_FILE=65
+NO_JACCS_FILE=66
+UNABLE_TO_MAKE_OUTPUT_DIR=67
+
+# Parse command line arguments
+PAIRS_FILE=
+JACCS_FILE=
+SCRIPTS_DIR=$(dirname "$0")
+OUTPUT_DIR=$(pwd)/clusters
+CLUST_SCRIPT="clusterJaccards"
+
+while (( "$#" ))
+do
+    # For flags with values, shift is used in-line to move past the value.
+    case $1 in
+        -h|--help)
+            usage;
+            exit 0
+            ;;
+        -p|--pairs-file)
+            PAIRS_FILE="$2"
+            shift
+            ;;
+        -j|--jacc-sim-file)
+            JACCS_FILE="$2"
+            shift
+            ;;
+        -s|--scripts-dir)
+            SCRIPTS_DIR="$2"
+            shift
+            ;;
+        -c|--cluster-script)
+            CLUST_SCRIPT="$2"
+            shift
+            ;;
+        -o|--output-dir)
+            OUTPUT_DIR="$2"
+            shift
+            ;;
+    esac
+    shift # decrement all arglist indices
+done
+
+# Ensure we have the necessary files
+if [[ -z "$PAIRS_FILE" ]]
+then
+    echo "Pairs file is required to run. None given."
+    exit $BAD_USAGE
+
+else
+    if [[ ! -f "$PAIRS_FILE" ]]; then
+        echo "Pairs file not found: ${PAIRS_FILE}"
+        exit $NO_PAIRS_FILE
+    fi
+fi
+
+# If the user didn't pass the jaccs file, we need to calculate it.
+# Try a variety of different things before failing.
+if [[ -z "$JACCS_FILE" ]]; then
+    echo "Jaccard similarity file is required to run. None given."
+    echo "Attempting to calculate jaccard similarities."
+    CALC_JACCS_SCRIPT="$SCRIPTS_DIR/calcJaccards"
+    if [[ ! -f "$CALC_JACCS_SCRIPT" ]]; then
+        echo "calcJaccards not found in ${SCRIPTS_DIR}"
+        echo -n "Attempting to compile from source..."
+
+        CWD=$(pwd)
+        cd "$SCRIPTS_DIR"
+        make calc > /dev/null 2>&1
+        cd "$CWD"
+
+        # Success is judged by whether the binary now exists, not by make's status.
+        if [[ ! -f "$CALC_JACCS_SCRIPT" ]]
+        then
+            echo " failed."
+            exit $NO_JACCS_FILE
+
+        else
+            echo " success."
+
+        fi
+    fi
+
+    # We should now have the script, so let's try to calculate jaccs file.
+    echo "Pairs file: ${PAIRS_FILE}"
+    JACCS_FILE="${PAIRS_FILE%.*}.jaccs"
+    echo "Writing jaccard similarity file to: ${JACCS_FILE}."
+    "$CALC_JACCS_SCRIPT" "$PAIRS_FILE" "$JACCS_FILE"
+    if [[ ! "$?" -eq 0 ]]; then
+        echo "Jaccard similarity file failed to write. Exiting."
+        exit $NO_JACCS_FILE
+    fi
+fi
+
+# Now what if the user passed the file, but it doesn't exist?
+if [[ ! -f "$JACCS_FILE" ]]; then
+    echo "Jaccard similarity file not found: ${JACCS_FILE}"
+    exit $NO_JACCS_FILE
+fi
+
+# Inform the user of our progress.
+echo "Using link community detection scripts from: ${SCRIPTS_DIR}"
+echo "Writing to output directory: ${OUTPUT_DIR}"
+
+# Make output directory (and any missing parents) if it does not exist
+if ! [[ -d "$OUTPUT_DIR" ]]; then
+    echo -n "Output directory does not exist. Attempting to create..."
+    mkdir -p "$OUTPUT_DIR"
+    if [[ $? != 0 ]]
+    then
+        echo " failed."
+        exit $UNABLE_TO_MAKE_OUTPUT_DIR;
+
+    else
+        echo " success."
+    fi
+fi
+
+# Set up variables for running the clustering script. This is an example:
 # $EXEC network.pairs network.jaccs network.clusters network.cluster_stats threshold
+EXEC="${SCRIPTS_DIR}/${CLUST_SCRIPT}"
 
 for thr in 0.9 0.8 0.7 0.6 0.5 0.4 0.3 0.2 0.1
 do
-    echo $thr
-    $EXEC $NET $JACC $OUTDIR/network_$thr.cluster $OUTDIR/network_$thr.cluster_stats $thr
+    echo "Threshold: ${thr}"
+    "$EXEC" "$PAIRS_FILE" "$JACCS_FILE" \
+        "$OUTPUT_DIR/network_$thr.cluster" \
+        "$OUTPUT_DIR/network_$thr.cluster_stats" \
+        "$thr"
 done