diff --git a/include/bvgraph.h b/include/bvgraph.h index c8b6a30..a707dd2 100644 --- a/include/bvgraph.h +++ b/include/bvgraph.h @@ -31,6 +31,7 @@ #include "bitfile.h" +#include "eflist.h" //#define MAX_DEBUG @@ -89,6 +90,7 @@ struct bvgraph_tag { int window_size; int min_interval_length; int zeta_k; + double bits_per_link; enum bvgraph_compression_flag_tag outdegree_coding; enum bvgraph_compression_flag_tag block_coding; @@ -106,6 +108,8 @@ struct bvgraph_tag { unsigned long long* offsets; int offsets_external; + elias_fano_list ef; + int use_ef; //< 0: not use; 1: use only EF code; }; /** @@ -235,6 +239,7 @@ typedef struct bvgraph_random_iterator_tag bvgraph_random_iterator; typedef struct bvgraph_int_vector_tag bvgraph_int_vector; typedef struct bvgraph_parallel_iterators_tag bvgraph_parallel_iterators; + // define all the error codes extern const int bvgraph_call_out_of_memory; extern const int bvgraph_call_io_error; @@ -292,7 +297,7 @@ int bvgraph_parallel_iterators_free(bvgraph_parallel_iterators *pits); int bvgraph_required_memory(bvgraph *g, - int offset_step, size_t *gbuf, size_t *offsetbuf); + int offset_step, size_t *gbuf, size_t *offset_ef_buf); int merge_int_arrays(const int64_t* a1, size_t a1len, const int64_t* a2, size_t a2len, int64_t *out, size_t outlen); @@ -303,6 +308,13 @@ int add_to_cache(int node, int *links, int d); int loaded_sort(struct successor *a, struct successor *b); struct successor *find_in_cache(int node); +int load_offset_from_file(bvgraph *g); +int load_offset_online(bvgraph *g); + +int load_efcode_from_file(bvgraph *g); +int load_efcode_online(bvgraph *g); +int build_efcode(bvgraph *g); + #ifdef __cplusplus } #endif diff --git a/src/bvgraph.c b/src/bvgraph.c index 3a96f02..c4ecab7 100644 --- a/src/bvgraph.c +++ b/src/bvgraph.c @@ -19,6 +19,7 @@ #include "bvgraph_internal.h" #include "bvgraph_inline_io.h" +#include "eflist.h" /** * Define all the error codes @@ -80,8 +81,15 @@ void bvgraph_free(bvgraph *g) * @param[in] filename 
the base filename for a set of bvgraph files, * so filename.graph and filename.properties must exist. * @param[in] offset_step controls how many offsets are loaded, - * if offset_step = -1, then the graph file isn't loaded into memory - * if offset_step = 0, then the graph file is loaded, but no offsets + * if offset_step = -1, then the graph file isn't loaded into memory and no offsets are loaded; + * if offset_step = 0, then the graph file is loaded, but no offsets are loaded; + * if offset_step = 1, then the graph file is loaded, as well as the offsets file; + * if offset_step = 2, then the graph file is loaded, and the offsets file is loaded as an Elias-Fano list; + * if offset_step > 2, then the graph file is loaded, and the offset_step value is the amount of memory + * the user wishes to use for the offsets (NOTE(review): unit is ambiguous, bytes vs. MB -- confirm). If offset_step is smaller than the memory needed for the offsets, then the offsets file + * is loaded as an Elias-Fano list instead; + * if offset_step < -1, then the graph file isn't loaded into memory and the + * offsets file is loaded as an Elias-Fano list; * no other values are supported at the moment. * @return 0 if successful; * bvgraph_load_error_filename_too_long - indicates the filename was too long @@ -133,8 +141,8 @@ int bvgraph_load_external(bvgraph *g, unsigned long long* offsets, int offsetssize) { int rval = 0; - - assert(offset_step == 0 || offset_step == -1 || offset_step == 1); + // the offset_step can be ANY value now!
+ //assert(offset_step == 0 || offset_step == -1 || offset_step == 1); if (filenamelen > BVGRAPH_MAX_FILENAME_SIZE-1) { return bvgraph_load_error_filename_too_long; @@ -147,7 +155,6 @@ int bvgraph_load_external(bvgraph *g, g->filenamelen = filenamelen; g->offset_step = offset_step; - set_defaults(g); rval = parse_properties(g); @@ -157,99 +164,112 @@ int bvgraph_load_external(bvgraph *g, // continue processing if (offset_step >= 0) { - if (offset_step == 0 || offset_step == 1) { //modified 082911 - // in this case, we ust load the graph - // file into memory - - // first get the filesize - unsigned long long graphfilesize; - char *gfilename = strappend(g->filename, g->filenamelen, ".graph", 6); - rval = fsize(gfilename, &graphfilesize); + //modified 082911 + // in this case, we must load the graph + // file into memory + + // first get the filesize + unsigned long long graphfilesize; + char *gfilename = strappend(g->filename, g->filenamelen, ".graph", 6); + rval = fsize(gfilename, &graphfilesize); + free(gfilename); + if (rval) { + return bvgraph_call_io_error; + } + + if (gmemory != NULL) { + // they want to use their own memory, make sure + // they allocated enough! 
+ if (gmemsize < graphfilesize) { + return bvgraph_load_error_buffer_too_small; + } + g->memory = gmemory; + g->memory_size = gmemsize; + g->memory_external = 1; + } else { + // we have to allocate the memory ourselves + g->memory_size = (size_t)graphfilesize; + g->memory = malloc(sizeof(unsigned char)*g->memory_size); + if (!g->memory) { + return bvgraph_call_out_of_memory; + } + g->memory_external = 0; + } + + // now read the file + gfilename = strappend(g->filename, g->filenamelen, ".graph", 6); + { + size_t bytesread = 0; + FILE *gfile = fopen(gfilename, "rb"); free(gfilename); - if (rval) { + if (!gfile) { return bvgraph_call_io_error; } + bytesread = fread(g->memory, 1, g->memory_size, gfile); + if (bytesread != graphfilesize) { + return bvgraph_call_io_error; + } + fclose(gfile); + } + // we now have the graph in memory! + g->use_ef = 0; - if (gmemory != NULL) { - // they want to use their own memory, make sure - // they allocated enough! - if (gmemsize < graphfilesize) { - return bvgraph_load_error_buffer_too_small; - } - g->memory = gmemory; - g->memory_size = gmemsize; - g->memory_external = 1; + if (offset_step == 1) { //modified 082911 + if (offsets != NULL) { + g->offsets = offsets; + g->offsets_external = 1; } else { // we have to allocate the memory ourselves - g->memory_size = (size_t)graphfilesize; - g->memory = malloc(sizeof(unsigned char)*g->memory_size); - if (!g->memory) { - return bvgraph_call_out_of_memory; - } - g->memory_external = 0; + g->offsets = (unsigned long long*) malloc(sizeof(unsigned long long)*g->n); + g->offsets_external = 0; } - // now read the file - gfilename = strappend(g->filename, g->filenamelen, ".graph", 6); - { - size_t bytesread = 0; - FILE *gfile = fopen(gfilename, "rb"); - free(gfilename); - if (!gfile) { - return bvgraph_call_io_error; - } - bytesread = fread(g->memory, 1, g->memory_size, gfile); - if (bytesread != graphfilesize) { - return bvgraph_call_io_error; - } - fclose(gfile); + rval = 
load_offset_from_file(g); + if (rval) { + load_offset_online(g); + } + } + else if (offset_step == 2) { + // for test purpose, only generate efcode for offset + g->use_ef = 1; + g->offsets_external = 1; + build_efcode(g); + } + else if (offset_step > 2){ + // offset_step > 2, the size of memory that a user is willing to allocate + int64_t offset_in_mem = g->n * 8; // amount of memory in bytes + if (offset_in_mem > offset_step) { + // use EF code to ensure minimum memory + g->use_ef = 1; + g->offsets_external = 1; + printf("The memory required for offsets is larger than %d MB.\nLoading with EF code instead.\n", offset_step); + build_efcode(g); } - // we now have the graph in memory! - - if (offset_step == 1) { //modified 082911 - if (offsets != NULL) { - g->offsets = offsets; - g->offsets_external = 1; - } else { - // we have to allocate the memory ourselves - g->offsets = (unsigned long long*) malloc(sizeof(unsigned long long)*g->n); - g->offsets_external = 0; - } - // now read the file - char *ofilename = strappend(g->filename, g->filenamelen, ".offsets", 8); - bitfile bf; - long long off = 0; - int64_t i; - g->offsets = (unsigned long long*)malloc(g->n*sizeof(unsigned long long)); - FILE *ofile = fopen(ofilename, "rb"); - if (ofile) { - rval = bitfile_open(ofile, &bf); - if (rval) { - return bvgraph_call_io_error; - } - for (i = 0; i < g->n; i++){ - off = read_offset(g, &bf) + off; - g->offsets[i] = off; - } - } else { - // need to build the offsets - bvgraph_iterator git; - int rval = bvgraph_nonzero_iterator(g, &git); - if (rval) { return rval; } - g->offsets[0] = 0; - for (; bvgraph_iterator_valid(&git); bvgraph_iterator_next(&git)) { - if (git.curr+1 < g->n) { - g->offsets[git.curr+1] = bitfile_tell(&git.bf); - } - } - bvgraph_iterator_free(&git); + else { + // load the offset + g->use_ef = 0; + rval = load_offset_from_file(g); + if (rval) { + load_offset_online(g); } + g->offsets_external = 0; } + } } - else + else // TODO: add semantics here for offset_step 
< 0 { - g->memory_size = 0; + if (offset_step == -1) { // graph on disk and no offset + g->memory_size = 0; + } + else { + // leave the graph on disk + // use EF code for the offsets + g->memory_size = 0; + g->use_ef = 1; + g->offsets_external = 1; + build_efcode(g); + } } // presently the othercases are not supported, so we don't have to @@ -274,6 +294,9 @@ int bvgraph_close(bvgraph* g) if (!g->memory_external) { free(g->memory); } if (!g->offsets_external) { free(g->offsets); } memset(g, 0, sizeof(bvgraph)); + if (g->use_ef) { + eflist_free(&(g->ef)); + } return (0); } @@ -289,16 +312,19 @@ int bvgraph_close(bvgraph* g) * @param[in] g the graph * @param[in] offset_step the new offset_step value * @param[out] gbuf the size of the graph buffer - * @param[out] offsetbuf the size of the offset buffer + * @param[out] offset_ef_buf the size of the offset/ef buffer, this depends on the offset_step value * @return 0 on success */ -int bvgraph_required_memory(bvgraph *g, int offset_step, size_t *gbuf, size_t *offsetbuf) +int bvgraph_required_memory(bvgraph *g, int offset_step, size_t *gbuf, size_t *offset_ef_buf) { - if (offset_step < 0) { + if (offset_step <= -1) { if (gbuf) { *gbuf = 0; } - if (offsetbuf) { *offsetbuf = 0; } + if (offset_ef_buf) { *offset_ef_buf = 0; } + if (offset_step < -1) { + *offset_ef_buf = eflist_size(g->n, (uint64_t)g->m * g->bits_per_link, 0); + } } - else if (offset_step < 2) { + else { unsigned long long graphfilesize; char *gfilename = strappend(g->filename, g->filenamelen, ".graph", 6); int rval = fsize(gfilename, &graphfilesize); @@ -310,15 +336,27 @@ int bvgraph_required_memory(bvgraph *g, int offset_step, size_t *gbuf, size_t *o if (gbuf) { *gbuf = (size_t)graphfilesize; } // always set the offsetbuf here even if we are about to change // it. 
- if (offsetbuf) { *offsetbuf = 0; } + if (offset_ef_buf) { *offset_ef_buf = 0; } if (offset_step == 1) { - if (offsetbuf) { *offsetbuf = sizeof(unsigned long long)*g->n; } + if (offset_ef_buf) { *offset_ef_buf = sizeof(unsigned long long)*g->n; } + } + else if (offset_step == 2) { + *offset_ef_buf = eflist_size(g->n, (uint64_t)g->m * g->bits_per_link, 0); + } + else if (offset_step > 2) { + // check whether the user-allowed memory is enough for the offsets + if (offset_step * (1L << 20) >= sizeof(unsigned long long)*g->n) { + *offset_ef_buf = sizeof(unsigned long long)*g->n; + } + else { + *offset_ef_buf = eflist_size(g->n, (uint64_t)g->m * g->bits_per_link, 0); + } } } - else { +/* else { return bvgraph_call_unsupported; - } + }*/ return (0); } @@ -427,3 +465,148 @@ const char* bvgraph_error_string(int code) return "unknown error"; } } + + +/** + * This function loads the offset array from the .offsets file on disk. + * + * @param[in] g the graph + * @return 0 on success, 1 if the offsets file cannot be opened, bvgraph_call_io_error if bitfile_open fails + */ +int load_offset_from_file(bvgraph *g) +{ + char *ofilename = strappend(g->filename, g->filenamelen, ".offsets", 8); + bitfile bf; + long long off = 0; + int64_t i; + g->offsets = (unsigned long long*)malloc(g->n*sizeof(unsigned long long)); + FILE *ofile = fopen(ofilename, "rb"); + if (ofile) { + int rval = bitfile_open(ofile, &bf); + if (rval) { + return bvgraph_call_io_error; + } + for (i = 0; i < g->n; i++){ + off = read_offset(g, &bf) + off; + g->offsets[i] = off; + } + bitfile_close(&bf); + fclose(ofile); + return 0; // success + } + + return 1; // failure +} + +/** + * This function creates the offset array from the graph in memory.
+ * + * @param[in] g the graph + * @return 0 on success + */ +int load_offset_online(bvgraph *g) +{ + // need to build the offsets + bvgraph_iterator git; + int rval = bvgraph_nonzero_iterator(g, &git); + if (rval) { return rval; } + g->offsets[0] = 0; + for (; bvgraph_iterator_valid(&git); bvgraph_iterator_next(&git)) { + if (git.curr+1 < g->n) { + g->offsets[git.curr+1] = bitfile_tell(&git.bf); + } + } + bvgraph_iterator_free(&git); + return 0; +} + +/** + * This function computes the EF code from the offset file on disk. + * + * @param[in] g the graph + * @return 0 on success + */ +int load_efcode_from_file(bvgraph *g) +{ + elias_fano_list *ef = &(g->ef); + uint64_t n = g -> n; + bitfile bf; + long long off = 0; + char *ofilename = strappend(g->filename, g->filenamelen, ".offsets", 8); + FILE *ofile = fopen(ofilename, "rb"); + int64_t i; //last_elm, + int rval; + if (ofile) { //if offsets file exists + rval = bitfile_open(ofile, &bf); + if (rval) { + return bvgraph_call_io_error; + } + // here we build the estimate of last element from the property file + // by g->bits_per_link * g->m, this value is larger than the last element in the array + uint64_t build_last = (uint64_t)(g->bits_per_link * g->m); + eflist_create(ef, n, build_last); + rval = bitfile_open(ofile, &bf); + if (rval) { + return bvgraph_call_io_error; + } + off = 0; + for (i = 0; i < g->n; i ++) { + off = read_offset(g, &bf) + off; + eflist_add(ef, off); + } + bitfile_close(&bf); + fclose(ofile); + return 0; // success + } + return 1; //failure +} + + +/** + * This function computes the EF code from the graph loaded in the memory. 
+ * + * @param[in] g the graph + * @return 0 on success + */ +int load_efcode_online(bvgraph *g) +{ + elias_fano_list *ef = &(g->ef); + uint64_t n = g -> n; + bvgraph_iterator git; + int r = bvgraph_nonzero_iterator(g, &git); + int64_t last_elm; + if (r) { return r; } + last_elm = 0; + // estimate the last element from the property file by + // g->bits_per_link * g->m + uint64_t build_last = (uint64_t)(g->bits_per_link * g->m); + eflist_create(ef, n, build_last); + r = bvgraph_nonzero_iterator(g, &git); + if (r) { return r; } + eflist_add(ef, 0); + for (; bvgraph_iterator_valid(&git); bvgraph_iterator_next(&git)) { + if (git.curr + 1 < g->n) { + last_elm = bitfile_tell(&git.bf); + eflist_add(ef, last_elm); + } + } + bvgraph_iterator_free(&git); + return 0; +} + +/** + * This function builds the Elias-Fano list of offsets for the given graph: it first tries the offsets file and, if that fails, builds the list from the graph itself. + * + * @param[in] g the graph + * @return always 0; a failure of the fallback build is not reported + */ + +int build_efcode(bvgraph *g) +{ + int rval = load_efcode_from_file(g); + if (rval) { + load_efcode_online(g); + } + return (0); +} + diff --git a/src/bvgraph_iterator.c b/src/bvgraph_iterator.c index abd4ea8..e7a0c31 100644 --- a/src/bvgraph_iterator.c +++ b/src/bvgraph_iterator.c @@ -70,7 +70,7 @@ int bvgraph_nonzero_iterator(bvgraph* g, bvgraph_iterator *i) i->curr_outd = -1; i->cyclic_buffer_size = i->g->window_size+1; - if (g->offset_step == -1) { + if (g->offset_step < 0) { char *graphfilename = strappend(g->filename, g->filenamelen, ".graph", 6); FILE *f = fopen(graphfilename, "rb"); free(graphfilename); @@ -78,10 +78,8 @@ int bvgraph_nonzero_iterator(bvgraph* g, bvgraph_iterator *i) rval = bitfile_open(f,&i->bf); if (rval) { return rval; } - } else if (g->offset_step == 0 || g->offset_step == 1) { + } else { // g->offset_step >= 0: the graph was loaded into memory rval = bitfile_map(g->memory, g->memory_size, &i->bf); - } else { - return bvgraph_call_unsupported; } // beyond this point, the bitfile was successfully allocated, so we must @@ -176,16 +174,26 @@ int
bvgraph_random_access_iterator(bvgraph* g, bvgraph_random_iterator *i) // for successors cache i->successors_cache = NULL; - if (g->offset_step < 1) { + if (g->offset_step == -1 || g->offset_step == 0) { // the offset is not avaiable for offset_step = -1 or 0 return bvgraph_call_unsupported; } - - rval = bitfile_map(g->memory, g->memory_size, &i->bf); - rval |= bitfile_map(g->memory, g->memory_size, &i->outd_bf); + if (g->offset_step >= 1) { + rval = bitfile_map(g->memory, g->memory_size, &i->bf); + rval |= bitfile_map(g->memory, g->memory_size, &i->outd_bf); + } + else if (g->offset_step < -1) { + char *ofilename = strappend(g->filename, g->filenamelen, ".graph", 6); + FILE *ofile1 = fopen(ofilename, "rb"); + FILE *ofile2 = fopen(ofilename, "rb"); + if (ofile1 && ofile2) { + rval = bitfile_open(ofile1, &i->bf); + rval |= bitfile_open(ofile2, &i->outd_bf); + } + } // TODO deallocate these on failure - i->offset_step = 1; + i->offset_step = g->offset_step; // 1 // beyond this point, the bitfile was successfully allocated, so we must // deallocate it if we exit. 
diff --git a/src/bvgraph_random.c b/src/bvgraph_random.c index 1ca8c59..210a93a 100644 --- a/src/bvgraph_random.c +++ b/src/bvgraph_random.c @@ -64,16 +64,22 @@ static int position_bvgraph(bvgraph_random_iterator *ri, int64_t x, uint64_t *d) return (bvgraph_vertex_out_of_range); } - if (ri->offset_step <= 0) { + if (ri->offset_step == 0 || ri->offset_step == -1) { return bvgraph_requires_offsets; - } else if (ri->offset_step == 1) { - int rval = bitfile_position(&ri->bf, ri->g->offsets[x]); + } else { + int64_t offset; + int rval; + if (!ri->g->use_ef) { + offset = ri->g->offsets[x]; + } + else { + offset = eflist_get(&(ri->g->ef), x); + } + rval = bitfile_position(&ri->bf, offset);//ri->g->offsets[x] if (rval == 0) { *d = read_outdegree(ri->g, &ri->bf); } return rval; - } else { - return bvgraph_call_unsupported; } } @@ -95,13 +101,20 @@ int bvgraph_random_outdegree(bvgraph_random_iterator *ri, } // TODO: always add outd_cache search - if (ri->offset_step <= 0) { + if (ri->offset_step == 0 || ri->offset_step == -1) { return (bvgraph_requires_offsets); - } else if (ri->offset_step == 1) { - bitfile_position(&ri->outd_bf, ri->g->offsets[i]); + } else { + int64_t offset; + if (!ri->g->use_ef) { + offset = ri->g->offsets[i]; + } + else { + offset = eflist_get(&(ri->g->ef), i); + } + bitfile_position(&ri->outd_bf, offset); //ri->g->offsets[i] *d = read_outdegree(ri->g, &ri->outd_bf); return (0); - } else { + } //else { // code for the case when offset_step > 1 // check if its in the outd cache // if (i >= ri->cache_start && i < ri->outd_cache_end) { @@ -113,8 +126,8 @@ int bvgraph_random_outdegree(bvgraph_random_iterator *ri, // *d = read_outdegree(ri->g, &ri->outd_bf); // return (0); // } - return (bvgraph_call_unsupported); - } + //return (bvgraph_call_unsupported); + //} } /** Access the successors of a vertex. 
@@ -136,7 +149,7 @@ int bvgraph_random_successors(bvgraph_random_iterator *ri, if (x<0 || x >= ri->g->n) { return (bvgraph_vertex_out_of_range); } - else if (ri->offset_step <= 0) { + else if (ri->offset_step == 0 || ri->offset_step == -1) { return (bvgraph_requires_offsets); } else { int64_t ref, ref_index; @@ -150,6 +163,7 @@ int bvgraph_random_successors(bvgraph_random_iterator *ri, uint64_t d; //degree ri->curr = x; int rval = position_bvgraph(ri, x, &d); + //printf("after position_graph()\n"); if (rval) { return (rval); } @@ -195,7 +209,6 @@ int bvgraph_random_successors(bvgraph_random_iterator *ri, // method to set the file pointer, this could modify ri itself. // this step requires offsets // - ri->outd_cache[x%ri->cyclic_buffer_size] = d; ri->curr_outd = d; diff --git a/src/properties.c b/src/properties.c index e026b6d..62004e4 100644 --- a/src/properties.c +++ b/src/properties.c @@ -497,6 +497,10 @@ int parse_properties(bvgraph* g) rval = bvgraph_unsupported_version; } } + else if (strncmp(key, "bitsperlink",key_len) == 0) + { + g->bits_per_link = atof(value); + } // release the memory free(key); free(value);