From 802560bb43a98c1eefa7565d95dcc309655b7cbb Mon Sep 17 00:00:00 2001 From: Justin Wozniak Date: Mon, 13 May 2019 11:12:48 -0500 Subject: [PATCH] Better tracing for enumerate bugs (ref #174) --- lb/code/src/adlb.c | 21 ++++++++++------- lb/code/src/adlb_types.h | 3 ++- lb/code/src/data.c | 35 +++++++++++++++------------- lb/code/src/debug.h | 9 +++---- lb/code/src/handlers.c | 20 +++++++--------- lb/code/src/messaging.h | 8 ++++--- turbine/code/src/tcl/adlb/tcl-adlb.c | 12 ++++++---- 7 files changed, 60 insertions(+), 48 deletions(-) diff --git a/lb/code/src/adlb.c b/lb/code/src/adlb.c index c022a54ad..5e5582866 100644 --- a/lb/code/src/adlb.c +++ b/lb/code/src/adlb.c @@ -1717,6 +1717,9 @@ ADLBP_Enumerate(adlb_datum_id container_id, void** data, size_t* length, int* records, adlb_type_extra *kv_type) { + TRACE("id=%"PRId64" offset=%i count=%i", + container_id, offset, count); + MPI_Status status; MPI_Request request; @@ -1732,28 +1735,28 @@ ADLBP_Enumerate(adlb_datum_id container_id, opts.offset = offset; opts.decr = decr; - struct packed_enumerate_result res; - IRECV(&res, sizeof(res), MPI_BYTE, to_server_rank, ADLB_TAG_RESPONSE); + struct packed_enumerate_result result; + IRECV(&result, sizeof(result), MPI_BYTE, to_server_rank, ADLB_TAG_RESPONSE); SEND(&opts, sizeof(struct packed_enumerate), MPI_BYTE, to_server_rank, ADLB_TAG_ENUMERATE); WAIT(&request,&status); - if (res.dc == ADLB_DATA_SUCCESS) + if (result.dc == ADLB_DATA_SUCCESS) { - *records = res.records; - *length = res.length; + *records = result.records; + *length = result.length; if (include_keys || include_vals) { - *data = malloc(res.length); + *data = malloc(result.length); ADLB_CHECK_MALLOC(*data); - adlb_code ac = mpi_recv_big(*data, res.length, + adlb_code ac = mpi_recv_big(*data, result.length, to_server_rank, ADLB_TAG_RESPONSE); ADLB_CHECK(ac); } kv_type->valid = true; - kv_type->CONTAINER.key_type = res.key_type; - kv_type->CONTAINER.val_type = res.val_type; + kv_type->CONTAINER.key_type = result.key_type; + kv_type->CONTAINER.val_type = result.val_type; return ADLB_SUCCESS; } else diff --git a/lb/code/src/adlb_types.h b/lb/code/src/adlb_types.h index 702cb7533..facbdfda1 100644 --- a/lb/code/src/adlb_types.h +++ b/lb/code/src/adlb_types.h @@ -31,7 +31,8 @@ #include "table_bp.h" #include -#include +#include +#include #include #include #include diff --git a/lb/code/src/data.c b/lb/code/src/data.c index fee9ad25b..335870895 100644 --- a/lb/code/src/data.c +++ b/lb/code/src/data.c @@ -1432,14 +1432,15 @@ pack_member(adlb_container *cont, table_bp_entry *item, on the array size. */ static adlb_data_code -extract_members(adlb_container *cont, int count, int offset, +extract_members(adlb_container* container, + int count, int offset, bool include_keys, bool include_vals, - const adlb_buffer *caller_buffer, - adlb_buffer *output) + const adlb_buffer* caller_buffer, + adlb_buffer* output) { int c = 0; // Count of members seen adlb_data_code dc; - struct table_bp* members = cont->members; + struct table_bp* members = container->members; bool use_caller_buf; dc = ADLB_Init_buf(caller_buffer, output, &use_caller_buf, 65536); @@ -1459,18 +1460,18 @@ extract_members(adlb_container *cont, int count, int offset, { if (c >= count+offset && count != -1) { - TRACE("Got %i/%i items, done\n", c+1, count); + TRACE("packed c=%i / count=%i items, done", c, count); goto extract_members_done; } - dc = pack_member(cont, item, include_keys, include_vals, &tmp_buf, + dc = pack_member(container, item, include_keys, include_vals, &tmp_buf, output, &use_caller_buf, &output_pos); ADLB_DATA_CHECK_CODE(dc); } c++; } - TRACE("Got %i/%i entries at offset %i table size %i\n", c-offset, count, - offset, members->size); + TRACE("packed final %i/%i entries at offset=%i total=%i", + c-offset, count, offset, members->size); // Should have found requested number if (count != -1 && c - offset != count) { @@ -1482,7 +1483,7 @@ extract_members(adlb_container *cont, int count, int offset, extract_members_done: // Mark actual length of output output->length = output_pos; - TRACE("extract_members: output_length: %zu\n", output->length); + TRACE("output_length: %zu", output->length); return ADLB_DATA_SUCCESS; } @@ -1538,14 +1539,15 @@ enumerate_slice_size(int offset, int count, int actual_size) @param container_id @param count maximum number of elements to return, negative for unlimited @param offset offset of member to start at + @param include_keys whether to include keys in result + @param include_vals whether to include values in result @param data Filled in with output location for encoded binary keys and values. Members are stored with key first, then value. The length in bytes of the key and value is encoded with vint_encode and prefixed to the actual data - @param length Length of data in data - @param include_keys whether to include keys in result - @param include_vals whether to include values in result - @param actual Returns the number of entries in the container + @param actual The number of entries returned + @param key_type The key type for the container + @param val_type The value type for the container */ adlb_data_code xlb_data_enumerate(adlb_datum_id id, int count, int offset, @@ -1554,7 +1556,8 @@ xlb_data_enumerate(adlb_datum_id id, int count, int offset, adlb_buffer *data, int* actual, adlb_data_type *key_type, adlb_data_type *val_type) { - TRACE("data_enumerate(%"PRId64")", id); + TRACE("id=%"PRId64" keys=%i vals=%i offset=%i", + id, include_keys, include_vals, offset); adlb_datum* d; adlb_data_code dc = xlb_datum_lookup(id, &d); ADLB_DATA_CHECK_CODE(dc); @@ -1575,8 +1578,8 @@ xlb_data_enumerate(adlb_datum_id id, int count, int offset, *actual = slice_size; *key_type = (adlb_data_type)d->data.CONTAINER.key_type; *val_type = (adlb_data_type)d->data.CONTAINER.val_type; - TRACE("Enumerate container: %i elems %zu bytes\n", slice_size, - data->length); + TRACE("extracted: members=%i bytes=%zu", + slice_size, data->length); return ADLB_DATA_SUCCESS; } else if (d->type == ADLB_DATA_TYPE_MULTISET) diff --git a/lb/code/src/debug.h b/lb/code/src/debug.h index 8f0f1a7d5..717149850 100644 --- a/lb/code/src/debug.h +++ b/lb/code/src/debug.h @@ -84,10 +84,11 @@ adlb_code xlb_debug_check_environment(void); #endif #if ENABLE_LOG_TRACE && !defined(NDEBUG) -#define TRACE(format, args...) \ - { if (xlb_trace_enabled && xlb_debug_enabled) { \ - printf("%5.4f ADLB_TRACE: " format "\n", xlb_wtime(), ## args); \ - fflush(stdout); \ +#define TRACE(format, args...) \ + { if (xlb_trace_enabled && xlb_debug_enabled) { \ + printf("%5.4f ADLB_TRACE: %s(): " format "\n", \ + xlb_wtime(), __func__, ## args); \ + fflush(stdout); \ } } #else #define TRACE(format, args...) // noop diff --git a/lb/code/src/handlers.c b/lb/code/src/handlers.c index 6b09526d1..afa2e4bbd 100644 --- a/lb/code/src/handlers.c +++ b/lb/code/src/handlers.c @@ -1057,7 +1057,7 @@ static adlb_code handle_multicreate(int caller) { // MPE_LOG(xlb_mpe_svr_multicreate_start); - TRACE("ADLB_TAG_MULTICREATE\n"); + TRACE("caller=%i", caller); MPI_Status status; @@ -1086,7 +1086,7 @@ handle_multicreate(int caller) free(specs); ADLB_DATA_CHECK(dc); - TRACE("ADLB_TAG_MULTICREATE done\n"); + TRACE("done"); // MPE_LOG(xlb_mpe_svr_multicreate_end); return ADLB_SUCCESS; } @@ -1297,7 +1297,7 @@ handle_retrieve(int caller) static adlb_code handle_enumerate(int caller) { - TRACE("ENUMERATE\n"); + TRACE("..."); struct packed_enumerate opts; adlb_code rc; MPI_Status status; @@ -1305,12 +1305,12 @@ handle_enumerate(int caller) ADLB_TAG_ENUMERATE); adlb_buffer data = { .data = NULL, .length = 0 }; - struct packed_enumerate_result res; + struct packed_enumerate_result result; adlb_data_code dc; dc = xlb_data_enumerate(opts.id, opts.count, opts.offset, opts.request_subscripts, opts.request_members, - &xlb_xfer_buf, &data, &res.records, - &res.key_type, &res.val_type); + &xlb_xfer_buf, &data, &result.records, + &result.key_type, &result.val_type); bool free_data = (dc == ADLB_DATA_SUCCESS && xlb_xfer_buf.data != data.data); if (dc == ADLB_DATA_SUCCESS) @@ -1319,11 +1319,10 @@ handle_enumerate(int caller) ADLB_CHECK(rc); } - res.dc = dc; - res.length = data.length; + result.dc = dc; + result.length = data.length; - - RSEND(&res, sizeof(res), MPI_BYTE, caller, ADLB_TAG_RESPONSE); + RSEND(&result, sizeof(result), MPI_BYTE, caller, ADLB_TAG_RESPONSE); if (dc == ADLB_DATA_SUCCESS) { if (opts.request_subscripts || opts.request_members) @@ -1342,7 +1341,6 @@ handle_enumerate(int caller) static adlb_code handle_subscribe(int caller) { - TRACE("ADLB_TAG_SUBSCRIBE\n"); MPE_LOG(xlb_mpe_svr_subscribe_start); MPI_Status status; diff --git a/lb/code/src/messaging.h b/lb/code/src/messaging.h index f761c9d58..4d1105ffb 100644 --- a/lb/code/src/messaging.h +++ b/lb/code/src/messaging.h @@ -718,6 +718,7 @@ xlb_unpack_id_sub(const void *buffer, adlb_datum_id *id, static inline adlb_code mpi_send_big(const void* data, size_t length, int target, int tag) { + TRACE("length=%zi target=%i", length, target); size_t chunk = 100*1024*1024; int chunk_int = (int) chunk; size_t chunks_full = length / chunk; @@ -734,8 +735,9 @@ mpi_send_big(const void* data, size_t length, int target, int tag) } static inline adlb_code -mpi_recv_big(void* xfer, size_t length, int caller, int tag) +mpi_recv_big(void* xfer, size_t length, int sender, int tag) { + TRACE("length=%zi sender=%i", length, sender); size_t chunk = 100*1024*1024; int chunk_int = (int) chunk; size_t chunks_full = length / chunk; @@ -745,10 +747,10 @@ mpi_recv_big(void* xfer, size_t length, int caller, int tag) MPI_Status status; for (int i = 0; i < chunks_full; i++) { - RECV(p, chunk_int, MPI_BYTE, caller, tag); + RECV(p, chunk_int, MPI_BYTE, sender, tag); p += chunk; } - RECV(p, remainder_int, MPI_BYTE, caller, tag); + RECV(p, remainder_int, MPI_BYTE, sender, tag); return ADLB_SUCCESS; } diff --git a/turbine/code/src/tcl/adlb/tcl-adlb.c b/turbine/code/src/tcl/adlb/tcl-adlb.c index da1be1f51..a4cb5b4a0 100644 --- a/turbine/code/src/tcl/adlb/tcl-adlb.c +++ b/turbine/code/src/tcl/adlb/tcl-adlb.c @@ -2813,19 +2813,22 @@ adlb_datum2tclobj(Tcl_Interp *interp, Tcl_Obj *const objv[], case ADLB_DATA_TYPE_INTEGER: dc = ADLB_Unpack_integer(&tmp.INTEGER, data, length); TCL_CONDITION(dc == ADLB_DATA_SUCCESS, - "Retrieve failed due to error unpacking integer data <%"PRId64"> error code: %i", id, dc); + "Retrieve failed due to error unpacking integer data " + "<%"PRId64"> error code: %i", id, dc); *result = Tcl_NewADLBInt(tmp.INTEGER); break; case ADLB_DATA_TYPE_REF: dc = ADLB_Unpack_ref(&tmp.REF, data, length, ADLB_NO_REFC, true); TCL_CONDITION(dc == ADLB_DATA_SUCCESS, - "Retrieve failed due to error unpacking reference data <%"PRId64"> error code: %i", id, dc); + "Retrieve failed due to error unpacking reference data " + "<%"PRId64"> error code: %i", id, dc); *result = Tcl_NewADLB_ID(tmp.REF.id); break; case ADLB_DATA_TYPE_FLOAT: dc = ADLB_Unpack_float(&tmp.FLOAT, data, length); TCL_CONDITION(dc == ADLB_DATA_SUCCESS, - "Retrieve failed due to error unpacking float data <%"PRId64"> error code: %i", id, dc); + "Retrieve failed due to error unpacking float data " + "<%"PRId64"> error code: %i", id, dc); *result = Tcl_NewDoubleObj(tmp.FLOAT); break; case ADLB_DATA_TYPE_STRING: @@ -2835,7 +2838,8 @@ adlb_datum2tclobj(Tcl_Interp *interp, Tcl_Obj *const objv[], dc = ADLB_Unpack_string(&tmp.STRING, (void*)data, length, false); TCL_CONDITION(dc == ADLB_DATA_SUCCESS, - "Retrieve failed due to error unpacking string data <%"PRId64"> length: %zi error code: %i", id, length, dc); + "Retrieve failed due to error unpacking string data " + "<%"PRId64"> length: %zi error code: %i", id, length, dc); *result = Tcl_NewStringObj(tmp.STRING.value, (int) (tmp.STRING.length-1)); break;