This is supposed to work in a loop (server) and delegate work/inquiry to a faulty library, here represented by the longrun() function call, to a thread with a time-out of tmax=3s. I placed synchronization vars and i am trying to wait for no more than this limit, but when longrun() hangs (run 4), it still waits the full time (7s) instead of the requested limit. Can anyone explain?
#include <unistd.h> #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #include <pthread.h> #include <sys/time.h> #include <iostream> using namespace std; string int2str(int i){ char buf[10]; // no larger int passed we hope int end = sprintf(buf, "%d", i); buf[end] = ''; return string(buf); } string longrun(int qi){ if(qi % 4 == 0) { sleep(7); return string("'---- to: ") + int2str(qi) + string("' (hang case)"); } else { sleep(1); return string("'okay to: ") + int2str(qi) + string("'"); } } struct tpack_t { // thread pack pthread_t thread; pthread_mutex_t mutex; pthread_cond_t go; // have a new value to run pthread_cond_t ready; // tell main thread we're done processing int newq; // predicate on go+ready condition for wait int qi; // place question as int to thread: question-int string res; // where i place the response tpack_t(); }; tpack_t::tpack_t() { pthread_mutex_init (&mutex, NULL); pthread_cond_init (&go, NULL); pthread_cond_init (&ready, NULL); newq = 0; } void set_cond_time(timespec* ctp, int tmax){ timeval now; gettimeofday(&now, NULL); ctp->tv_nsec = now.tv_usec * 1000UL; ctp->tv_sec = now.tv_sec + tmax; // now + max time! printf("[m] ... set to sleep for %d sec, i hope...n", tmax); } void take_faulty_evasive_action(tpack_t* tpx){ // basically kill thread, clean faulty library copy (that file) and restart it cout << "will work on it (restarting thread) soon!n"; tpx->newq = 0; // minimal action for now... } void* faulty_proc(void* arg){ tpack_t* tpx = (tpack_t*) arg; while(true){ pthread_mutex_lock(&tpx->mutex); while(tpx->newq == 0){ pthread_cond_wait(&tpx->go, &tpx->mutex); } printf("[t] to process : %dn", tpx->qi); fflush(stdout); // now i have a new value in qi, process it and place the answer in... res tpx->res = longrun(tpx->qi); tpx->newq = 0; pthread_mutex_unlock(&tpx->mutex); pthread_cond_signal(&tpx->ready); } } int main(int argc, char* argv[]){ cout << "n this presents the problem: idx = 4k -> hang case ...n ( challenge is to eliminate them by killing thread and restarting it )nn"; printf(" ETIMEDOUT = %d EINVAL = %d EPERM = %dnn", ETIMEDOUT, EINVAL, EPERM); tpack_t* tpx = new tpack_t(); pthread_create(&tpx->thread, NULL, &faulty_proc, (void*) tpx); // max wait time; more than that is a hanging indication! int numproc = 5; ++numproc; int tmax = 3; timespec cond_time; cond_time.tv_nsec = 0; int status, expired; // for timed wait on done condition! time_t t0 = time(NULL); for(int i=1; i<numproc; ++i){ expired = 0; pthread_mutex_lock(&tpx->mutex); tpx->qi = i; // init the question tpx->newq = 1; // ... predicate //pthread_mutex_unlock(&tpx->mutex); pthread_cond_signal(&tpx->go); // let it know that... while(tpx->newq == 1){ /// ---------------------- most amazing region, timedwait waits all the way! ---------------------- set_cond_time(&cond_time, tmax); // time must be FROM NOW! (abs time, not interval) time_t wt0 = time(NULL); status = pthread_cond_timedwait(&tpx->ready, &tpx->mutex, &cond_time); printf("[m] ---- t exited with status = %d (after %.2fs)n", status, difftime(time(NULL), wt0)); /// ----------------------------------------------------------------------------------------------- if (status == ETIMEDOUT){ printf("t ['t was and newq == %d]n", tpx->newq); if(tpx->newq == 1){ // check one more time, to elim race possibility expired = 1; break; } } else if(status != 0){ fprintf(stderr, "cond timewait for faulty to reply errored outn"); return 1; } } if(expired){ take_faulty_evasive_action(tpx); // kill thread, start new one, report failure below cout << "[m] :: interruption: default bad answer goes here for " << i << "nn"; } else { cout << "[m] :: end with ans: " << tpx->res << endl << endl; } pthread_mutex_unlock(&tpx->mutex); } time_t t1 = time(NULL); printf("took %.2f sec to runn", difftime(t1, t0)); }
Used ‘g++ -pthread code.cc’ to compile under linux (ubuntu 16.04). Output is:
this presents the problem: idx = 4k -> hang case ... ( challenge is to eliminate them by killing thread and restarting it ) ETIMEDOUT = 110 EINVAL = 22 EPERM = 1 [m] ... set to sleep for 3 sec, i hope... [t] to process : 1 [m] ---- exited with status = 0 (after 1.00s) [m] :: end with ans: 'okay to: 1' [m] ... set to sleep for 3 sec, i hope... [t] to process : 2 [m] ---- exited with status = 0 (after 1.00s) [m] :: end with ans: 'okay to: 2' [m] ... set to sleep for 3 sec, i hope... [t] to process : 3 [m] ---- exited with status = 0 (after 1.00s) [m] :: end with ans: 'okay to: 3' [m] ... set to sleep for 3 sec, i hope... [t] to process : 4 [m] ---- exited with status = 110 (after 7.00s) ['t was and newq == 0] [m] :: end with ans: '---- to: 4' (hang case) [m] ... set to sleep for 3 sec, i hope... [t] to process : 5 [m] ---- exited with status = 0 (after 1.00s) [m] :: end with ans: 'okay to: 5' took 11.00 sec to run
Advertisement
Answer
The problem is that faulty_proc()
keeps tpx->mutex
locked while it calls longrun()
, and the pthread_cond_timedwait()
call in main()
can’t return until it can re-acquire the mutex, even if the timeout expires.
If longrun()
doesn’t need the mutex to be locked – and that seems to be the case – you can unlock the mutex around that call and re-lock it before setting the completion flag and signalling the condition variable.