multithreading - cudaStreamSynchronize behavior under multiple threads
What is the behavior of cudaStreamSynchronize() in the following case?

Thread A pseudo code:
    while(true):
        submit a new CUDA kernel to cudaStreamX

Thread B pseudo code:
    call cudaStreamSynchronize(cudaStreamX)

My question is: when does thread B return? Thread A keeps pushing new CUDA kernels, so cudaStreamX never becomes idle.
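For concreteness, a minimal sketch of the scenario (untested; it uses std::thread rather than pthreads, and worker_kernel is just a placeholder name) might look like this:

    #include <cstdio>
    #include <atomic>
    #include <thread>

    __global__ void worker_kernel() { }   // placeholder kernel

    int main() {
        cudaStream_t cudaStreamX;
        cudaStreamCreate(&cudaStreamX);
        std::atomic<bool> run{true};

        // thread A: keeps submitting kernels into cudaStreamX
        std::thread threadA([&]{
            while (run) worker_kernel<<<1, 1, 0, cudaStreamX>>>();
        });

        // thread B: synchronizes on the same stream -- when does this return?
        std::thread threadB([&]{
            cudaStreamSynchronize(cudaStreamX);
            printf("thread B returned from cudaStreamSynchronize\n");
            run = false;   // let thread A exit so the program can finish
        });

        threadA.join();
        threadB.join();
        cudaStreamDestroy(cudaStreamX);
        return 0;
    }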
The API documentation isn't directly explicit about this, but the CUDA C programming guide is explicit: cudaStreamSynchronize() "takes a stream as a parameter and waits until all preceding commands in the given stream have completed."
Furthermore, I think it should be sensible that cudaStreamSynchronize() cannot reasonably take into account work issued to the stream after the cudaStreamSynchronize() call; that would more or less require it to know the future. cudaStreamSynchronize() should reasonably be expected to return after all previously issued work to the stream is complete.
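To make the "only previously issued work" idea concrete, here is a minimal sketch (my own illustration, not part of the test case below) that captures a specific point in the stream with an event and waits on that instead of on the whole stream; cudaEventSynchronize() then returns once the work issued before the cudaEventRecord() call has finished, regardless of what another thread pushes into the stream afterwards:

    // Illustrative sketch: wait only for the work that was in cudaStreamX
    // at the moment of the cudaEventRecord() call.
    cudaEvent_t checkpoint;
    cudaEventCreate(&checkpoint);

    // (thread A may keep launching kernels into cudaStreamX concurrently)

    cudaEventRecord(checkpoint, cudaStreamX); // marks "everything issued so far"
    cudaEventSynchronize(checkpoint);         // returns when that work is done,
                                              // regardless of later submissions
    cudaEventDestroy(checkpoint);

cudaStreamSynchronize() behaves the same way with respect to work issued before the call; the event just makes the cut-off point explicit.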
Putting together a simple experimental test app, the above description is what I observe:
$ cat t396.cu
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>

const int PTHREADS=2;
const int TRIGGER1=5;

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// elapsed time helper, in microseconds
long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

#define DELAY_T 1000000000ULL
// kernel that spins for approximately i*DELAY_T clock cycles
template <int type>
__global__ void delay_kern(int i){
  unsigned long long time = clock64();
#ifdef DEBUG
  printf("hello %d\n", type);
#endif
  while (clock64() < time+(i*DELAY_T));
}

volatile static int flag, flag0, loop_cnt;

// thread configuration structure
typedef struct {
    int my_thread_ordinal;
    pthread_t thread;
    cudaError_t status;
    cudaStream_t stream;
    int delay_usec;
} config_t;

// function executed by each thread assigned to the CUDA device
void *thread_func(void *arg)
{
    // unpack the config structure
    config_t *config = (config_t *)arg;
    int my_thread = config->my_thread_ordinal;
    cudaError_t cuda_status = cudaSuccess;
    cuda_status = cudaSetDevice(0);
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "cannot set focus to device %d, status = %d\n",
            0, cuda_status);
        config->status = cuda_status;
        pthread_exit(NULL);
    }
    printf("thread %d initialized\n", my_thread);
    switch(config->my_thread_ordinal){
      case 0:
        // master thread: keeps issuing delay kernels into the stream
        while (flag0) {
          delay_kern<0><<<1,1,0,config->stream>>>(1);
          if (loop_cnt++ > TRIGGER1) flag = 1;
          printf("master thread loop: %d\n", loop_cnt);
          usleep(config->delay_usec);
        }
        break;
      default:
        // slave thread: waits for the trigger, then synchronizes on the stream
        while (!flag);
        printf("slave thread issuing stream sync at loop count: %d\n", loop_cnt);
        cudaStreamSynchronize(config->stream);
        flag0 = 0;
        printf("slave thread set trigger and exit\n");
        break;
    }
    cudaCheckErrors("thread CUDA error");
    printf("thread %d complete\n", my_thread);
    config->status = cudaSuccess;
    return NULL;
}

int main(int argc, char* argv[])
{
    int mydelay_usec = 1;
    if (argc > 1) mydelay_usec = atoi(argv[1]);
    if ((mydelay_usec < 1) || (mydelay_usec > 10000000)) {
        printf("invalid delay time specified\n");
        return -1;
    }
    flag = 0; flag0 = 1; loop_cnt = 0;
    const int nthreads = PTHREADS;
    // create worker configs; each is passed as the argument to thread_func
    config_t* configs = (config_t*)malloc(sizeof(config_t) * nthreads);
    cudaSetDevice(0);
    cudaStream_t str;
    cudaStreamCreate(&str);
    // create a separate thread for each worker and execute thread_func
    for (int i = 0; i < nthreads; i++) {
        config_t *config = configs + i;
        config->my_thread_ordinal = i;
        config->stream = str;
        config->delay_usec = mydelay_usec;
        int status = pthread_create(&config->thread, NULL, thread_func, config);
        if (status) {
            fprintf(stderr, "cannot create thread for device %d, status = %d\n",
                i, status);
        }
    }
    // wait for device thread completion and check error status
    int status = 0;
    for (int i = 0; i < nthreads; i++) {
        pthread_join(configs[i].thread, NULL);
        status += configs[i].status;
    }
    if (status) return status;
    free(configs);
    return 0;
}
$ nvcc -arch=sm_61 -o t396 t396.cu -lpthread
$ time ./t396 100000
thread 0 initialized
thread 1 initialized
master thread loop: 1
master thread loop: 2
master thread loop: 3
master thread loop: 4
master thread loop: 5
master thread loop: 6
slave thread issuing stream sync at loop count: 7
master thread loop: 7
master thread loop: 8
master thread loop: 9
master thread loop: 10
master thread loop: 11
master thread loop: 12
master thread loop: 13
master thread loop: 14
master thread loop: 15
master thread loop: 16
master thread loop: 17
master thread loop: 18
master thread loop: 19
master thread loop: 20
master thread loop: 21
master thread loop: 22
master thread loop: 23
master thread loop: 24
master thread loop: 25
master thread loop: 26
master thread loop: 27
master thread loop: 28
master thread loop: 29
master thread loop: 30
master thread loop: 31
master thread loop: 32
master thread loop: 33
master thread loop: 34
master thread loop: 35
master thread loop: 36
master thread loop: 37
master thread loop: 38
master thread loop: 39
slave thread set trigger and exit
thread 1 complete
thread 0 complete

real    0m5.416s
user    0m2.990s
sys     0m1.623s
$
This will require some careful thought to understand. In a nutshell, the app issues kernels from one thread that each execute for about 0.7s before returning, while the other thread waits until a small number of kernels have been issued and then makes the cudaStreamSynchronize() call. The overall time measurement for the application defines when that call returned. As long as you keep the command-line parameter (the host delay between kernel launches) at a value less than about 0.5s, the app reliably exits in about 5.4s (this will vary depending on which GPU you are running on, but the overall app execution time should stay constant up to a fairly large value of the host delay parameter). In other words, the cudaStreamSynchronize() call returns once the kernels issued before it have completed, even though the master thread keeps issuing new kernels into the stream afterward.

If you specify a command-line parameter that is larger than the kernel duration on your machine, then the overall app execution time will be approximately 5 times your command-line parameter (in microseconds), since the trigger point for the cudaStreamSynchronize() call is 5.
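If you want to see the wait time of the synchronization call directly rather than inferring it from the overall run time, the dtime_usec() helper that is already defined in t396.cu (but otherwise unused) could bracket the call. A possible modification of the slave-thread branch, which I have not run, would be:

    // possible change to the slave-thread branch of thread_func(),
    // using the dtime_usec() helper defined near the top of t396.cu
    while (!flag);
    printf("slave thread issuing stream sync at loop count: %d\n", loop_cnt);
    long long sync_start = dtime_usec(0);
    cudaStreamSynchronize(config->stream);
    long long sync_time = dtime_usec(sync_start);
    printf("cudaStreamSynchronize returned after %f s\n", sync_time/(float)USECPSEC);
    flag0 = 0;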
In my case, I compiled and ran this on CUDA 8.0.61, Ubuntu 14.04, and a Pascal Titan X.