| ()⋯ |
// Build one CUDA array and one texture object for every index in
// [0, num_of_stts); both handle tables live in host memory.
// NOTE(review): identifiers were restored from a machine-translated listing;
// confirm helper names and signatures against their declarations.
cudaArray **cuda_arrays =
    (cudaArray **)malloc(num_of_stts * sizeof(cudaArray *));
cudaTextureObject_t *textObj =
    (cudaTextureObject_t *)malloc(num_of_stts * sizeof(cudaTextureObject_t));

for (int i = 0; i < num_of_stts; i++) {
    // Create the CUDA array from the i-th DFA matrix and its dimensions.
    // Assumes generate_cuda_array() returns a cudaArray* -- TODO confirm.
    cuda_arrays[i] = generate_cuda_array(get_dfa_matrix(i), get_dfa_width(i),
                                         get_dfa_height(i));
    // Derive the texture object from the array that was just created.
    textObj[i] = generate_texture_objects(cuda_arrays[i]);
}
| ()⋯ |
// Multi-stream pipeline: upload the input in nstreams segments, launch one
// kernel per stream, download the output in nstreams segments, then tear the
// streams down.
// NOTE(review): the kernel name `match` and the launch dimensions
// `blocks` / `threads` were restored from a machine-translated listing;
// confirm them against the actual declarations.
cudaStream_t *streams =
    (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

// Create multiple CUDA streams.
for (int i = 0; i < nstreams; i++) {
    cudaStreamCreate(&(streams[i]));
}

// Copy the input data to GPU memory; each stream copies one segment.
// NOTE(review): if input_len is not a multiple of nstreams the trailing
// remainder is not transferred -- confirm the caller pads the input.
for (int i = 0; i < nstreams; i++) {
    // Widen before multiplying so the product cannot overflow a narrower
    // operand type.
    long in_offset = (long)i * input_len / nstreams;
    cudaMemcpyAsync(d_input + in_offset, h_input + in_offset,
                    input_len * sizeof(char) / nstreams,
                    cudaMemcpyHostToDevice, streams[i]);
}

// Each stream processes the input data with one DFA (textObj[i]).
// NOTE(review): every launch receives the whole d_input / input_len, while
// stream i is only ordered after its own segment copy; verify the kernel
// never reads a segment uploaded by another stream, or add a join here.
// NOTE(review): textObj is indexed by the stream index; presumably
// nstreams <= number of texture objects -- confirm.
for (int i = 0; i < nstreams; i++) {
    match<<<blocks, threads, sm_size, streams[i]>>>(
        textObj[i], d_input, input_len, pattern_max_len, d_output,
        output_len);
}

// Wait for every kernel before reading d_output. The original listing issued
// the download without a stream argument; on the legacy default stream that
// implicitly ordered it after all kernels, so the same ordering is kept here
// explicitly.
for (int i = 0; i < nstreams; i++) {
    cudaStreamSynchronize(streams[i]);
}

// Copy the results back to the host CPU; each stream copies one segment.
for (int i = 0; i < nstreams; i++) {
    long out_offset = (long)i * output_len / nstreams;
    cudaMemcpyAsync(h_output + out_offset, d_output + out_offset,
                    output_len * sizeof(int) / nstreams,
                    cudaMemcpyDeviceToHost, streams[i]);
}

// Drain each stream before destroying it so h_output is complete when the
// host continues.
for (int i = 0; i < nstreams; i++) {
    cudaStreamSynchronize(streams[i]);
    cudaStreamDestroy(streams[i]);
}
| ()⋯ |