研究文章

面向众核加速器芯片的以缓存局部性为中心(Cache Locality-Centric)的并行字符串匹配

伪代码2

伪代码使用多个CUDA流来实现我们的方法。
( )
( )cudaArray **cuda_arrays = (cudaArray **)malloc(num_of_stts * sizeof(cudaArray *));
( )cudaTextureObject_t *texObj = (cudaTextureObject_t *)malloc(num_of_stts *
sizeof(cudaTextureObject_t));
( )for (int i = 0; i < num_of_stts; i++) {
( )cuda_arrays[i] = generate_cuda_array(get_dfa_matrix(i), get_dfa_width(i),
get_dfa_height(i));
( )texObj[i] = generate_texture_objects(cuda_arrays[i]);
( )}
( )
( )cudaStream_t *streams =
( )(cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
( )// 创建多个CUDA流
( )for (int i = 0; i < nstreams; i++)
( )cudaStreamCreate(&(streams[i]));
( )
( )// 将数据复制到GPU内存,每个流拷贝一个段
( )for (int i = 0; i < nstreams; i++) {
( )long in_offset = i * input_len / nstreams;
( )cudaMemcpyAsync(d_input + in_offset, h_input + in_offset, input_len *
sizeof(char) / nstreams, cudaMemcpyHostToDevice, streams[i]);
( )}
( )// 每个流使用各自的DFA(texObj[i])处理输入数据
( )for (int i = 0; i < nstreams; i++)
( )match<<<blocks, threads, sm_size, streams[i]>>>(texObj[i], d_input,
input_len, pattern_max_len, d_output, output_len);
( )
( )// 将结果复制回主机CPU,每个流拷贝一个段
( )for (int i = 0; i < nstreams; i++) {
( )long out_offset = i * output_len / nstreams;
( )cudaMemcpyAsync(h_output + out_offset, d_output + out_offset, output_len *
sizeof(int) / nstreams, cudaMemcpyDeviceToHost);
( )}
( )for (int i = 0; i < nstreams; i++)
( )cudaStreamDestroy(streams[i]);
( )
( )