GNSS-SDR  0.0.19
An Open Source GNSS Software Defined Receiver
fft_internal.h
Go to the documentation of this file.
1 /*!
2  * \file fft_internal.h
3  * \brief Internals of FFT for OpenCL
4  *
5  *
6  * Version: <1.0>
7  *
8  * Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
9  * SPDX-License-Identifier: LicenseRef-Apple-Permissive
10  *
11  *
12  */
13 
14 
15 #ifndef __CLFFT_INTERNAL_H
16 #define __CLFFT_INTERNAL_H
17 
18 #include "clFFT.h"
19 #include <iostream>
20 #include <sstream>
21 #include <string>
22 
23 using namespace std;
24 
/*!
 * \brief Direction tag carried by every FFT kernel descriptor.
 *
 * The three enumerators are named after the x, y and z directions.
 * NOTE(review): presumably this selects the axis of the signal a kernel
 * transforms -- confirm against the kernel generator.
 */
typedef enum kernel_dir_t
{
    cl_fft_kernel_x = 0,
    cl_fft_kernel_y = 1,
    cl_fft_kernel_z = 2
} cl_fft_kernel_dir;
31 
32 typedef struct kernel_info_t
33 {
34  cl_kernel kernel;
35  char *kernel_name;
36  unsigned lmem_size;
37  unsigned num_workgroups;
38  unsigned num_xforms_per_workgroup;
39  unsigned num_workitems_per_workgroup;
40  cl_fft_kernel_dir dir;
41  int in_place_possible;
42  kernel_info_t *next;
44 
45 typedef struct
46 {
47  // context in which fft resources are created and kernels are executed
48  cl_context context;
49 
50  // size of signal
51  clFFT_Dim3 n;
52 
53  // dimension of transform ... must be either 1D, 2D or 3D
54  clFFT_Dimension dim;
55 
56  // data format ... must be either interleaved or plannar
57  clFFT_DataFormat format;
58 
59  // string containing kernel source. Generated at runtime based on
60  // n, dim, format and other parameters
61  string *kernel_string;
62 
63  // CL program containing source and kernel this particular
64  // n, dim, data format
65  cl_program program;
66 
67  // linked list of kernels which needs to be executed for this fft
68  cl_fft_kernel_info *kernel_info;
69 
70  // number of kernels
71  int num_kernels;
72 
73  // twist kernel for virtualizing fft of very large sizes that do not
74  // fit in GPU global memory
75  cl_kernel twist_kernel;
76 
77  // flag indicating if temporary intermediate buffer is needed or not.
78  // this depends on fft kernels being executed and if transform is
79  // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
80  // one that does not require global transpose do not need temporary buffer)
81  // 2D 1024x1024 out-of-place fft however do require intermediate buffer.
82  // If temp buffer is needed, its allocation is lazy i.e. its not allocated
83  // until its needed
84  cl_int temp_buffer_needed;
85 
86  // Batch size is runtime parameter and size of temporary buffer (if needed)
87  // depends on batch size. Allocation of temporary buffer is lazy i.e. its
88  // only created when needed. Once its created at first call of clFFT_Executexxx
89  // it is not allocated next time if next time clFFT_Executexxx is called with
90  // batch size different than the first call. last_batch_size caches the last
91  // batch size with which this plan is used so that we dont keep allocating/deallocating
92  // temp buffer if same batch size is used again and again.
93  unsigned last_batch_size;
94 
95  // temporary buffer for interleaved plan
96  cl_mem tempmemobj;
97 
98  // temporary buffer for planner plan. Only one of tempmemobj or
99  // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
100  // data format of plan (plannar or interleaved)
101  cl_mem tempmemobj_real, tempmemobj_imag;
102 
103  // Maximum size of signal for which local memory transposed based
104  // fft is sufficient i.e. no global mem transpose (communication)
105  // is needed
106  unsigned max_localmem_fft_size;
107 
108  // Maximum work items per work group allowed. This, along with max_radix below controls
109  // maximum local memory being used by fft kernels of this plan. Set to 256 by default
110  unsigned max_work_item_per_workgroup;
111 
112  // Maximum base radix for local memory fft ... this controls the maximum register
113  // space used by work items. Currently defaults to 16
114  unsigned max_radix;
115 
116  // Device depended parameter that tells how many work-items need to be read consecutive
117  // values to make sure global memory access by work-items of a work-group result in
118  // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
119  unsigned min_mem_coalesce_width;
120 
121  // Number of local memory banks. This is used to geneate kernel with local memory
122  // transposes with appropriate padding to avoid bank conflicts to local memory
123  // e.g. on NVidia it is 16.
124  unsigned num_local_mem_banks;
125 } cl_fft_plan;
126 
127 void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
128 
129 #endif
FFT in OpenCL.
STL namespace.