GNSS-SDR 0.0.21
An Open Source GNSS Software Defined Receiver
Loading...
Searching...
No Matches
fft_internal.h
Go to the documentation of this file.
1/*!
2 * \file fft_internal.h
3 * \brief Internals of FFT for OpenCL
4 *
5 *
6 * Version: <1.0>
7 *
8 * Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
9 * SPDX-License-Identifier: LicenseRef-Apple-Permissive
10 *
11 *
12 */
13
14
15#ifndef __CLFFT_INTERNAL_H
16#define __CLFFT_INTERNAL_H
17
18#include "clFFT.h"
19#include <iostream>
20#include <sstream>
21#include <string>
22
23using namespace std;
24
/*!
 * \brief Tag selecting one of three kernel directions: x, y or z.
 *
 * Stored in the `dir` member of cl_fft_kernel_info and passed as the second
 * argument of FFT1D(). NOTE(review): presumably each value names the signal
 * axis a kernel transforms along -- confirm against the kernel generator.
 */
25typedef enum kernel_dir_t
26{
27 cl_fft_kernel_x,
28 cl_fft_kernel_y,
29 cl_fft_kernel_z
30} cl_fft_kernel_dir;
31
/*!
 * \brief Record describing one kernel; records chain together through `next`.
 *
 * The plan structure in this header holds a pointer to the first record and
 * describes the chain as the linked list of kernels to execute for an fft.
 *
 * NOTE(review): the meanings below are read off the member names only;
 * units and valid ranges are not shown in this header -- confirm against the
 * code that fills these records in.
 *  - kernel / kernel_name: the OpenCL kernel object and, presumably, the
 *    name it has in the generated program source.
 *  - lmem_size: presumably the local memory the kernel requires.
 *  - num_workgroups, num_xforms_per_workgroup, num_workitems_per_workgroup:
 *    presumably the launch geometry of the kernel.
 *  - dir: direction tag of the kernel (x, y or z).
 *  - in_place_possible: integer flag; presumably non-zero when the kernel
 *    can run in place.
 *  - next: following record in the list. The member is declared with the
 *    bare tag `kernel_info_t` (no `struct` keyword), which is only valid
 *    C++, so this header cannot be compiled as C.
 */
32typedef struct kernel_info_t
33{
34 cl_kernel kernel;
35 char *kernel_name;
36 unsigned lmem_size;
37 unsigned num_workgroups;
38 unsigned num_xforms_per_workgroup;
39 unsigned num_workitems_per_workgroup;
40 cl_fft_kernel_dir dir;
41 int in_place_possible;
42 kernel_info_t *next;
43} cl_fft_kernel_info;
44
45typedef struct
46{
47 // context in which fft resources are created and kernels are executed
48 cl_context context;
49
50 // size of signal
51 clFFT_Dim3 n;
52
53 // dimension of transform ... must be either 1D, 2D or 3D
54 clFFT_Dimension dim;
55
56 // data format ... must be either interleaved or plannar
57 clFFT_DataFormat format;
58
59 // string containing kernel source. Generated at runtime based on
60 // n, dim, format and other parameters
61 string *kernel_string;
62
63 // CL program containing source and kernel this particular
64 // n, dim, data format
65 cl_program program;
66
67 // linked list of kernels which needs to be executed for this fft
68 cl_fft_kernel_info *kernel_info;
69
70 // number of kernels
71 int num_kernels;
72
73 // twist kernel for virtualizing fft of very large sizes that do not
74 // fit in GPU global memory
75 cl_kernel twist_kernel;
76
77 // flag indicating if temporary intermediate buffer is needed or not.
78 // this depends on fft kernels being executed and if transform is
79 // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
80 // one that does not require global transpose do not need temporary buffer)
81 // 2D 1024x1024 out-of-place fft however do require intermediate buffer.
82 // If temp buffer is needed, its allocation is lazy i.e. its not allocated
83 // until its needed
84 cl_int temp_buffer_needed;
85
86 // Batch size is runtime parameter and size of temporary buffer (if needed)
87 // depends on batch size. Allocation of temporary buffer is lazy i.e. its
88 // only created when needed. Once its created at first call of clFFT_Executexxx
89 // it is not allocated next time if next time clFFT_Executexxx is called with
90 // batch size different than the first call. last_batch_size caches the last
91 // batch size with which this plan is used so that we dont keep allocating/deallocating
92 // temp buffer if same batch size is used again and again.
93 unsigned last_batch_size;
94
95 // temporary buffer for interleaved plan
96 cl_mem tempmemobj;
97
98 // temporary buffer for planner plan. Only one of tempmemobj or
99 // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
100 // data format of plan (plannar or interleaved)
101 cl_mem tempmemobj_real, tempmemobj_imag;
102
103 // Maximum size of signal for which local memory transposed based
104 // fft is sufficient i.e. no global mem transpose (communication)
105 // is needed
106 unsigned max_localmem_fft_size;
107
108 // Maximum work items per work group allowed. This, along with max_radix below controls
109 // maximum local memory being used by fft kernels of this plan. Set to 256 by default
110 unsigned max_work_item_per_workgroup;
111
112 // Maximum base radix for local memory fft ... this controls the maximum register
113 // space used by work items. Currently defaults to 16
114 unsigned max_radix;
115
116 // Device depended parameter that tells how many work-items need to be read consecutive
117 // values to make sure global memory access by work-items of a work-group result in
118 // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
119 unsigned min_mem_coalesce_width;
120
121 // Number of local memory banks. This is used to geneate kernel with local memory
122 // transposes with appropriate padding to avoid bank conflicts to local memory
123 // e.g. on NVidia it is 16.
124 unsigned num_local_mem_banks;
126
// Declaration only; no definition is visible in this header. Takes a pointer
// to the plan and a kernel direction tag, and returns nothing.
// NOTE(review): presumably generates the 1D FFT kernel(s) for direction `dir`
// and records them in the plan -- confirm against the definition.
127void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
128
129#endif
FFT in OpenCL.
STL namespace.