|
tesseract 3.04.01
|
00001 /****************************************************************************** 00002 ** Filename: cluster.c 00003 ** Purpose: Routines for clustering points in N-D space 00004 ** Author: Dan Johnson 00005 ** History: 5/29/89, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 #include "const.h" 00019 #include "cluster.h" 00020 #include "emalloc.h" 00021 #include "genericheap.h" 00022 #include "helpers.h" 00023 #include "kdpair.h" 00024 #include "matrix.h" 00025 #include "tprintf.h" 00026 #include "danerror.h" 00027 #include "freelist.h" 00028 #include <math.h> 00029 00030 #define HOTELLING 1 // If true use Hotelling's test to decide where to split. 00031 #define FTABLE_X 10 // Size of FTable. 00032 #define FTABLE_Y 100 // Size of FTable. 00033 00034 // Table of values approximating the cumulative F-distribution for a confidence of 1%. 00035 const double FTable[FTABLE_Y][FTABLE_X] = { 00036 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,}, 00037 {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,}, 00038 {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,}, 00039 {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,}, 00040 {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,}, 00041 {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,}, 00042 {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,}, 00043 {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,}, 00044 {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,}, 00045 {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,}, 00046 { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,}, 00047 { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,}, 00048 { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,}, 00049 { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,}, 00050 { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,}, 00051 { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,}, 00052 { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,}, 00053 { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,}, 00054 { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,}, 00055 { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,}, 00056 { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,}, 00057 { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,}, 00058 { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,}, 00059 { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,}, 00060 { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,}, 00061 { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,}, 00062 { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,}, 00063 { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,}, 00064 { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,}, 00065 { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,}, 00066 { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,}, 00067 { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,}, 00068 { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,}, 00069 { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,}, 00070 { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,}, 00071 { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,}, 00072 { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,}, 00073 { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,}, 00074 { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,}, 00075 { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,}, 00076 { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,}, 00077 { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,}, 00078 { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,}, 00079 { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,}, 00080 { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,}, 00081 { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,}, 00082 { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,}, 00083 { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,}, 00084 { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,}, 00085 { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,}, 00086 { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,}, 00087 { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,}, 00088 { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,}, 00089 { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,}, 00090 { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,}, 00091 { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,}, 00092 { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,}, 00093 { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,}, 00094 { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,}, 00095 { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,}, 00096 { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,}, 00097 { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,}, 00098 { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,}, 00099 { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,}, 00100 { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,}, 00101 { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,}, 00102 { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,}, 00103 { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,}, 00104 { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,}, 00105 { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,}, 00106 { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,}, 00107 { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,}, 00108 { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,}, 00109 { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,}, 00110 { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,}, 00111 { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,}, 00112 { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,}, 00113 { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,}, 00114 { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,}, 00115 { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,}, 00116 { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,}, 00117 { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,}, 00118 { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,}, 00119 { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,}, 00120 { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,}, 00121 { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,}, 00122 { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,}, 00123 { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,}, 00124 { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,}, 00125 { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,}, 00126 { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,}, 00127 { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,}, 00128 { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,}, 00129 { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,}, 00130 { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,}, 00131 { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,}, 00132 { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,}, 00133 { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,}, 00134 { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,}, 00135 { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503} 00136 }; 00137 00142 #define MINVARIANCE 0.0004 00143 00150 #define MINSAMPLESPERBUCKET 5 00151 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET) 00152 #define MINSAMPLESNEEDED 1 00153 00160 #define BUCKETTABLESIZE 1024 00161 #define NORMALEXTENT 3.0 00162 00163 struct TEMPCLUSTER { 00164 CLUSTER *Cluster; 00165 CLUSTER *Neighbor; 00166 }; 00167 00168 typedef tesseract::KDPairInc<float, TEMPCLUSTER*> ClusterPair; 00169 typedef tesseract::GenericHeap<ClusterPair> ClusterHeap; 00170 00171 struct STATISTICS { 00172 FLOAT32 AvgVariance; 00173 FLOAT32 *CoVariance; 00174 FLOAT32 *Min; // largest negative distance from the mean 00175 FLOAT32 *Max; // largest positive distance from the mean 00176 }; 00177 00178 struct BUCKETS { 00179 DISTRIBUTION Distribution; // distribution being tested for 00180 uinT32 SampleCount; // # of samples in histogram 00181 FLOAT64 Confidence; // confidence level of test 00182 FLOAT64 ChiSquared; // test threshold 00183 uinT16 NumberOfBuckets; // number of cells in histogram 00184 uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets 00185 uinT32 *Count; // frequency of occurrence histogram 00186 FLOAT32 *ExpectedCount; // expected histogram 00187 }; 00188 00189 struct CHISTRUCT{ 00190 uinT16 DegreesOfFreedom; 00191 FLOAT64 Alpha; 00192 FLOAT64 ChiSquared; 00193 }; 00194 00195 // For use with KDWalk / MakePotentialClusters 00196 struct ClusteringContext { 00197 ClusterHeap *heap; // heap used to hold temp clusters, "best" on top 00198 TEMPCLUSTER *candidates; // array of potential clusters 00199 KDTREE *tree; // kd-tree to be searched for neighbors 00200 inT32 next; // next candidate to be used 00201 }; 00202 00203 typedef FLOAT64 (*DENSITYFUNC) (inT32); 00204 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double); 00205 00206 #define Odd(N) ((N)%2) 00207 #define Mirror(N,R) ((R) - (N) - 1) 00208 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) ) 00209 00210 //--------------Global Data Definitions and Declarations---------------------- 00218 #define SqrtOf2Pi 2.506628275 00219 static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT); 00220 static const FLOAT64 kNormalVariance = 00221 (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT); 00222 static const FLOAT64 kNormalMagnitude = 00223 (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE); 00224 static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2; 00225 00228 #define LOOKUPTABLESIZE 8 00229 #define MAXDEGREESOFFREEDOM MAXBUCKETS 00230 00231 static const uinT32 kCountTable[LOOKUPTABLESIZE] = { 00232 MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000 00233 }; // number of samples 00234 00235 static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = { 00236 MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS 00237 }; // number of buckets 00238 00239 /*------------------------------------------------------------------------- 00240 Private Function Prototypes 00241 --------------------------------------------------------------------------*/ 00242 void CreateClusterTree(CLUSTERER *Clusterer); 00243 00244 void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, 00245 inT32 Level); 00246 00247 CLUSTER *FindNearestNeighbor(KDTREE *Tree, 00248 CLUSTER *Cluster, 00249 FLOAT32 *Distance); 00250 00251 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster); 00252 00253 inT32 MergeClusters (inT16 N, 00254 register PARAM_DESC ParamDesc[], 00255 register inT32 n1, 00256 register inT32 n2, 00257 register FLOAT32 m[], 00258 register FLOAT32 m1[], register FLOAT32 m2[]); 00259 00260 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); 00261 00262 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, 00263 CLUSTERCONFIG *Config, 00264 CLUSTER *Cluster); 00265 00266 PROTOTYPE *MakeDegenerateProto(uinT16 N, 00267 CLUSTER *Cluster, 00268 STATISTICS *Statistics, 00269 PROTOSTYLE Style, 00270 inT32 MinSamples); 00271 00272 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, 00273 CLUSTERCONFIG *Config, 00274 CLUSTER *Cluster, 00275 STATISTICS *Statistics); 00276 00277 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, 00278 CLUSTER *Cluster, 00279 STATISTICS *Statistics, 00280 BUCKETS *Buckets); 00281 00282 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, 00283 CLUSTER *Cluster, 00284 STATISTICS *Statistics, 00285 BUCKETS *Buckets); 00286 00287 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, 00288 CLUSTER *Cluster, 00289 STATISTICS *Statistics, 00290 BUCKETS *NormalBuckets, 00291 FLOAT64 Confidence); 00292 00293 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc); 00294 00295 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics); 00296 00297 STATISTICS *ComputeStatistics (inT16 N, 00298 PARAM_DESC ParamDesc[], CLUSTER * Cluster); 00299 00300 PROTOTYPE *NewSphericalProto(uinT16 N, 00301 CLUSTER *Cluster, 00302 STATISTICS *Statistics); 00303 00304 PROTOTYPE *NewEllipticalProto(inT16 N, 00305 CLUSTER *Cluster, 00306 STATISTICS *Statistics); 00307 00308 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics); 00309 00310 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster); 00311 00312 BOOL8 Independent (PARAM_DESC ParamDesc[], 00313 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence); 00314 00315 BUCKETS *GetBuckets(CLUSTERER* clusterer, 00316 DISTRIBUTION Distribution, 00317 uinT32 SampleCount, 00318 FLOAT64 Confidence); 00319 00320 BUCKETS *MakeBuckets(DISTRIBUTION Distribution, 00321 uinT32 SampleCount, 00322 FLOAT64 Confidence); 00323 00324 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount); 00325 00326 FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha); 00327 00328 FLOAT64 NormalDensity(inT32 x); 00329 00330 FLOAT64 UniformDensity(inT32 x); 00331 00332 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx); 00333 00334 void FillBuckets(BUCKETS *Buckets, 00335 CLUSTER *Cluster, 00336 uinT16 Dim, 00337 PARAM_DESC *ParamDesc, 00338 FLOAT32 Mean, 00339 FLOAT32 StdDev); 00340 00341 uinT16 NormalBucket(PARAM_DESC *ParamDesc, 00342 FLOAT32 x, 00343 FLOAT32 Mean, 00344 FLOAT32 StdDev); 00345 00346 uinT16 UniformBucket(PARAM_DESC *ParamDesc, 00347 FLOAT32 x, 00348 FLOAT32 Mean, 00349 FLOAT32 StdDev); 00350 00351 BOOL8 DistributionOK(BUCKETS *Buckets); 00352 00353 void FreeStatistics(STATISTICS *Statistics); 00354 00355 void FreeBuckets(BUCKETS *Buckets); 00356 00357 void FreeCluster(CLUSTER *Cluster); 00358 00359 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets); 00360 00361 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram, 00362 void *arg2); // uinT16 *DesiredNumberOfBuckets); 00363 00364 int ListEntryMatch(void *arg1, void *arg2); 00365 00366 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount); 00367 00368 void InitBuckets(BUCKETS *Buckets); 00369 00370 int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct, 00371 void *arg2); // CHISTRUCT *SearchKey); 00372 00373 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha); 00374 00375 FLOAT64 Solve(SOLVEFUNC Function, 00376 void *FunctionParams, 00377 FLOAT64 InitialGuess, 00378 FLOAT64 Accuracy); 00379 00380 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x); 00381 00382 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, 00383 CLUSTER *Cluster, 00384 FLOAT32 MaxIllegal); 00385 00386 double InvertMatrix(const float* input, int size, float* inv); 00387 00388 //--------------------------Public Code-------------------------------------- 00399 CLUSTERER * 00400 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) { 00401 CLUSTERER *Clusterer; 00402 int i; 00403 00404 // allocate main clusterer data structure and init simple fields 00405 Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER)); 00406 Clusterer->SampleSize = SampleSize; 00407 Clusterer->NumberOfSamples = 0; 00408 Clusterer->NumChar = 0; 00409 00410 // init fields which will not be used initially 00411 Clusterer->Root = NULL; 00412 Clusterer->ProtoList = NIL_LIST; 00413 00414 // maintain a copy of param descriptors in the clusterer data structure 00415 Clusterer->ParamDesc = 00416 (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC)); 00417 for (i = 0; i < SampleSize; i++) { 00418 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular; 00419 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential; 00420 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min; 00421 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max; 00422 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min; 00423 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2; 00424 Clusterer->ParamDesc[i].MidRange = 00425 (ParamDesc[i].Max + ParamDesc[i].Min) / 2; 00426 } 00427 00428 // allocate a kd tree to hold the samples 00429 Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc); 00430 00431 // Initialize cache of histogram buckets to minimize recomputing them. 00432 for (int d = 0; d < DISTRIBUTION_COUNT; ++d) { 00433 for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c) 00434 Clusterer->bucket_cache[d][c] = NULL; 00435 } 00436 00437 return Clusterer; 00438 } // MakeClusterer 00439 00440 00457 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, 00458 inT32 CharID) { 00459 SAMPLE *Sample; 00460 int i; 00461 00462 // see if the samples have already been clustered - if so trap an error 00463 if (Clusterer->Root != NULL) 00464 DoError (ALREADYCLUSTERED, 00465 "Can't add samples after they have been clustered"); 00466 00467 // allocate the new sample and initialize it 00468 Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) + 00469 (Clusterer->SampleSize - 00470 1) * sizeof (FLOAT32)); 00471 Sample->Clustered = FALSE; 00472 Sample->Prototype = FALSE; 00473 Sample->SampleCount = 1; 00474 Sample->Left = NULL; 00475 Sample->Right = NULL; 00476 Sample->CharID = CharID; 00477 00478 for (i = 0; i < Clusterer->SampleSize; i++) 00479 Sample->Mean[i] = Feature[i]; 00480 00481 // add the sample to the KD tree - keep track of the total # of samples 00482 Clusterer->NumberOfSamples++; 00483 KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample); 00484 if (CharID >= Clusterer->NumChar) 00485 Clusterer->NumChar = CharID + 1; 00486 00487 // execute hook for monitoring clustering operation 00488 // (*SampleCreationHook)( Sample ); 00489 00490 return (Sample); 00491 } // MakeSample 00492 00493 00515 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { 00516 //only create cluster tree if samples have never been clustered before 00517 if (Clusterer->Root == NULL) 00518 CreateClusterTree(Clusterer); 00519 00520 //deallocate the old prototype list if one exists 00521 FreeProtoList (&Clusterer->ProtoList); 00522 Clusterer->ProtoList = NIL_LIST; 00523 00524 //compute prototypes starting at the root node in the tree 00525 ComputePrototypes(Clusterer, Config); 00526 return (Clusterer->ProtoList); 00527 } // ClusterSamples 00528 00529 00543 void FreeClusterer(CLUSTERER *Clusterer) { 00544 if (Clusterer != NULL) { 00545 memfree (Clusterer->ParamDesc); 00546 if (Clusterer->KDTree != NULL) 00547 FreeKDTree (Clusterer->KDTree); 00548 if (Clusterer->Root != NULL) 00549 FreeCluster (Clusterer->Root); 00550 // Free up all used buckets structures. 00551 for (int d = 0; d < DISTRIBUTION_COUNT; ++d) { 00552 for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c) 00553 if (Clusterer->bucket_cache[d][c] != NULL) 00554 FreeBuckets(Clusterer->bucket_cache[d][c]); 00555 } 00556 00557 memfree(Clusterer); 00558 } 00559 } // FreeClusterer 00560 00561 00571 void FreeProtoList(LIST *ProtoList) { 00572 destroy_nodes(*ProtoList, FreePrototype); 00573 } // FreeProtoList 00574 00575 00586 void FreePrototype(void *arg) { //PROTOTYPE *Prototype) 00587 PROTOTYPE *Prototype = (PROTOTYPE *) arg; 00588 00589 // unmark the corresponding cluster (if there is one 00590 if (Prototype->Cluster != NULL) 00591 Prototype->Cluster->Prototype = FALSE; 00592 00593 // deallocate the prototype statistics and then the prototype itself 00594 if (Prototype->Distrib != NULL) 00595 memfree (Prototype->Distrib); 00596 if (Prototype->Mean != NULL) 00597 memfree (Prototype->Mean); 00598 if (Prototype->Style != spherical) { 00599 if (Prototype->Variance.Elliptical != NULL) 00600 memfree (Prototype->Variance.Elliptical); 00601 if (Prototype->Magnitude.Elliptical != NULL) 00602 memfree (Prototype->Magnitude.Elliptical); 00603 if (Prototype->Weight.Elliptical != NULL) 00604 memfree (Prototype->Weight.Elliptical); 00605 } 00606 memfree(Prototype); 00607 } // FreePrototype 00608 00609 00625 CLUSTER *NextSample(LIST *SearchState) { 00626 CLUSTER *Cluster; 00627 00628 if (*SearchState == NIL_LIST) 00629 return (NULL); 00630 Cluster = (CLUSTER *) first_node (*SearchState); 00631 *SearchState = pop (*SearchState); 00632 while (TRUE) { 00633 if (Cluster->Left == NULL) 00634 return (Cluster); 00635 *SearchState = push (*SearchState, Cluster->Right); 00636 Cluster = Cluster->Left; 00637 } 00638 } // NextSample 00639 00640 00650 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) { 00651 return (Proto->Mean[Dimension]); 00652 } // Mean 00653 00654 00664 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) { 00665 switch (Proto->Style) { 00666 case spherical: 00667 return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical)); 00668 case elliptical: 00669 return ((FLOAT32) 00670 sqrt ((double) Proto->Variance.Elliptical[Dimension])); 00671 case mixed: 00672 switch (Proto->Distrib[Dimension]) { 00673 case normal: 00674 return ((FLOAT32) 00675 sqrt ((double) Proto->Variance.Elliptical[Dimension])); 00676 case uniform: 00677 case D_random: 00678 return (Proto->Variance.Elliptical[Dimension]); 00679 case DISTRIBUTION_COUNT: 00680 ASSERT_HOST(!"Distribution count not allowed!"); 00681 } 00682 } 00683 return 0.0f; 00684 } // StandardDeviation 00685 00686 00687 /*--------------------------------------------------------------------------- 00688 Private Code 00689 ----------------------------------------------------------------------------*/ 00705 void CreateClusterTree(CLUSTERER *Clusterer) { 00706 ClusteringContext context; 00707 ClusterPair HeapEntry; 00708 TEMPCLUSTER *PotentialCluster; 00709 00710 // each sample and its nearest neighbor form a "potential" cluster 00711 // save these in a heap with the "best" potential clusters on top 00712 context.tree = Clusterer->KDTree; 00713 context.candidates = (TEMPCLUSTER *) 00714 Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER)); 00715 context.next = 0; 00716 context.heap = new ClusterHeap(Clusterer->NumberOfSamples); 00717 KDWalk(context.tree, (void_proc)MakePotentialClusters, &context); 00718 00719 // form potential clusters into actual clusters - always do "best" first 00720 while (context.heap->Pop(&HeapEntry)) { 00721 PotentialCluster = HeapEntry.data; 00722 00723 // if main cluster of potential cluster is already in another cluster 00724 // then we don't need to worry about it 00725 if (PotentialCluster->Cluster->Clustered) { 00726 continue; 00727 } 00728 00729 // if main cluster is not yet clustered, but its nearest neighbor is 00730 // then we must find a new nearest neighbor 00731 else if (PotentialCluster->Neighbor->Clustered) { 00732 PotentialCluster->Neighbor = 00733 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, 00734 &HeapEntry.key); 00735 if (PotentialCluster->Neighbor != NULL) { 00736 context.heap->Push(&HeapEntry); 00737 } 00738 } 00739 00740 // if neither cluster is already clustered, form permanent cluster 00741 else { 00742 PotentialCluster->Cluster = 00743 MakeNewCluster(Clusterer, PotentialCluster); 00744 PotentialCluster->Neighbor = 00745 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, 00746 &HeapEntry.key); 00747 if (PotentialCluster->Neighbor != NULL) { 00748 context.heap->Push(&HeapEntry); 00749 } 00750 } 00751 } 00752 00753 // the root node in the cluster tree is now the only node in the kd-tree 00754 Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree); 00755 00756 // free up the memory used by the K-D tree, heap, and temp clusters 00757 FreeKDTree(context.tree); 00758 Clusterer->KDTree = NULL; 00759 delete context.heap; 00760 memfree(context.candidates); 00761 } // CreateClusterTree 00762 00763 00773 void MakePotentialClusters(ClusteringContext *context, 00774 CLUSTER *Cluster, inT32 Level) { 00775 ClusterPair HeapEntry; 00776 int next = context->next; 00777 context->candidates[next].Cluster = Cluster; 00778 HeapEntry.data = &(context->candidates[next]); 00779 context->candidates[next].Neighbor = 00780 FindNearestNeighbor(context->tree, 00781 context->candidates[next].Cluster, 00782 &HeapEntry.key); 00783 if (context->candidates[next].Neighbor != NULL) { 00784 context->heap->Push(&HeapEntry); 00785 context->next++; 00786 } 00787 } // MakePotentialClusters 00788 00789 00806 CLUSTER * 00807 FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance) 00808 #define MAXNEIGHBORS 2 00809 #define MAXDISTANCE MAX_FLOAT32 00810 { 00811 CLUSTER *Neighbor[MAXNEIGHBORS]; 00812 FLOAT32 Dist[MAXNEIGHBORS]; 00813 int NumberOfNeighbors; 00814 inT32 i; 00815 CLUSTER *BestNeighbor; 00816 00817 // find the 2 nearest neighbors of the cluster 00818 KDNearestNeighborSearch(Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE, 00819 &NumberOfNeighbors, (void **)Neighbor, Dist); 00820 00821 // search for the nearest neighbor that is not the cluster itself 00822 *Distance = MAXDISTANCE; 00823 BestNeighbor = NULL; 00824 for (i = 0; i < NumberOfNeighbors; i++) { 00825 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) { 00826 *Distance = Dist[i]; 00827 BestNeighbor = Neighbor[i]; 00828 } 00829 } 00830 return BestNeighbor; 00831 } // FindNearestNeighbor 00832 00833 00846 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) { 00847 CLUSTER *Cluster; 00848 00849 // allocate the new cluster and initialize it 00850 Cluster = (CLUSTER *) Emalloc( 00851 sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32)); 00852 Cluster->Clustered = FALSE; 00853 Cluster->Prototype = FALSE; 00854 Cluster->Left = TempCluster->Cluster; 00855 Cluster->Right = TempCluster->Neighbor; 00856 Cluster->CharID = -1; 00857 00858 // mark the old clusters as "clustered" and delete them from the kd-tree 00859 Cluster->Left->Clustered = TRUE; 00860 Cluster->Right->Clustered = TRUE; 00861 KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left); 00862 KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right); 00863 00864 // compute the mean and sample count for the new cluster 00865 Cluster->SampleCount = 00866 MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, 00867 Cluster->Left->SampleCount, Cluster->Right->SampleCount, 00868 Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean); 00869 00870 // add the new cluster to the KD tree 00871 KDStore(Clusterer->KDTree, Cluster->Mean, Cluster); 00872 return Cluster; 00873 } // MakeNewCluster 00874 00875 00891 inT32 MergeClusters(inT16 N, 00892 PARAM_DESC ParamDesc[], 00893 inT32 n1, 00894 inT32 n2, 00895 FLOAT32 m[], 00896 FLOAT32 m1[], FLOAT32 m2[]) { 00897 inT32 i, n; 00898 00899 n = n1 + n2; 00900 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) { 00901 if (ParamDesc->Circular) { 00902 // if distance between means is greater than allowed 00903 // reduce upper point by one "rotation" to compute mean 00904 // then normalize the mean back into the accepted range 00905 if ((*m2 - *m1) > ParamDesc->HalfRange) { 00906 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n; 00907 if (*m < ParamDesc->Min) 00908 *m += ParamDesc->Range; 00909 } 00910 else if ((*m1 - *m2) > ParamDesc->HalfRange) { 00911 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n; 00912 if (*m < ParamDesc->Min) 00913 *m += ParamDesc->Range; 00914 } 00915 else 00916 *m = (n1 * *m1 + n2 * *m2) / n; 00917 } 00918 else 00919 *m = (n1 * *m1 + n2 * *m2) / n; 00920 } 00921 return n; 00922 } // MergeClusters 00923 00924 00936 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { 00937 LIST ClusterStack = NIL_LIST; 00938 CLUSTER *Cluster; 00939 PROTOTYPE *Prototype; 00940 00941 // use a stack to keep track of clusters waiting to be processed 00942 // initially the only cluster on the stack is the root cluster 00943 if (Clusterer->Root != NULL) 00944 ClusterStack = push (NIL_LIST, Clusterer->Root); 00945 00946 // loop until we have analyzed all clusters which are potential prototypes 00947 while (ClusterStack != NIL_LIST) { 00948 // remove the next cluster to be analyzed from the stack 00949 // try to make a prototype from the cluster 00950 // if successful, put it on the proto list, else split the cluster 00951 Cluster = (CLUSTER *) first_node (ClusterStack); 00952 ClusterStack = pop (ClusterStack); 00953 Prototype = MakePrototype(Clusterer, Config, Cluster); 00954 if (Prototype != NULL) { 00955 Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype); 00956 } 00957 else { 00958 ClusterStack = push (ClusterStack, Cluster->Right); 00959 ClusterStack = push (ClusterStack, Cluster->Left); 00960 } 00961 } 00962 } // ComputePrototypes 00963 00964 00982 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, 00983 CLUSTERCONFIG *Config, 00984 CLUSTER *Cluster) { 00985 STATISTICS *Statistics; 00986 PROTOTYPE *Proto; 00987 BUCKETS *Buckets; 00988 00989 // filter out clusters which contain samples from the same character 00990 if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal)) 00991 return NULL; 00992 00993 // compute the covariance matrix and ranges for the cluster 00994 Statistics = 00995 ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster); 00996 00997 // check for degenerate clusters which need not be analyzed further 00998 // note that the MinSamples test assumes that all clusters with multiple 00999 // character samples have been removed (as above) 01000 Proto = MakeDegenerateProto( 01001 Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle, 01002 (inT32) (Config->MinSamples * Clusterer->NumChar)); 01003 if (Proto != NULL) { 01004 FreeStatistics(Statistics); 01005 return Proto; 01006 } 01007 // check to ensure that all dimensions are independent 01008 if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, 01009 Statistics->CoVariance, Config->Independence)) { 01010 FreeStatistics(Statistics); 01011 return NULL; 01012 } 01013 01014 if (HOTELLING && Config->ProtoStyle == elliptical) { 01015 Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics); 01016 if (Proto != NULL) { 01017 FreeStatistics(Statistics); 01018 return Proto; 01019 } 01020 } 01021 01022 // create a histogram data structure used to evaluate distributions 01023 Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, 01024 Config->Confidence); 01025 01026 // create a prototype based on the statistics and test it 01027 switch (Config->ProtoStyle) { 01028 case spherical: 01029 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets); 01030 break; 01031 case elliptical: 01032 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets); 01033 break; 01034 case mixed: 01035 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, 01036 Config->Confidence); 01037 break; 01038 case automatic: 01039 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets); 01040 if (Proto != NULL) 01041 break; 01042 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets); 01043 if (Proto != NULL) 01044 break; 01045 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, 01046 Config->Confidence); 01047 break; 01048 } 01049 FreeStatistics(Statistics); 01050 return Proto; 01051 } // MakePrototype 01052 01053 01077 PROTOTYPE *MakeDegenerateProto( //this was MinSample 01078 uinT16 N, 01079 CLUSTER *Cluster, 01080 STATISTICS *Statistics, 01081 PROTOSTYLE Style, 01082 inT32 MinSamples) { 01083 PROTOTYPE *Proto = NULL; 01084 01085 if (MinSamples < MINSAMPLESNEEDED) 01086 MinSamples = MINSAMPLESNEEDED; 01087 01088 if (Cluster->SampleCount < MinSamples) { 01089 switch (Style) { 01090 case spherical: 01091 Proto = NewSphericalProto (N, Cluster, Statistics); 01092 break; 01093 case elliptical: 01094 case automatic: 01095 Proto = NewEllipticalProto (N, Cluster, Statistics); 01096 break; 01097 case mixed: 01098 Proto = NewMixedProto (N, Cluster, Statistics); 01099 break; 01100 } 01101 Proto->Significant = FALSE; 01102 } 01103 return (Proto); 01104 } // MakeDegenerateProto 01105 01119 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, 01120 CLUSTERCONFIG *Config, 01121 CLUSTER *Cluster, 01122 STATISTICS *Statistics) { 01123 // Fraction of the number of samples used as a range around 1 within 01124 // which a cluster has the magic size that allows a boost to the 01125 // FTable by kFTableBoostMargin, thus allowing clusters near the 01126 // magic size (equal to the number of sample characters) to be more 01127 // likely to stay together. 01128 const double kMagicSampleMargin = 0.0625; 01129 const double kFTableBoostMargin = 2.0; 01130 01131 int N = Clusterer->SampleSize; 01132 CLUSTER* Left = Cluster->Left; 01133 CLUSTER* Right = Cluster->Right; 01134 if (Left == NULL || Right == NULL) 01135 return NULL; 01136 int TotalDims = Left->SampleCount + Right->SampleCount; 01137 if (TotalDims < N + 1 || TotalDims < 2) 01138 return NULL; 01139 const int kMatrixSize = N * N * sizeof(FLOAT32); 01140 FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize)); 01141 FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize)); 01142 FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32))); 01143 // Compute a new covariance matrix that only uses essential features. 01144 for (int i = 0; i < N; ++i) { 01145 int row_offset = i * N; 01146 if (!Clusterer->ParamDesc[i].NonEssential) { 01147 for (int j = 0; j < N; ++j) { 01148 if (!Clusterer->ParamDesc[j].NonEssential) 01149 Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset]; 01150 else 01151 Covariance[j + row_offset] = 0.0f; 01152 } 01153 } else { 01154 for (int j = 0; j < N; ++j) { 01155 if (i == j) 01156 Covariance[j + row_offset] = 1.0f; 01157 else 01158 Covariance[j + row_offset] = 0.0f; 01159 } 01160 } 01161 } 01162 double err = InvertMatrix(Covariance, N, Inverse); 01163 if (err > 1) { 01164 tprintf("Clustering error: Matrix inverse failed with error %g\n", err); 01165 } 01166 int EssentialN = 0; 01167 for (int dim = 0; dim < N; ++dim) { 01168 if (!Clusterer->ParamDesc[dim].NonEssential) { 01169 Delta[dim] = Left->Mean[dim] - Right->Mean[dim]; 01170 ++EssentialN; 01171 } else { 01172 Delta[dim] = 0.0f; 01173 } 01174 } 01175 // Compute Hotelling's T-squared. 01176 double Tsq = 0.0; 01177 for (int x = 0; x < N; ++x) { 01178 double temp = 0.0; 01179 for (int y = 0; y < N; ++y) { 01180 temp += Inverse[y + N*x] * Delta[y]; 01181 } 01182 Tsq += Delta[x] * temp; 01183 } 01184 memfree(Covariance); 01185 memfree(Inverse); 01186 memfree(Delta); 01187 // Changed this function to match the formula in 01188 // Statistical Methods in Medical Research p 473 01189 // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews. 01190 // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims; 01191 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN); 01192 int Fx = EssentialN; 01193 if (Fx > FTABLE_X) 01194 Fx = FTABLE_X; 01195 --Fx; 01196 int Fy = TotalDims - EssentialN - 1; 01197 if (Fy > FTABLE_Y) 01198 Fy = FTABLE_Y; 01199 --Fy; 01200 double FTarget = FTable[Fy][Fx]; 01201 if (Config->MagicSamples > 0 && 01202 TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) && 01203 TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) { 01204 // Give magic-sized clusters a magic FTable boost. 01205 FTarget += kFTableBoostMargin; 01206 } 01207 if (F < FTarget) { 01208 return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); 01209 } 01210 return NULL; 01211 } 01212 01226 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, 01227 CLUSTER *Cluster, 01228 STATISTICS *Statistics, 01229 BUCKETS *Buckets) { 01230 PROTOTYPE *Proto = NULL; 01231 int i; 01232 01233 // check that each dimension is a normal distribution 01234 for (i = 0; i < Clusterer->SampleSize; i++) { 01235 if (Clusterer->ParamDesc[i].NonEssential) 01236 continue; 01237 01238 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01239 Cluster->Mean[i], 01240 sqrt ((FLOAT64) (Statistics->AvgVariance))); 01241 if (!DistributionOK (Buckets)) 01242 break; 01243 } 01244 // if all dimensions matched a normal distribution, make a proto 01245 if (i >= Clusterer->SampleSize) 01246 Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics); 01247 return (Proto); 01248 } // MakeSphericalProto 01249 01250 01264 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, 01265 CLUSTER *Cluster, 01266 STATISTICS *Statistics, 01267 BUCKETS *Buckets) { 01268 PROTOTYPE *Proto = NULL; 01269 int i; 01270 01271 // check that each dimension is a normal distribution 01272 for (i = 0; i < Clusterer->SampleSize; i++) { 01273 if (Clusterer->ParamDesc[i].NonEssential) 01274 continue; 01275 01276 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01277 Cluster->Mean[i], 01278 sqrt ((FLOAT64) Statistics-> 01279 CoVariance[i * (Clusterer->SampleSize + 1)])); 01280 if (!DistributionOK (Buckets)) 01281 break; 01282 } 01283 // if all dimensions matched a normal distribution, make a proto 01284 if (i >= Clusterer->SampleSize) 01285 Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); 01286 return (Proto); 01287 } // MakeEllipticalProto 01288 01289 01307 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, 01308 CLUSTER *Cluster, 01309 STATISTICS *Statistics, 01310 BUCKETS *NormalBuckets, 01311 FLOAT64 Confidence) { 01312 PROTOTYPE *Proto; 01313 int i; 01314 BUCKETS *UniformBuckets = NULL; 01315 BUCKETS *RandomBuckets = NULL; 01316 01317 // create a mixed proto to work on - initially assume all dimensions normal*/ 01318 Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics); 01319 01320 // find the proper distribution for each dimension 01321 for (i = 0; i < Clusterer->SampleSize; i++) { 01322 if (Clusterer->ParamDesc[i].NonEssential) 01323 continue; 01324 01325 FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01326 Proto->Mean[i], 01327 sqrt ((FLOAT64) Proto->Variance.Elliptical[i])); 01328 if (DistributionOK (NormalBuckets)) 01329 continue; 01330 01331 if (RandomBuckets == NULL) 01332 RandomBuckets = 01333 GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence); 01334 MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i])); 01335 FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01336 Proto->Mean[i], Proto->Variance.Elliptical[i]); 01337 if (DistributionOK (RandomBuckets)) 01338 continue; 01339 01340 if (UniformBuckets == NULL) 01341 UniformBuckets = 01342 GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence); 01343 MakeDimUniform(i, Proto, Statistics); 01344 FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01345 Proto->Mean[i], Proto->Variance.Elliptical[i]); 01346 if (DistributionOK (UniformBuckets)) 01347 continue; 01348 break; 01349 } 01350 // if any dimension failed to match a distribution, discard the proto 01351 if (i < Clusterer->SampleSize) { 01352 FreePrototype(Proto); 01353 Proto = NULL; 01354 } 01355 return (Proto); 01356 } // MakeMixedProto 01357 01358 01369 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) { 01370 Proto->Distrib[i] = D_random; 01371 Proto->Mean[i] = ParamDesc->MidRange; 01372 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange; 01373 01374 // subtract out the previous magnitude of this dimension from the total 01375 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; 01376 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range; 01377 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01378 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01379 01380 // note that the proto Weight is irrelevant for D_random protos 01381 } // MakeDimRandom 01382 01383 01394 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) { 01395 Proto->Distrib[i] = uniform; 01396 Proto->Mean[i] = Proto->Cluster->Mean[i] + 01397 (Statistics->Min[i] + Statistics->Max[i]) / 2; 01398 Proto->Variance.Elliptical[i] = 01399 (Statistics->Max[i] - Statistics->Min[i]) / 2; 01400 if (Proto->Variance.Elliptical[i] < MINVARIANCE) 01401 Proto->Variance.Elliptical[i] = MINVARIANCE; 01402 01403 // subtract out the previous magnitude of this dimension from the total 01404 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; 01405 Proto->Magnitude.Elliptical[i] = 01406 1.0 / (2.0 * Proto->Variance.Elliptical[i]); 01407 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01408 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01409 01410 // note that the proto Weight is irrelevant for uniform protos 01411 } // MakeDimUniform 01412 01413 01430 STATISTICS * 01431 ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) { 01432 STATISTICS *Statistics; 01433 int i, j; 01434 FLOAT32 *CoVariance; 01435 FLOAT32 *Distance; 01436 LIST SearchState; 01437 SAMPLE *Sample; 01438 uinT32 SampleCountAdjustedForBias; 01439 01440 // allocate memory to hold the statistics results 01441 Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS)); 01442 Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32)); 01443 Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01444 Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01445 01446 // allocate temporary memory to hold the sample to mean distances 01447 Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01448 01449 // initialize the statistics 01450 Statistics->AvgVariance = 1.0; 01451 CoVariance = Statistics->CoVariance; 01452 for (i = 0; i < N; i++) { 01453 Statistics->Min[i] = 0.0; 01454 Statistics->Max[i] = 0.0; 01455 for (j = 0; j < N; j++, CoVariance++) 01456 *CoVariance = 0; 01457 } 01458 // find each sample in the cluster and merge it into the statistics 01459 InitSampleSearch(SearchState, Cluster); 01460 while ((Sample = NextSample (&SearchState)) != NULL) { 01461 for (i = 0; i < N; i++) { 01462 Distance[i] = Sample->Mean[i] - Cluster->Mean[i]; 01463 if (ParamDesc[i].Circular) { 01464 if (Distance[i] > ParamDesc[i].HalfRange) 01465 Distance[i] -= ParamDesc[i].Range; 01466 if (Distance[i] < -ParamDesc[i].HalfRange) 01467 Distance[i] += ParamDesc[i].Range; 01468 } 01469 if (Distance[i] < Statistics->Min[i]) 01470 Statistics->Min[i] = Distance[i]; 01471 if (Distance[i] > Statistics->Max[i]) 01472 Statistics->Max[i] = Distance[i]; 01473 } 01474 CoVariance = Statistics->CoVariance; 01475 for (i = 0; i < N; i++) 01476 for (j = 0; j < N; j++, CoVariance++) 01477 *CoVariance += Distance[i] * Distance[j]; 01478 } 01479 // normalize the variances by the total number of samples 01480 // use SampleCount-1 instead of SampleCount to get an unbiased estimate 01481 // also compute the geometic mean of the diagonal variances 01482 // ensure that clusters with only 1 sample are handled correctly 01483 if (Cluster->SampleCount > 1) 01484 SampleCountAdjustedForBias = Cluster->SampleCount - 1; 01485 else 01486 SampleCountAdjustedForBias = 1; 01487 CoVariance = Statistics->CoVariance; 01488 for (i = 0; i < N; i++) 01489 for (j = 0; j < N; j++, CoVariance++) { 01490 *CoVariance /= SampleCountAdjustedForBias; 01491 if (j == i) { 01492 if (*CoVariance < MINVARIANCE) 01493 *CoVariance = MINVARIANCE; 01494 Statistics->AvgVariance *= *CoVariance; 01495 } 01496 } 01497 Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance, 01498 1.0 / N); 01499 01500 // release temporary memory and return 01501 memfree(Distance); 01502 return (Statistics); 01503 } // ComputeStatistics 01504 01505 01519 PROTOTYPE *NewSphericalProto(uinT16 N, 01520 CLUSTER *Cluster, 01521 STATISTICS *Statistics) { 01522 PROTOTYPE *Proto; 01523 01524 Proto = NewSimpleProto (N, Cluster); 01525 01526 Proto->Variance.Spherical = Statistics->AvgVariance; 01527 if (Proto->Variance.Spherical < MINVARIANCE) 01528 Proto->Variance.Spherical = MINVARIANCE; 01529 01530 Proto->Magnitude.Spherical = 01531 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical)); 01532 Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical, 01533 (double) N); 01534 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; 01535 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01536 01537 return (Proto); 01538 } // NewSphericalProto 01539 01540 01553 PROTOTYPE *NewEllipticalProto(inT16 N, 01554 CLUSTER *Cluster, 01555 STATISTICS *Statistics) { 01556 PROTOTYPE *Proto; 01557 FLOAT32 *CoVariance; 01558 int i; 01559 01560 Proto = NewSimpleProto (N, Cluster); 01561 Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01562 Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01563 Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01564 01565 CoVariance = Statistics->CoVariance; 01566 Proto->TotalMagnitude = 1.0; 01567 for (i = 0; i < N; i++, CoVariance += N + 1) { 01568 Proto->Variance.Elliptical[i] = *CoVariance; 01569 if (Proto->Variance.Elliptical[i] < MINVARIANCE) 01570 Proto->Variance.Elliptical[i] = MINVARIANCE; 01571 01572 Proto->Magnitude.Elliptical[i] = 01573 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i])); 01574 Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i]; 01575 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01576 } 01577 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01578 Proto->Style = elliptical; 01579 return (Proto); 01580 } // NewEllipticalProto 01581 01582 01598 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) { 01599 PROTOTYPE *Proto; 01600 int i; 01601 01602 Proto = NewEllipticalProto (N, Cluster, Statistics); 01603 Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION)); 01604 01605 for (i = 0; i < N; i++) { 01606 Proto->Distrib[i] = normal; 01607 } 01608 Proto->Style = mixed; 01609 return (Proto); 01610 } // NewMixedProto 01611 01612 01623 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) { 01624 PROTOTYPE *Proto; 01625 int i; 01626 01627 Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE)); 01628 Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01629 01630 for (i = 0; i < N; i++) 01631 Proto->Mean[i] = Cluster->Mean[i]; 01632 Proto->Distrib = NULL; 01633 01634 Proto->Significant = TRUE; 01635 Proto->Merged = FALSE; 01636 Proto->Style = spherical; 01637 Proto->NumSamples = Cluster->SampleCount; 01638 Proto->Cluster = Cluster; 01639 Proto->Cluster->Prototype = TRUE; 01640 return (Proto); 01641 } // NewSimpleProto 01642 01643 01664 BOOL8 01665 Independent (PARAM_DESC ParamDesc[], 01666 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) { 01667 int i, j; 01668 FLOAT32 *VARii; // points to ith on-diagonal element 01669 FLOAT32 *VARjj; // points to jth on-diagonal element 01670 FLOAT32 CorrelationCoeff; 01671 01672 VARii = CoVariance; 01673 for (i = 0; i < N; i++, VARii += N + 1) { 01674 if (ParamDesc[i].NonEssential) 01675 continue; 01676 01677 VARjj = VARii + N + 1; 01678 CoVariance = VARii + 1; 01679 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) { 01680 if (ParamDesc[j].NonEssential) 01681 continue; 01682 01683 if ((*VARii == 0.0) || (*VARjj == 0.0)) 01684 CorrelationCoeff = 0.0; 01685 else 01686 CorrelationCoeff = 01687 sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj))); 01688 if (CorrelationCoeff > Independence) 01689 return (FALSE); 01690 } 01691 } 01692 return (TRUE); 01693 } // Independent 01694 01695 01713 BUCKETS *GetBuckets(CLUSTERER* clusterer, 01714 DISTRIBUTION Distribution, 01715 uinT32 SampleCount, 01716 FLOAT64 Confidence) { 01717 // Get an old bucket structure with the same number of buckets. 01718 uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); 01719 BUCKETS *Buckets = 01720 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS]; 01721 01722 // If a matching bucket structure is not found, make one and save it. 01723 if (Buckets == NULL) { 01724 Buckets = MakeBuckets(Distribution, SampleCount, Confidence); 01725 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = 01726 Buckets; 01727 } else { 01728 // Just adjust the existing buckets. 01729 if (SampleCount != Buckets->SampleCount) 01730 AdjustBuckets(Buckets, SampleCount); 01731 if (Confidence != Buckets->Confidence) { 01732 Buckets->Confidence = Confidence; 01733 Buckets->ChiSquared = ComputeChiSquared( 01734 DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), 01735 Confidence); 01736 } 01737 InitBuckets(Buckets); 01738 } 01739 return Buckets; 01740 } // GetBuckets 01741 01742 01761 BUCKETS *MakeBuckets(DISTRIBUTION Distribution, 01762 uinT32 SampleCount, 01763 FLOAT64 Confidence) { 01764 const DENSITYFUNC DensityFunction[] = 01765 { NormalDensity, UniformDensity, UniformDensity }; 01766 int i, j; 01767 BUCKETS *Buckets; 01768 FLOAT64 BucketProbability; 01769 FLOAT64 NextBucketBoundary; 01770 FLOAT64 Probability; 01771 FLOAT64 ProbabilityDelta; 01772 FLOAT64 LastProbDensity; 01773 FLOAT64 ProbDensity; 01774 uinT16 CurrentBucket; 01775 BOOL8 Symmetrical; 01776 01777 // allocate memory needed for data structure 01778 Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS))); 01779 Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); 01780 Buckets->SampleCount = SampleCount; 01781 Buckets->Confidence = Confidence; 01782 Buckets->Count = reinterpret_cast<uinT32*>( 01783 Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32))); 01784 Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>( 01785 Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32))); 01786 01787 // initialize simple fields 01788 Buckets->Distribution = Distribution; 01789 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 01790 Buckets->Count[i] = 0; 01791 Buckets->ExpectedCount[i] = 0.0; 01792 } 01793 01794 // all currently defined distributions are symmetrical 01795 Symmetrical = TRUE; 01796 Buckets->ChiSquared = ComputeChiSquared( 01797 DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence); 01798 01799 if (Symmetrical) { 01800 // allocate buckets so that all have approx. equal probability 01801 BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets); 01802 01803 // distribution is symmetric so fill in upper half then copy 01804 CurrentBucket = Buckets->NumberOfBuckets / 2; 01805 if (Odd (Buckets->NumberOfBuckets)) 01806 NextBucketBoundary = BucketProbability / 2; 01807 else 01808 NextBucketBoundary = BucketProbability; 01809 01810 Probability = 0.0; 01811 LastProbDensity = 01812 (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2); 01813 for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) { 01814 ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1); 01815 ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0); 01816 Probability += ProbabilityDelta; 01817 if (Probability > NextBucketBoundary) { 01818 if (CurrentBucket < Buckets->NumberOfBuckets - 1) 01819 CurrentBucket++; 01820 NextBucketBoundary += BucketProbability; 01821 } 01822 Buckets->Bucket[i] = CurrentBucket; 01823 Buckets->ExpectedCount[CurrentBucket] += 01824 (FLOAT32) (ProbabilityDelta * SampleCount); 01825 LastProbDensity = ProbDensity; 01826 } 01827 // place any leftover probability into the last bucket 01828 Buckets->ExpectedCount[CurrentBucket] += 01829 (FLOAT32) ((0.5 - Probability) * SampleCount); 01830 01831 // copy upper half of distribution to lower half 01832 for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) 01833 Buckets->Bucket[i] = 01834 Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets); 01835 01836 // copy upper half of expected counts to lower half 01837 for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) 01838 Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j]; 01839 } 01840 return Buckets; 01841 } // MakeBuckets 01842 01843 01859 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) { 01860 uinT8 Last, Next; 01861 FLOAT32 Slope; 01862 01863 if (SampleCount < kCountTable[0]) 01864 return kBucketsTable[0]; 01865 01866 for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) { 01867 if (SampleCount <= kCountTable[Next]) { 01868 Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) / 01869 (FLOAT32) (kCountTable[Next] - kCountTable[Last]); 01870 return ((uinT16) (kBucketsTable[Last] + 01871 Slope * (SampleCount - kCountTable[Last]))); 01872 } 01873 } 01874 return kBucketsTable[Last]; 01875 } // OptimumNumberOfBuckets 01876 01877 01896 FLOAT64 01897 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha) 01898 #define CHIACCURACY 0.01 01899 #define MINALPHA (1e-200) 01900 { 01901 static LIST ChiWith[MAXDEGREESOFFREEDOM + 1]; 01902 01903 CHISTRUCT *OldChiSquared; 01904 CHISTRUCT SearchKey; 01905 01906 // limit the minimum alpha that can be used - if alpha is too small 01907 // it may not be possible to compute chi-squared. 01908 Alpha = ClipToRange(Alpha, MINALPHA, 1.0); 01909 if (Odd (DegreesOfFreedom)) 01910 DegreesOfFreedom++; 01911 01912 /* find the list of chi-squared values which have already been computed 01913 for the specified number of degrees of freedom. Search the list for 01914 the desired chi-squared. */ 01915 SearchKey.Alpha = Alpha; 01916 OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom], 01917 &SearchKey, AlphaMatch)); 01918 01919 if (OldChiSquared == NULL) { 01920 OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha); 01921 OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared, 01922 (FLOAT64) DegreesOfFreedom, 01923 (FLOAT64) CHIACCURACY); 01924 ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom], 01925 OldChiSquared); 01926 } 01927 else { 01928 // further optimization might move OldChiSquared to front of list 01929 } 01930 01931 return (OldChiSquared->ChiSquared); 01932 01933 } // ComputeChiSquared 01934 01935 01951 FLOAT64 NormalDensity(inT32 x) { 01952 FLOAT64 Distance; 01953 01954 Distance = x - kNormalMean; 01955 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance); 01956 } // NormalDensity 01957 01958 01968 FLOAT64 UniformDensity(inT32 x) { 01969 static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE; 01970 01971 if ((x >= 0.0) && (x <= BUCKETTABLESIZE)) 01972 return UniformDistributionDensity; 01973 else 01974 return (FLOAT64) 0.0; 01975 } // UniformDensity 01976 01977 01988 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) { 01989 return (f1 + f2) * Dx / 2.0; 01990 } // Integral 01991 01992 02015 void FillBuckets(BUCKETS *Buckets, 02016 CLUSTER *Cluster, 02017 uinT16 Dim, 02018 PARAM_DESC *ParamDesc, 02019 FLOAT32 Mean, 02020 FLOAT32 StdDev) { 02021 uinT16 BucketID; 02022 int i; 02023 LIST SearchState; 02024 SAMPLE *Sample; 02025 02026 // initialize the histogram bucket counts to 0 02027 for (i = 0; i < Buckets->NumberOfBuckets; i++) 02028 Buckets->Count[i] = 0; 02029 02030 if (StdDev == 0.0) { 02031 /* if the standard deviation is zero, then we can't statistically 02032 analyze the cluster. Use a pseudo-analysis: samples exactly on 02033 the mean are distributed evenly across all buckets. Samples greater 02034 than the mean are placed in the last bucket; samples less than the 02035 mean are placed in the first bucket. */ 02036 02037 InitSampleSearch(SearchState, Cluster); 02038 i = 0; 02039 while ((Sample = NextSample (&SearchState)) != NULL) { 02040 if (Sample->Mean[Dim] > Mean) 02041 BucketID = Buckets->NumberOfBuckets - 1; 02042 else if (Sample->Mean[Dim] < Mean) 02043 BucketID = 0; 02044 else 02045 BucketID = i; 02046 Buckets->Count[BucketID] += 1; 02047 i++; 02048 if (i >= Buckets->NumberOfBuckets) 02049 i = 0; 02050 } 02051 } 02052 else { 02053 // search for all samples in the cluster and add to histogram buckets 02054 InitSampleSearch(SearchState, Cluster); 02055 while ((Sample = NextSample (&SearchState)) != NULL) { 02056 switch (Buckets->Distribution) { 02057 case normal: 02058 BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim], 02059 Mean, StdDev); 02060 break; 02061 case D_random: 02062 case uniform: 02063 BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim], 02064 Mean, StdDev); 02065 break; 02066 default: 02067 BucketID = 0; 02068 } 02069 Buckets->Count[Buckets->Bucket[BucketID]] += 1; 02070 } 02071 } 02072 } // FillBuckets 02073 02074 02088 uinT16 NormalBucket(PARAM_DESC *ParamDesc, 02089 FLOAT32 x, 02090 FLOAT32 Mean, 02091 FLOAT32 StdDev) { 02092 FLOAT32 X; 02093 02094 // wraparound circular parameters if necessary 02095 if (ParamDesc->Circular) { 02096 if (x - Mean > ParamDesc->HalfRange) 02097 x -= ParamDesc->Range; 02098 else if (x - Mean < -ParamDesc->HalfRange) 02099 x += ParamDesc->Range; 02100 } 02101 02102 X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean; 02103 if (X < 0) 02104 return 0; 02105 if (X > BUCKETTABLESIZE - 1) 02106 return ((uinT16) (BUCKETTABLESIZE - 1)); 02107 return (uinT16) floor((FLOAT64) X); 02108 } // NormalBucket 02109 02110 02124 uinT16 UniformBucket(PARAM_DESC *ParamDesc, 02125 FLOAT32 x, 02126 FLOAT32 Mean, 02127 FLOAT32 StdDev) { 02128 FLOAT32 X; 02129 02130 // wraparound circular parameters if necessary 02131 if (ParamDesc->Circular) { 02132 if (x - Mean > ParamDesc->HalfRange) 02133 x -= ParamDesc->Range; 02134 else if (x - Mean < -ParamDesc->HalfRange) 02135 x += ParamDesc->Range; 02136 } 02137 02138 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0); 02139 if (X < 0) 02140 return 0; 02141 if (X > BUCKETTABLESIZE - 1) 02142 return (uinT16) (BUCKETTABLESIZE - 1); 02143 return (uinT16) floor((FLOAT64) X); 02144 } // UniformBucket 02145 02146 02159 BOOL8 DistributionOK(BUCKETS *Buckets) { 02160 FLOAT32 FrequencyDifference; 02161 FLOAT32 TotalDifference; 02162 int i; 02163 02164 // compute how well the histogram matches the expected histogram 02165 TotalDifference = 0.0; 02166 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02167 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i]; 02168 TotalDifference += (FrequencyDifference * FrequencyDifference) / 02169 Buckets->ExpectedCount[i]; 02170 } 02171 02172 // test to see if the difference is more than expected 02173 if (TotalDifference > Buckets->ChiSquared) 02174 return FALSE; 02175 else 02176 return TRUE; 02177 } // DistributionOK 02178 02179 02188 void FreeStatistics(STATISTICS *Statistics) { 02189 memfree (Statistics->CoVariance); 02190 memfree (Statistics->Min); 02191 memfree (Statistics->Max); 02192 memfree(Statistics); 02193 } // FreeStatistics 02194 02195 02201 void FreeBuckets(BUCKETS *buckets) { 02202 Efree(buckets->Count); 02203 Efree(buckets->ExpectedCount); 02204 Efree(buckets); 02205 } // FreeBuckets 02206 02207 02220 void FreeCluster(CLUSTER *Cluster) { 02221 if (Cluster != NULL) { 02222 FreeCluster (Cluster->Left); 02223 FreeCluster (Cluster->Right); 02224 memfree(Cluster); 02225 } 02226 } // FreeCluster 02227 02228 02243 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) { 02244 static uinT8 DegreeOffsets[] = { 3, 3, 1 }; 02245 02246 uinT16 AdjustedNumBuckets; 02247 02248 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution]; 02249 if (Odd (AdjustedNumBuckets)) 02250 AdjustedNumBuckets++; 02251 return (AdjustedNumBuckets); 02252 02253 } // DegreesOfFreedom 02254 02255 02266 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram, 02267 void *arg2) { // uinT16 *DesiredNumberOfBuckets) 02268 BUCKETS *Histogram = (BUCKETS *) arg1; 02269 uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2; 02270 02271 return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets); 02272 02273 } // NumBucketsMatch 02274 02275 02284 int ListEntryMatch(void *arg1, //ListNode 02285 void *arg2) { //Key 02286 return (arg1 == arg2); 02287 02288 } // ListEntryMatch 02289 02290 02301 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) { 02302 int i; 02303 FLOAT64 AdjustFactor; 02304 02305 AdjustFactor = (((FLOAT64) NewSampleCount) / 02306 ((FLOAT64) Buckets->SampleCount)); 02307 02308 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02309 Buckets->ExpectedCount[i] *= AdjustFactor; 02310 } 02311 02312 Buckets->SampleCount = NewSampleCount; 02313 02314 } // AdjustBuckets 02315 02316 02325 void InitBuckets(BUCKETS *Buckets) { 02326 int i; 02327 02328 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02329 Buckets->Count[i] = 0; 02330 } 02331 02332 } // InitBuckets 02333 02334 02349 int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct, 02350 void *arg2) { //CHISTRUCT *SearchKey) 02351 CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1; 02352 CHISTRUCT *SearchKey = (CHISTRUCT *) arg2; 02353 02354 return (ChiStruct->Alpha == SearchKey->Alpha); 02355 02356 } // AlphaMatch 02357 02358 02370 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) { 02371 CHISTRUCT *NewChiStruct; 02372 02373 NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT)); 02374 NewChiStruct->DegreesOfFreedom = DegreesOfFreedom; 02375 NewChiStruct->Alpha = Alpha; 02376 return (NewChiStruct); 02377 02378 } // NewChiStruct 02379 02380 02396 FLOAT64 02397 Solve (SOLVEFUNC Function, 02398 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy) 02399 #define INITIALDELTA 0.1 02400 #define DELTARATIO 0.1 02401 { 02402 FLOAT64 x; 02403 FLOAT64 f; 02404 FLOAT64 Slope; 02405 FLOAT64 Delta; 02406 FLOAT64 NewDelta; 02407 FLOAT64 xDelta; 02408 FLOAT64 LastPosX, LastNegX; 02409 02410 x = InitialGuess; 02411 Delta = INITIALDELTA; 02412 LastPosX = MAX_FLOAT32; 02413 LastNegX = -MAX_FLOAT32; 02414 f = (*Function) ((CHISTRUCT *) FunctionParams, x); 02415 while (Abs (LastPosX - LastNegX) > Accuracy) { 02416 // keep track of outer bounds of current estimate 02417 if (f < 0) 02418 LastNegX = x; 02419 else 02420 LastPosX = x; 02421 02422 // compute the approx. slope of f(x) at the current point 02423 Slope = 02424 ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta; 02425 02426 // compute the next solution guess */ 02427 xDelta = f / Slope; 02428 x -= xDelta; 02429 02430 // reduce the delta used for computing slope to be a fraction of 02431 //the amount moved to get to the new guess 02432 NewDelta = Abs (xDelta) * DELTARATIO; 02433 if (NewDelta < Delta) 02434 Delta = NewDelta; 02435 02436 // compute the value of the function at the new guess 02437 f = (*Function) ((CHISTRUCT *) FunctionParams, x); 02438 } 02439 return (x); 02440 02441 } // Solve 02442 02443 02464 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) { 02465 int i, N; 02466 FLOAT64 SeriesTotal; 02467 FLOAT64 Denominator; 02468 FLOAT64 PowerOfx; 02469 02470 N = ChiParams->DegreesOfFreedom / 2 - 1; 02471 SeriesTotal = 1; 02472 Denominator = 1; 02473 PowerOfx = 1; 02474 for (i = 1; i <= N; i++) { 02475 Denominator *= 2 * i; 02476 PowerOfx *= x; 02477 SeriesTotal += PowerOfx / Denominator; 02478 } 02479 return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha); 02480 02481 } // ChiArea 02482 02483 02511 BOOL8 02512 MultipleCharSamples (CLUSTERER * Clusterer, 02513 CLUSTER * Cluster, FLOAT32 MaxIllegal) 02514 #define ILLEGAL_CHAR 2 02515 { 02516 static BOOL8 *CharFlags = NULL; 02517 static inT32 NumFlags = 0; 02518 int i; 02519 LIST SearchState; 02520 SAMPLE *Sample; 02521 inT32 CharID; 02522 inT32 NumCharInCluster; 02523 inT32 NumIllegalInCluster; 02524 FLOAT32 PercentIllegal; 02525 02526 // initial estimate assumes that no illegal chars exist in the cluster 02527 NumCharInCluster = Cluster->SampleCount; 02528 NumIllegalInCluster = 0; 02529 02530 if (Clusterer->NumChar > NumFlags) { 02531 if (CharFlags != NULL) 02532 memfree(CharFlags); 02533 NumFlags = Clusterer->NumChar; 02534 CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8)); 02535 } 02536 02537 for (i = 0; i < NumFlags; i++) 02538 CharFlags[i] = FALSE; 02539 02540 // find each sample in the cluster and check if we have seen it before 02541 InitSampleSearch(SearchState, Cluster); 02542 while ((Sample = NextSample (&SearchState)) != NULL) { 02543 CharID = Sample->CharID; 02544 if (CharFlags[CharID] == FALSE) { 02545 CharFlags[CharID] = TRUE; 02546 } 02547 else { 02548 if (CharFlags[CharID] == TRUE) { 02549 NumIllegalInCluster++; 02550 CharFlags[CharID] = ILLEGAL_CHAR; 02551 } 02552 NumCharInCluster--; 02553 PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster; 02554 if (PercentIllegal > MaxIllegal) { 02555 destroy(SearchState); 02556 return (TRUE); 02557 } 02558 } 02559 } 02560 return (FALSE); 02561 02562 } // MultipleCharSamples 02563 02569 double InvertMatrix(const float* input, int size, float* inv) { 02570 // Allocate memory for the 2D arrays. 02571 GENERIC_2D_ARRAY<double> U(size, size, 0.0); 02572 GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0); 02573 GENERIC_2D_ARRAY<double> L(size, size, 0.0); 02574 02575 // Initialize the working matrices. U starts as input, L as I and U_inv as O. 02576 int row; 02577 int col; 02578 for (row = 0; row < size; row++) { 02579 for (col = 0; col < size; col++) { 02580 U[row][col] = input[row*size + col]; 02581 L[row][col] = row == col ? 1.0 : 0.0; 02582 U_inv[row][col] = 0.0; 02583 } 02584 } 02585 02586 // Compute forward matrix by inversion by LU decomposition of input. 02587 for (col = 0; col < size; ++col) { 02588 // Find best pivot 02589 int best_row = 0; 02590 double best_pivot = -1.0; 02591 for (row = col; row < size; ++row) { 02592 if (Abs(U[row][col]) > best_pivot) { 02593 best_pivot = Abs(U[row][col]); 02594 best_row = row; 02595 } 02596 } 02597 // Exchange pivot rows. 02598 if (best_row != col) { 02599 for (int k = 0; k < size; ++k) { 02600 double tmp = U[best_row][k]; 02601 U[best_row][k] = U[col][k]; 02602 U[col][k] = tmp; 02603 tmp = L[best_row][k]; 02604 L[best_row][k] = L[col][k]; 02605 L[col][k] = tmp; 02606 } 02607 } 02608 // Now do the pivot itself. 02609 for (row = col + 1; row < size; ++row) { 02610 double ratio = -U[row][col] / U[col][col]; 02611 for (int j = col; j < size; ++j) { 02612 U[row][j] += U[col][j] * ratio; 02613 } 02614 for (int k = 0; k < size; ++k) { 02615 L[row][k] += L[col][k] * ratio; 02616 } 02617 } 02618 } 02619 // Next invert U. 02620 for (col = 0; col < size; ++col) { 02621 U_inv[col][col] = 1.0 / U[col][col]; 02622 for (row = col - 1; row >= 0; --row) { 02623 double total = 0.0; 02624 for (int k = col; k > row; --k) { 02625 total += U[row][k] * U_inv[k][col]; 02626 } 02627 U_inv[row][col] = -total / U[row][row]; 02628 } 02629 } 02630 // Now the answer is U_inv.L. 02631 for (row = 0; row < size; row++) { 02632 for (col = 0; col < size; col++) { 02633 double sum = 0.0; 02634 for (int k = row; k < size; ++k) { 02635 sum += U_inv[row][k] * L[k][col]; 02636 } 02637 inv[row*size + col] = sum; 02638 } 02639 } 02640 // Check matrix product. 02641 double error_sum = 0.0; 02642 for (row = 0; row < size; row++) { 02643 for (col = 0; col < size; col++) { 02644 double sum = 0.0; 02645 for (int k = 0; k < size; ++k) { 02646 sum += input[row*size + k] * inv[k *size + col]; 02647 } 02648 if (row != col) { 02649 error_sum += Abs(sum); 02650 } 02651 } 02652 } 02653 return error_sum; 02654 }