tesseract 3.04.01

classify/cluster.cpp

Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:       cluster.c
00003  **     Purpose:        Routines for clustering points in N-D space
00004  **     Author:         Dan Johnson
00005  **     History:        5/29/89, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 #include "const.h"
00019 #include "cluster.h"
00020 #include "emalloc.h"
00021 #include "genericheap.h"
00022 #include "helpers.h"
00023 #include "kdpair.h"
00024 #include "matrix.h"
00025 #include "tprintf.h"
00026 #include "danerror.h"
00027 #include "freelist.h"
00028 #include <math.h>
00029 
00030 #define HOTELLING 1  // If true use Hotelling's test to decide where to split.
00031 #define FTABLE_X 10  // Size of FTable.
00032 #define FTABLE_Y 100  // Size of FTable.
00033 
00034 // Table of values approximating the cumulative F-distribution for a confidence of 1%.
00035 const double FTable[FTABLE_Y][FTABLE_X] = {
00036  {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
00037   {98.502,  99.000,  99.166,  99.249,  99.300,  99.333,  99.356,  99.374,  99.388,  99.399,},
00038   {34.116,  30.816,  29.457,  28.710,  28.237,  27.911,  27.672,  27.489,  27.345,  27.229,},
00039   {21.198,  18.000,  16.694,  15.977,  15.522,  15.207,  14.976,  14.799,  14.659,  14.546,},
00040   {16.258,  13.274,  12.060,  11.392,  10.967,  10.672,  10.456,  10.289,  10.158,  10.051,},
00041   {13.745,  10.925,   9.780,   9.148,   8.746,   8.466,   8.260,   8.102,   7.976,   7.874,},
00042   {12.246,   9.547,   8.451,   7.847,   7.460,   7.191,   6.993,   6.840,   6.719,   6.620,},
00043   {11.259,   8.649,   7.591,   7.006,   6.632,   6.371,   6.178,   6.029,   5.911,   5.814,},
00044   {10.561,   8.022,   6.992,   6.422,   6.057,   5.802,   5.613,   5.467,   5.351,   5.257,},
00045   {10.044,   7.559,   6.552,   5.994,   5.636,   5.386,   5.200,   5.057,   4.942,   4.849,},
00046   { 9.646,   7.206,   6.217,   5.668,   5.316,   5.069,   4.886,   4.744,   4.632,   4.539,},
00047   { 9.330,   6.927,   5.953,   5.412,   5.064,   4.821,   4.640,   4.499,   4.388,   4.296,},
00048   { 9.074,   6.701,   5.739,   5.205,   4.862,   4.620,   4.441,   4.302,   4.191,   4.100,},
00049   { 8.862,   6.515,   5.564,   5.035,   4.695,   4.456,   4.278,   4.140,   4.030,   3.939,},
00050   { 8.683,   6.359,   5.417,   4.893,   4.556,   4.318,   4.142,   4.004,   3.895,   3.805,},
00051   { 8.531,   6.226,   5.292,   4.773,   4.437,   4.202,   4.026,   3.890,   3.780,   3.691,},
00052   { 8.400,   6.112,   5.185,   4.669,   4.336,   4.102,   3.927,   3.791,   3.682,   3.593,},
00053   { 8.285,   6.013,   5.092,   4.579,   4.248,   4.015,   3.841,   3.705,   3.597,   3.508,},
00054   { 8.185,   5.926,   5.010,   4.500,   4.171,   3.939,   3.765,   3.631,   3.523,   3.434,},
00055   { 8.096,   5.849,   4.938,   4.431,   4.103,   3.871,   3.699,   3.564,   3.457,   3.368,},
00056   { 8.017,   5.780,   4.874,   4.369,   4.042,   3.812,   3.640,   3.506,   3.398,   3.310,},
00057   { 7.945,   5.719,   4.817,   4.313,   3.988,   3.758,   3.587,   3.453,   3.346,   3.258,},
00058   { 7.881,   5.664,   4.765,   4.264,   3.939,   3.710,   3.539,   3.406,   3.299,   3.211,},
00059   { 7.823,   5.614,   4.718,   4.218,   3.895,   3.667,   3.496,   3.363,   3.256,   3.168,},
00060   { 7.770,   5.568,   4.675,   4.177,   3.855,   3.627,   3.457,   3.324,   3.217,   3.129,},
00061   { 7.721,   5.526,   4.637,   4.140,   3.818,   3.591,   3.421,   3.288,   3.182,   3.094,},
00062   { 7.677,   5.488,   4.601,   4.106,   3.785,   3.558,   3.388,   3.256,   3.149,   3.062,},
00063   { 7.636,   5.453,   4.568,   4.074,   3.754,   3.528,   3.358,   3.226,   3.120,   3.032,},
00064   { 7.598,   5.420,   4.538,   4.045,   3.725,   3.499,   3.330,   3.198,   3.092,   3.005,},
00065   { 7.562,   5.390,   4.510,   4.018,   3.699,   3.473,   3.305,   3.173,   3.067,   2.979,},
00066   { 7.530,   5.362,   4.484,   3.993,   3.675,   3.449,   3.281,   3.149,   3.043,   2.955,},
00067   { 7.499,   5.336,   4.459,   3.969,   3.652,   3.427,   3.258,   3.127,   3.021,   2.934,},
00068   { 7.471,   5.312,   4.437,   3.948,   3.630,   3.406,   3.238,   3.106,   3.000,   2.913,},
00069   { 7.444,   5.289,   4.416,   3.927,   3.611,   3.386,   3.218,   3.087,   2.981,   2.894,},
00070   { 7.419,   5.268,   4.396,   3.908,   3.592,   3.368,   3.200,   3.069,   2.963,   2.876,},
00071   { 7.396,   5.248,   4.377,   3.890,   3.574,   3.351,   3.183,   3.052,   2.946,   2.859,},
00072   { 7.373,   5.229,   4.360,   3.873,   3.558,   3.334,   3.167,   3.036,   2.930,   2.843,},
00073   { 7.353,   5.211,   4.343,   3.858,   3.542,   3.319,   3.152,   3.021,   2.915,   2.828,},
00074   { 7.333,   5.194,   4.327,   3.843,   3.528,   3.305,   3.137,   3.006,   2.901,   2.814,},
00075   { 7.314,   5.179,   4.313,   3.828,   3.514,   3.291,   3.124,   2.993,   2.888,   2.801,},
00076   { 7.296,   5.163,   4.299,   3.815,   3.501,   3.278,   3.111,   2.980,   2.875,   2.788,},
00077   { 7.280,   5.149,   4.285,   3.802,   3.488,   3.266,   3.099,   2.968,   2.863,   2.776,},
00078   { 7.264,   5.136,   4.273,   3.790,   3.476,   3.254,   3.087,   2.957,   2.851,   2.764,},
00079   { 7.248,   5.123,   4.261,   3.778,   3.465,   3.243,   3.076,   2.946,   2.840,   2.754,},
00080   { 7.234,   5.110,   4.249,   3.767,   3.454,   3.232,   3.066,   2.935,   2.830,   2.743,},
00081   { 7.220,   5.099,   4.238,   3.757,   3.444,   3.222,   3.056,   2.925,   2.820,   2.733,},
00082   { 7.207,   5.087,   4.228,   3.747,   3.434,   3.213,   3.046,   2.916,   2.811,   2.724,},
00083   { 7.194,   5.077,   4.218,   3.737,   3.425,   3.204,   3.037,   2.907,   2.802,   2.715,},
00084   { 7.182,   5.066,   4.208,   3.728,   3.416,   3.195,   3.028,   2.898,   2.793,   2.706,},
00085   { 7.171,   5.057,   4.199,   3.720,   3.408,   3.186,   3.020,   2.890,   2.785,   2.698,},
00086   { 7.159,   5.047,   4.191,   3.711,   3.400,   3.178,   3.012,   2.882,   2.777,   2.690,},
00087   { 7.149,   5.038,   4.182,   3.703,   3.392,   3.171,   3.005,   2.874,   2.769,   2.683,},
00088   { 7.139,   5.030,   4.174,   3.695,   3.384,   3.163,   2.997,   2.867,   2.762,   2.675,},
00089   { 7.129,   5.021,   4.167,   3.688,   3.377,   3.156,   2.990,   2.860,   2.755,   2.668,},
00090   { 7.119,   5.013,   4.159,   3.681,   3.370,   3.149,   2.983,   2.853,   2.748,   2.662,},
00091   { 7.110,   5.006,   4.152,   3.674,   3.363,   3.143,   2.977,   2.847,   2.742,   2.655,},
00092   { 7.102,   4.998,   4.145,   3.667,   3.357,   3.136,   2.971,   2.841,   2.736,   2.649,},
00093   { 7.093,   4.991,   4.138,   3.661,   3.351,   3.130,   2.965,   2.835,   2.730,   2.643,},
00094   { 7.085,   4.984,   4.132,   3.655,   3.345,   3.124,   2.959,   2.829,   2.724,   2.637,},
00095   { 7.077,   4.977,   4.126,   3.649,   3.339,   3.119,   2.953,   2.823,   2.718,   2.632,},
00096   { 7.070,   4.971,   4.120,   3.643,   3.333,   3.113,   2.948,   2.818,   2.713,   2.626,},
00097   { 7.062,   4.965,   4.114,   3.638,   3.328,   3.108,   2.942,   2.813,   2.708,   2.621,},
00098   { 7.055,   4.959,   4.109,   3.632,   3.323,   3.103,   2.937,   2.808,   2.703,   2.616,},
00099   { 7.048,   4.953,   4.103,   3.627,   3.318,   3.098,   2.932,   2.803,   2.698,   2.611,},
00100   { 7.042,   4.947,   4.098,   3.622,   3.313,   3.093,   2.928,   2.798,   2.693,   2.607,},
00101   { 7.035,   4.942,   4.093,   3.618,   3.308,   3.088,   2.923,   2.793,   2.689,   2.602,},
00102   { 7.029,   4.937,   4.088,   3.613,   3.304,   3.084,   2.919,   2.789,   2.684,   2.598,},
00103   { 7.023,   4.932,   4.083,   3.608,   3.299,   3.080,   2.914,   2.785,   2.680,   2.593,},
00104   { 7.017,   4.927,   4.079,   3.604,   3.295,   3.075,   2.910,   2.781,   2.676,   2.589,},
00105   { 7.011,   4.922,   4.074,   3.600,   3.291,   3.071,   2.906,   2.777,   2.672,   2.585,},
00106   { 7.006,   4.917,   4.070,   3.596,   3.287,   3.067,   2.902,   2.773,   2.668,   2.581,},
00107   { 7.001,   4.913,   4.066,   3.591,   3.283,   3.063,   2.898,   2.769,   2.664,   2.578,},
00108   { 6.995,   4.908,   4.062,   3.588,   3.279,   3.060,   2.895,   2.765,   2.660,   2.574,},
00109   { 6.990,   4.904,   4.058,   3.584,   3.275,   3.056,   2.891,   2.762,   2.657,   2.570,},
00110   { 6.985,   4.900,   4.054,   3.580,   3.272,   3.052,   2.887,   2.758,   2.653,   2.567,},
00111   { 6.981,   4.896,   4.050,   3.577,   3.268,   3.049,   2.884,   2.755,   2.650,   2.563,},
00112   { 6.976,   4.892,   4.047,   3.573,   3.265,   3.046,   2.881,   2.751,   2.647,   2.560,},
00113   { 6.971,   4.888,   4.043,   3.570,   3.261,   3.042,   2.877,   2.748,   2.644,   2.557,},
00114   { 6.967,   4.884,   4.040,   3.566,   3.258,   3.039,   2.874,   2.745,   2.640,   2.554,},
00115   { 6.963,   4.881,   4.036,   3.563,   3.255,   3.036,   2.871,   2.742,   2.637,   2.551,},
00116   { 6.958,   4.877,   4.033,   3.560,   3.252,   3.033,   2.868,   2.739,   2.634,   2.548,},
00117   { 6.954,   4.874,   4.030,   3.557,   3.249,   3.030,   2.865,   2.736,   2.632,   2.545,},
00118   { 6.950,   4.870,   4.027,   3.554,   3.246,   3.027,   2.863,   2.733,   2.629,   2.542,},
00119   { 6.947,   4.867,   4.024,   3.551,   3.243,   3.025,   2.860,   2.731,   2.626,   2.539,},
00120   { 6.943,   4.864,   4.021,   3.548,   3.240,   3.022,   2.857,   2.728,   2.623,   2.537,},
00121   { 6.939,   4.861,   4.018,   3.545,   3.238,   3.019,   2.854,   2.725,   2.621,   2.534,},
00122   { 6.935,   4.858,   4.015,   3.543,   3.235,   3.017,   2.852,   2.723,   2.618,   2.532,},
00123   { 6.932,   4.855,   4.012,   3.540,   3.233,   3.014,   2.849,   2.720,   2.616,   2.529,},
00124   { 6.928,   4.852,   4.010,   3.538,   3.230,   3.012,   2.847,   2.718,   2.613,   2.527,},
00125   { 6.925,   4.849,   4.007,   3.535,   3.228,   3.009,   2.845,   2.715,   2.611,   2.524,},
00126   { 6.922,   4.846,   4.004,   3.533,   3.225,   3.007,   2.842,   2.713,   2.609,   2.522,},
00127   { 6.919,   4.844,   4.002,   3.530,   3.223,   3.004,   2.840,   2.711,   2.606,   2.520,},
00128   { 6.915,   4.841,   3.999,   3.528,   3.221,   3.002,   2.838,   2.709,   2.604,   2.518,},
00129   { 6.912,   4.838,   3.997,   3.525,   3.218,   3.000,   2.835,   2.706,   2.602,   2.515,},
00130   { 6.909,   4.836,   3.995,   3.523,   3.216,   2.998,   2.833,   2.704,   2.600,   2.513,},
00131   { 6.906,   4.833,   3.992,   3.521,   3.214,   2.996,   2.831,   2.702,   2.598,   2.511,},
00132   { 6.904,   4.831,   3.990,   3.519,   3.212,   2.994,   2.829,   2.700,   2.596,   2.509,},
00133   { 6.901,   4.829,   3.988,   3.517,   3.210,   2.992,   2.827,   2.698,   2.594,   2.507,},
00134   { 6.898,   4.826,   3.986,   3.515,   3.208,   2.990,   2.825,   2.696,   2.592,   2.505,},
00135   { 6.895,   4.824,   3.984,   3.513,   3.206,   2.988,   2.823,   2.694,   2.590,   2.503}
00136 };
00137 
00142 #define MINVARIANCE     0.0004
00143 
00150 #define MINSAMPLESPERBUCKET 5
00151 #define MINSAMPLES    (MINBUCKETS * MINSAMPLESPERBUCKET)
00152 #define MINSAMPLESNEEDED  1
00153 
00160 #define BUCKETTABLESIZE   1024
00161 #define NORMALEXTENT    3.0
00162 
00163 struct TEMPCLUSTER {
00164   CLUSTER *Cluster;
00165   CLUSTER *Neighbor;
00166 };
00167 
00168 typedef tesseract::KDPairInc<float, TEMPCLUSTER*> ClusterPair;
00169 typedef tesseract::GenericHeap<ClusterPair> ClusterHeap;
00170 
00171 struct STATISTICS {
00172   FLOAT32 AvgVariance;
00173   FLOAT32 *CoVariance;
00174   FLOAT32 *Min;                  // largest negative distance from the mean
00175   FLOAT32 *Max;                  // largest positive distance from the mean
00176 };
00177 
00178 struct BUCKETS {
00179   DISTRIBUTION Distribution;     // distribution being tested for
00180   uinT32 SampleCount;            // # of samples in histogram
00181   FLOAT64 Confidence;            // confidence level of test
00182   FLOAT64 ChiSquared;            // test threshold
00183   uinT16 NumberOfBuckets;        // number of cells in histogram
00184   uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
00185   uinT32 *Count;                 // frequency of occurrence histogram
00186   FLOAT32 *ExpectedCount;        // expected histogram
00187 };
00188 
00189 struct CHISTRUCT{
00190   uinT16 DegreesOfFreedom;
00191   FLOAT64 Alpha;
00192   FLOAT64 ChiSquared;
00193 };
00194 
00195 // For use with KDWalk / MakePotentialClusters
00196 struct ClusteringContext {
00197   ClusterHeap *heap;  // heap used to hold temp clusters, "best" on top
00198   TEMPCLUSTER *candidates;  // array of potential clusters
00199   KDTREE *tree;  // kd-tree to be searched for neighbors
00200   inT32 next;  // next candidate to be used
00201 };
00202 
00203 typedef FLOAT64 (*DENSITYFUNC) (inT32);
00204 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
00205 
00206 #define Odd(N) ((N)%2)
00207 #define Mirror(N,R) ((R) - (N) - 1)
00208 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
00209 
00210 //--------------Global Data Definitions and Declarations----------------------
00218 #define SqrtOf2Pi     2.506628275
00219 static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
00220 static const FLOAT64 kNormalVariance =
00221     (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
00222 static const FLOAT64 kNormalMagnitude =
00223     (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
00224 static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;
00225 
00228 #define LOOKUPTABLESIZE   8
00229 #define MAXDEGREESOFFREEDOM MAXBUCKETS
00230 
00231 static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
00232   MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
00233 };  // number of samples
00234 
00235 static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = {
00236   MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
00237 };  // number of buckets
00238 
00239 /*-------------------------------------------------------------------------
00240           Private Function Prototypes
00241 --------------------------------------------------------------------------*/
00242 void CreateClusterTree(CLUSTERER *Clusterer);
00243 
00244 void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster,
00245                            inT32 Level);
00246 
00247 CLUSTER *FindNearestNeighbor(KDTREE *Tree,
00248                              CLUSTER *Cluster,
00249                              FLOAT32 *Distance);
00250 
00251 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
00252 
00253 inT32 MergeClusters (inT16 N,
00254 register PARAM_DESC ParamDesc[],
00255 register inT32 n1,
00256 register inT32 n2,
00257 register FLOAT32 m[],
00258 register FLOAT32 m1[], register FLOAT32 m2[]);
00259 
00260 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
00261 
00262 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
00263                          CLUSTERCONFIG *Config,
00264                          CLUSTER *Cluster);
00265 
00266 PROTOTYPE *MakeDegenerateProto(uinT16 N,
00267                                CLUSTER *Cluster,
00268                                STATISTICS *Statistics,
00269                                PROTOSTYLE Style,
00270                                inT32 MinSamples);
00271 
00272 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
00273                                CLUSTERCONFIG *Config,
00274                                CLUSTER *Cluster,
00275                                STATISTICS *Statistics);
00276 
00277 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
00278                               CLUSTER *Cluster,
00279                               STATISTICS *Statistics,
00280                               BUCKETS *Buckets);
00281 
00282 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
00283                                CLUSTER *Cluster,
00284                                STATISTICS *Statistics,
00285                                BUCKETS *Buckets);
00286 
00287 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
00288                           CLUSTER *Cluster,
00289                           STATISTICS *Statistics,
00290                           BUCKETS *NormalBuckets,
00291                           FLOAT64 Confidence);
00292 
00293 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
00294 
00295 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
00296 
00297 STATISTICS *ComputeStatistics (inT16 N,
00298 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
00299 
00300 PROTOTYPE *NewSphericalProto(uinT16 N,
00301                              CLUSTER *Cluster,
00302                              STATISTICS *Statistics);
00303 
00304 PROTOTYPE *NewEllipticalProto(inT16 N,
00305                               CLUSTER *Cluster,
00306                               STATISTICS *Statistics);
00307 
00308 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
00309 
00310 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
00311 
00312 BOOL8 Independent (PARAM_DESC ParamDesc[],
00313 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
00314 
00315 BUCKETS *GetBuckets(CLUSTERER* clusterer,
00316                     DISTRIBUTION Distribution,
00317                     uinT32 SampleCount,
00318                     FLOAT64 Confidence);
00319 
00320 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
00321                      uinT32 SampleCount,
00322                      FLOAT64 Confidence);
00323 
00324 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount);
00325 
00326 FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
00327 
00328 FLOAT64 NormalDensity(inT32 x);
00329 
00330 FLOAT64 UniformDensity(inT32 x);
00331 
00332 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx);
00333 
00334 void FillBuckets(BUCKETS *Buckets,
00335                  CLUSTER *Cluster,
00336                  uinT16 Dim,
00337                  PARAM_DESC *ParamDesc,
00338                  FLOAT32 Mean,
00339                  FLOAT32 StdDev);
00340 
00341 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
00342                     FLOAT32 x,
00343                     FLOAT32 Mean,
00344                     FLOAT32 StdDev);
00345 
00346 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
00347                      FLOAT32 x,
00348                      FLOAT32 Mean,
00349                      FLOAT32 StdDev);
00350 
00351 BOOL8 DistributionOK(BUCKETS *Buckets);
00352 
00353 void FreeStatistics(STATISTICS *Statistics);
00354 
00355 void FreeBuckets(BUCKETS *Buckets);
00356 
00357 void FreeCluster(CLUSTER *Cluster);
00358 
00359 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
00360 
00361 int NumBucketsMatch(void *arg1,   // BUCKETS *Histogram,
00362                     void *arg2);  // uinT16 *DesiredNumberOfBuckets);
00363 
00364 int ListEntryMatch(void *arg1, void *arg2);
00365 
00366 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
00367 
00368 void InitBuckets(BUCKETS *Buckets);
00369 
00370 int AlphaMatch(void *arg1,   // CHISTRUCT *ChiStruct,
00371                void *arg2);  // CHISTRUCT *SearchKey);
00372 
00373 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
00374 
00375 FLOAT64 Solve(SOLVEFUNC Function,
00376               void *FunctionParams,
00377               FLOAT64 InitialGuess,
00378               FLOAT64 Accuracy);
00379 
00380 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
00381 
00382 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer,
00383                           CLUSTER *Cluster,
00384                           FLOAT32 MaxIllegal);
00385 
00386 double InvertMatrix(const float* input, int size, float* inv);
00387 
00388 //--------------------------Public Code--------------------------------------
00399 CLUSTERER *
00400 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) {
00401   CLUSTERER *Clusterer;
00402   int i;
00403 
00404   // allocate main clusterer data structure and init simple fields
00405   Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
00406   Clusterer->SampleSize = SampleSize;
00407   Clusterer->NumberOfSamples = 0;
00408   Clusterer->NumChar = 0;
00409 
00410   // init fields which will not be used initially
00411   Clusterer->Root = NULL;
00412   Clusterer->ProtoList = NIL_LIST;
00413 
00414   // maintain a copy of param descriptors in the clusterer data structure
00415   Clusterer->ParamDesc =
00416     (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
00417   for (i = 0; i < SampleSize; i++) {
00418     Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
00419     Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
00420     Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
00421     Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
00422     Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
00423     Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
00424     Clusterer->ParamDesc[i].MidRange =
00425       (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
00426   }
00427 
00428   // allocate a kd tree to hold the samples
00429   Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
00430 
00431   // Initialize cache of histogram buckets to minimize recomputing them.
00432   for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
00433     for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
00434       Clusterer->bucket_cache[d][c] = NULL;
00435   }
00436 
00437   return Clusterer;
00438 }                                // MakeClusterer
00439 
00440 
00457 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature,
00458                    inT32 CharID) {
00459   SAMPLE *Sample;
00460   int i;
00461 
00462   // see if the samples have already been clustered - if so trap an error
00463   if (Clusterer->Root != NULL)
00464     DoError (ALREADYCLUSTERED,
00465       "Can't add samples after they have been clustered");
00466 
00467   // allocate the new sample and initialize it
00468   Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
00469     (Clusterer->SampleSize -
00470     1) * sizeof (FLOAT32));
00471   Sample->Clustered = FALSE;
00472   Sample->Prototype = FALSE;
00473   Sample->SampleCount = 1;
00474   Sample->Left = NULL;
00475   Sample->Right = NULL;
00476   Sample->CharID = CharID;
00477 
00478   for (i = 0; i < Clusterer->SampleSize; i++)
00479     Sample->Mean[i] = Feature[i];
00480 
00481   // add the sample to the KD tree - keep track of the total # of samples
00482   Clusterer->NumberOfSamples++;
00483   KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
00484   if (CharID >= Clusterer->NumChar)
00485     Clusterer->NumChar = CharID + 1;
00486 
00487   // execute hook for monitoring clustering operation
00488   // (*SampleCreationHook)( Sample );
00489 
00490   return (Sample);
00491 }                                // MakeSample
00492 
00493 
00515 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00516   //only create cluster tree if samples have never been clustered before
00517   if (Clusterer->Root == NULL)
00518     CreateClusterTree(Clusterer);
00519 
00520   //deallocate the old prototype list if one exists
00521   FreeProtoList (&Clusterer->ProtoList);
00522   Clusterer->ProtoList = NIL_LIST;
00523 
00524   //compute prototypes starting at the root node in the tree
00525   ComputePrototypes(Clusterer, Config);
00526   return (Clusterer->ProtoList);
00527 }                                // ClusterSamples
00528 
00529 
00543 void FreeClusterer(CLUSTERER *Clusterer) {
00544   if (Clusterer != NULL) {
00545     memfree (Clusterer->ParamDesc);
00546     if (Clusterer->KDTree != NULL)
00547       FreeKDTree (Clusterer->KDTree);
00548     if (Clusterer->Root != NULL)
00549       FreeCluster (Clusterer->Root);
00550     // Free up all used buckets structures.
00551     for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
00552       for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
00553         if (Clusterer->bucket_cache[d][c] != NULL)
00554           FreeBuckets(Clusterer->bucket_cache[d][c]);
00555     }
00556 
00557     memfree(Clusterer);
00558   }
00559 }                                // FreeClusterer
00560 
00561 
00571 void FreeProtoList(LIST *ProtoList) {
00572   destroy_nodes(*ProtoList, FreePrototype);
00573 }                                // FreeProtoList
00574 
00575 
00586 void FreePrototype(void *arg) {  //PROTOTYPE     *Prototype)
00587   PROTOTYPE *Prototype = (PROTOTYPE *) arg;
00588 
00589   // unmark the corresponding cluster (if there is one
00590   if (Prototype->Cluster != NULL)
00591     Prototype->Cluster->Prototype = FALSE;
00592 
00593   // deallocate the prototype statistics and then the prototype itself
00594   if (Prototype->Distrib != NULL)
00595     memfree (Prototype->Distrib);
00596   if (Prototype->Mean != NULL)
00597     memfree (Prototype->Mean);
00598   if (Prototype->Style != spherical) {
00599     if (Prototype->Variance.Elliptical != NULL)
00600       memfree (Prototype->Variance.Elliptical);
00601     if (Prototype->Magnitude.Elliptical != NULL)
00602       memfree (Prototype->Magnitude.Elliptical);
00603     if (Prototype->Weight.Elliptical != NULL)
00604       memfree (Prototype->Weight.Elliptical);
00605   }
00606   memfree(Prototype);
00607 }                                // FreePrototype
00608 
00609 
00625 CLUSTER *NextSample(LIST *SearchState) {
00626   CLUSTER *Cluster;
00627 
00628   if (*SearchState == NIL_LIST)
00629     return (NULL);
00630   Cluster = (CLUSTER *) first_node (*SearchState);
00631   *SearchState = pop (*SearchState);
00632   while (TRUE) {
00633     if (Cluster->Left == NULL)
00634       return (Cluster);
00635     *SearchState = push (*SearchState, Cluster->Right);
00636     Cluster = Cluster->Left;
00637   }
00638 }                                // NextSample
00639 
00640 
00650 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
00651   return (Proto->Mean[Dimension]);
00652 }                                // Mean
00653 
00654 
00664 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
00665   switch (Proto->Style) {
00666     case spherical:
00667       return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
00668     case elliptical:
00669       return ((FLOAT32)
00670         sqrt ((double) Proto->Variance.Elliptical[Dimension]));
00671     case mixed:
00672       switch (Proto->Distrib[Dimension]) {
00673         case normal:
00674           return ((FLOAT32)
00675             sqrt ((double) Proto->Variance.Elliptical[Dimension]));
00676         case uniform:
00677         case D_random:
00678           return (Proto->Variance.Elliptical[Dimension]);
00679         case DISTRIBUTION_COUNT:
00680           ASSERT_HOST(!"Distribution count not allowed!");
00681       }
00682   }
00683   return 0.0f;
00684 }                                // StandardDeviation
00685 
00686 
00687 /*---------------------------------------------------------------------------
00688             Private Code
00689 ----------------------------------------------------------------------------*/
00705 void CreateClusterTree(CLUSTERER *Clusterer) {
00706   ClusteringContext context;
00707   ClusterPair HeapEntry;
00708   TEMPCLUSTER *PotentialCluster;
00709 
00710   // each sample and its nearest neighbor form a "potential" cluster
00711   // save these in a heap with the "best" potential clusters on top
00712   context.tree = Clusterer->KDTree;
00713   context.candidates = (TEMPCLUSTER *)
00714     Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
00715   context.next = 0;
00716   context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
00717   KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
00718 
00719   // form potential clusters into actual clusters - always do "best" first
00720   while (context.heap->Pop(&HeapEntry)) {
00721     PotentialCluster = HeapEntry.data;
00722 
00723     // if main cluster of potential cluster is already in another cluster
00724     // then we don't need to worry about it
00725     if (PotentialCluster->Cluster->Clustered) {
00726       continue;
00727     }
00728 
00729     // if main cluster is not yet clustered, but its nearest neighbor is
00730     // then we must find a new nearest neighbor
00731     else if (PotentialCluster->Neighbor->Clustered) {
00732       PotentialCluster->Neighbor =
00733         FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
00734                             &HeapEntry.key);
00735       if (PotentialCluster->Neighbor != NULL) {
00736         context.heap->Push(&HeapEntry);
00737       }
00738     }
00739 
00740     // if neither cluster is already clustered, form permanent cluster
00741     else {
00742       PotentialCluster->Cluster =
00743           MakeNewCluster(Clusterer, PotentialCluster);
00744       PotentialCluster->Neighbor =
00745           FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
00746                               &HeapEntry.key);
00747       if (PotentialCluster->Neighbor != NULL) {
00748         context.heap->Push(&HeapEntry);
00749       }
00750     }
00751   }
00752 
00753   // the root node in the cluster tree is now the only node in the kd-tree
00754   Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
00755 
00756   // free up the memory used by the K-D tree, heap, and temp clusters
00757   FreeKDTree(context.tree);
00758   Clusterer->KDTree = NULL;
00759   delete context.heap;
00760   memfree(context.candidates);
00761 }                                // CreateClusterTree
00762 
00763 
00773 void MakePotentialClusters(ClusteringContext *context,
00774                            CLUSTER *Cluster, inT32 Level) {
00775   ClusterPair HeapEntry;
00776   int next = context->next;
00777   context->candidates[next].Cluster = Cluster;
00778   HeapEntry.data = &(context->candidates[next]);
00779   context->candidates[next].Neighbor =
00780       FindNearestNeighbor(context->tree,
00781                           context->candidates[next].Cluster,
00782                           &HeapEntry.key);
00783   if (context->candidates[next].Neighbor != NULL) {
00784     context->heap->Push(&HeapEntry);
00785     context->next++;
00786   }
00787 }                                // MakePotentialClusters
00788 
00789 
00806 CLUSTER *
00807 FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
00808 #define MAXNEIGHBORS  2
00809 #define MAXDISTANCE   MAX_FLOAT32
00810 {
00811   CLUSTER *Neighbor[MAXNEIGHBORS];
00812   FLOAT32 Dist[MAXNEIGHBORS];
00813   int NumberOfNeighbors;
00814   inT32 i;
00815   CLUSTER *BestNeighbor;
00816 
00817   // find the 2 nearest neighbors of the cluster
00818   KDNearestNeighborSearch(Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE,
00819                           &NumberOfNeighbors, (void **)Neighbor, Dist);
00820 
00821   // search for the nearest neighbor that is not the cluster itself
00822   *Distance = MAXDISTANCE;
00823   BestNeighbor = NULL;
00824   for (i = 0; i < NumberOfNeighbors; i++) {
00825     if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
00826       *Distance = Dist[i];
00827       BestNeighbor = Neighbor[i];
00828     }
00829   }
00830   return BestNeighbor;
00831 }                                // FindNearestNeighbor
00832 
00833 
00846 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
00847   CLUSTER *Cluster;
00848 
00849   // allocate the new cluster and initialize it
00850   Cluster = (CLUSTER *) Emalloc(
00851       sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
00852   Cluster->Clustered = FALSE;
00853   Cluster->Prototype = FALSE;
00854   Cluster->Left = TempCluster->Cluster;
00855   Cluster->Right = TempCluster->Neighbor;
00856   Cluster->CharID = -1;
00857 
00858   // mark the old clusters as "clustered" and delete them from the kd-tree
00859   Cluster->Left->Clustered = TRUE;
00860   Cluster->Right->Clustered = TRUE;
00861   KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
00862   KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
00863 
00864   // compute the mean and sample count for the new cluster
00865   Cluster->SampleCount =
00866       MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
00867                     Cluster->Left->SampleCount, Cluster->Right->SampleCount,
00868                     Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
00869 
00870   // add the new cluster to the KD tree
00871   KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
00872   return Cluster;
00873 }                                // MakeNewCluster
00874 
00875 
00891 inT32 MergeClusters(inT16 N,
00892                     PARAM_DESC ParamDesc[],
00893                     inT32 n1,
00894                     inT32 n2,
00895                     FLOAT32 m[],
00896                     FLOAT32 m1[], FLOAT32 m2[]) {
00897   inT32 i, n;
00898 
00899   n = n1 + n2;
00900   for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
00901     if (ParamDesc->Circular) {
00902       // if distance between means is greater than allowed
00903       // reduce upper point by one "rotation" to compute mean
00904       // then normalize the mean back into the accepted range
00905       if ((*m2 - *m1) > ParamDesc->HalfRange) {
00906         *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
00907         if (*m < ParamDesc->Min)
00908           *m += ParamDesc->Range;
00909       }
00910       else if ((*m1 - *m2) > ParamDesc->HalfRange) {
00911         *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
00912         if (*m < ParamDesc->Min)
00913           *m += ParamDesc->Range;
00914       }
00915       else
00916         *m = (n1 * *m1 + n2 * *m2) / n;
00917     }
00918     else
00919       *m = (n1 * *m1 + n2 * *m2) / n;
00920   }
00921   return n;
00922 }                                // MergeClusters
00923 
00924 
00936 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00937   LIST ClusterStack = NIL_LIST;
00938   CLUSTER *Cluster;
00939   PROTOTYPE *Prototype;
00940 
00941   // use a stack to keep track of clusters waiting to be processed
00942   // initially the only cluster on the stack is the root cluster
00943   if (Clusterer->Root != NULL)
00944     ClusterStack = push (NIL_LIST, Clusterer->Root);
00945 
00946   // loop until we have analyzed all clusters which are potential prototypes
00947   while (ClusterStack != NIL_LIST) {
00948     // remove the next cluster to be analyzed from the stack
00949     // try to make a prototype from the cluster
00950     // if successful, put it on the proto list, else split the cluster
00951     Cluster = (CLUSTER *) first_node (ClusterStack);
00952     ClusterStack = pop (ClusterStack);
00953     Prototype = MakePrototype(Clusterer, Config, Cluster);
00954     if (Prototype != NULL) {
00955       Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
00956     }
00957     else {
00958       ClusterStack = push (ClusterStack, Cluster->Right);
00959       ClusterStack = push (ClusterStack, Cluster->Left);
00960     }
00961   }
00962 }                                // ComputePrototypes
00963 
00964 
00982 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
00983                          CLUSTERCONFIG *Config,
00984                          CLUSTER *Cluster) {
00985   STATISTICS *Statistics;
00986   PROTOTYPE *Proto;
00987   BUCKETS *Buckets;
00988 
00989   // filter out clusters which contain samples from the same character
00990   if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
00991     return NULL;
00992 
00993   // compute the covariance matrix and ranges for the cluster
00994   Statistics =
00995       ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
00996 
00997   // check for degenerate clusters which need not be analyzed further
00998   // note that the MinSamples test assumes that all clusters with multiple
00999   // character samples have been removed (as above)
01000   Proto = MakeDegenerateProto(
01001       Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
01002       (inT32) (Config->MinSamples * Clusterer->NumChar));
01003   if (Proto != NULL) {
01004     FreeStatistics(Statistics);
01005     return Proto;
01006   }
01007   // check to ensure that all dimensions are independent
01008   if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
01009                    Statistics->CoVariance, Config->Independence)) {
01010     FreeStatistics(Statistics);
01011     return NULL;
01012   }
01013 
01014   if (HOTELLING && Config->ProtoStyle == elliptical) {
01015     Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
01016     if (Proto != NULL) {
01017       FreeStatistics(Statistics);
01018       return Proto;
01019     }
01020   }
01021 
01022   // create a histogram data structure used to evaluate distributions
01023   Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
01024                        Config->Confidence);
01025 
01026   // create a prototype based on the statistics and test it
01027   switch (Config->ProtoStyle) {
01028     case spherical:
01029       Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
01030       break;
01031     case elliptical:
01032       Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
01033       break;
01034     case mixed:
01035       Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
01036                              Config->Confidence);
01037       break;
01038     case automatic:
01039       Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
01040       if (Proto != NULL)
01041         break;
01042       Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
01043       if (Proto != NULL)
01044         break;
01045       Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
01046                              Config->Confidence);
01047       break;
01048   }
01049   FreeStatistics(Statistics);
01050   return Proto;
01051 }                                // MakePrototype
01052 
01053 
01077 PROTOTYPE *MakeDegenerateProto(  //this was MinSample
01078                                uinT16 N,
01079                                CLUSTER *Cluster,
01080                                STATISTICS *Statistics,
01081                                PROTOSTYLE Style,
01082                                inT32 MinSamples) {
01083   PROTOTYPE *Proto = NULL;
01084 
01085   if (MinSamples < MINSAMPLESNEEDED)
01086     MinSamples = MINSAMPLESNEEDED;
01087 
01088   if (Cluster->SampleCount < MinSamples) {
01089     switch (Style) {
01090       case spherical:
01091         Proto = NewSphericalProto (N, Cluster, Statistics);
01092         break;
01093       case elliptical:
01094       case automatic:
01095         Proto = NewEllipticalProto (N, Cluster, Statistics);
01096         break;
01097       case mixed:
01098         Proto = NewMixedProto (N, Cluster, Statistics);
01099         break;
01100     }
01101     Proto->Significant = FALSE;
01102   }
01103   return (Proto);
01104 }                                // MakeDegenerateProto
01105 
01119 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
01120                                CLUSTERCONFIG *Config,
01121                                CLUSTER *Cluster,
01122                                STATISTICS *Statistics) {
01123   // Fraction of the number of samples used as a range around 1 within
01124   // which a cluster has the magic size that allows a boost to the
01125   // FTable by kFTableBoostMargin, thus allowing clusters near the
01126   // magic size (equal to the number of sample characters) to be more
01127   // likely to stay together.
01128   const double kMagicSampleMargin = 0.0625;
01129   const double kFTableBoostMargin = 2.0;
01130 
01131   int N = Clusterer->SampleSize;
01132   CLUSTER* Left = Cluster->Left;
01133   CLUSTER* Right = Cluster->Right;
01134   if (Left == NULL || Right == NULL)
01135     return NULL;
01136   int TotalDims = Left->SampleCount + Right->SampleCount;
01137   if (TotalDims < N + 1 || TotalDims < 2)
01138     return NULL;
01139   const int kMatrixSize = N * N * sizeof(FLOAT32);
01140   FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
01141   FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
01142   FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
01143   // Compute a new covariance matrix that only uses essential features.
01144   for (int i = 0; i < N; ++i) {
01145     int row_offset = i * N;
01146     if (!Clusterer->ParamDesc[i].NonEssential) {
01147       for (int j = 0; j < N; ++j) {
01148         if (!Clusterer->ParamDesc[j].NonEssential)
01149           Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
01150         else
01151           Covariance[j + row_offset] = 0.0f;
01152       }
01153     } else {
01154       for (int j = 0; j < N; ++j) {
01155         if (i == j)
01156           Covariance[j + row_offset] = 1.0f;
01157         else
01158           Covariance[j + row_offset] = 0.0f;
01159       }
01160     }
01161   }
01162   double err = InvertMatrix(Covariance, N, Inverse);
01163   if (err > 1) {
01164     tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
01165   }
01166   int EssentialN = 0;
01167   for (int dim = 0; dim < N; ++dim) {
01168     if (!Clusterer->ParamDesc[dim].NonEssential) {
01169       Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
01170       ++EssentialN;
01171     } else {
01172       Delta[dim] = 0.0f;
01173     }
01174   }
01175   // Compute Hotelling's T-squared.
01176   double Tsq = 0.0;
01177   for (int x = 0; x < N; ++x) {
01178     double temp = 0.0;
01179     for (int y = 0; y < N; ++y) {
01180       temp += Inverse[y + N*x] * Delta[y];
01181     }
01182     Tsq += Delta[x] * temp;
01183   }
01184   memfree(Covariance);
01185   memfree(Inverse);
01186   memfree(Delta);
01187   // Changed this function to match the formula in
01188   // Statistical Methods in Medical Research p 473
01189   // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
01190   // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
01191   double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
01192   int Fx = EssentialN;
01193   if (Fx > FTABLE_X)
01194     Fx = FTABLE_X;
01195   --Fx;
01196   int Fy = TotalDims - EssentialN - 1;
01197   if (Fy > FTABLE_Y)
01198     Fy = FTABLE_Y;
01199   --Fy;
01200   double FTarget = FTable[Fy][Fx];
01201   if (Config->MagicSamples > 0 &&
01202       TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
01203       TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
01204     // Give magic-sized clusters a magic FTable boost.
01205     FTarget += kFTableBoostMargin;
01206   }
01207   if (F < FTarget) {
01208     return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
01209   }
01210   return NULL;
01211 }
01212 
01226 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
01227                               CLUSTER *Cluster,
01228                               STATISTICS *Statistics,
01229                               BUCKETS *Buckets) {
01230   PROTOTYPE *Proto = NULL;
01231   int i;
01232 
01233   // check that each dimension is a normal distribution
01234   for (i = 0; i < Clusterer->SampleSize; i++) {
01235     if (Clusterer->ParamDesc[i].NonEssential)
01236       continue;
01237 
01238     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01239       Cluster->Mean[i],
01240       sqrt ((FLOAT64) (Statistics->AvgVariance)));
01241     if (!DistributionOK (Buckets))
01242       break;
01243   }
01244   // if all dimensions matched a normal distribution, make a proto
01245   if (i >= Clusterer->SampleSize)
01246     Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
01247   return (Proto);
01248 }                                // MakeSphericalProto
01249 
01250 
01264 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
01265                                CLUSTER *Cluster,
01266                                STATISTICS *Statistics,
01267                                BUCKETS *Buckets) {
01268   PROTOTYPE *Proto = NULL;
01269   int i;
01270 
01271   // check that each dimension is a normal distribution
01272   for (i = 0; i < Clusterer->SampleSize; i++) {
01273     if (Clusterer->ParamDesc[i].NonEssential)
01274       continue;
01275 
01276     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01277       Cluster->Mean[i],
01278       sqrt ((FLOAT64) Statistics->
01279       CoVariance[i * (Clusterer->SampleSize + 1)]));
01280     if (!DistributionOK (Buckets))
01281       break;
01282   }
01283   // if all dimensions matched a normal distribution, make a proto
01284   if (i >= Clusterer->SampleSize)
01285     Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
01286   return (Proto);
01287 }                                // MakeEllipticalProto
01288 
01289 
01307 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
01308                           CLUSTER *Cluster,
01309                           STATISTICS *Statistics,
01310                           BUCKETS *NormalBuckets,
01311                           FLOAT64 Confidence) {
01312   PROTOTYPE *Proto;
01313   int i;
01314   BUCKETS *UniformBuckets = NULL;
01315   BUCKETS *RandomBuckets = NULL;
01316 
01317   // create a mixed proto to work on - initially assume all dimensions normal*/
01318   Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
01319 
01320   // find the proper distribution for each dimension
01321   for (i = 0; i < Clusterer->SampleSize; i++) {
01322     if (Clusterer->ParamDesc[i].NonEssential)
01323       continue;
01324 
01325     FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01326       Proto->Mean[i],
01327       sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
01328     if (DistributionOK (NormalBuckets))
01329       continue;
01330 
01331     if (RandomBuckets == NULL)
01332       RandomBuckets =
01333         GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
01334     MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
01335     FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01336       Proto->Mean[i], Proto->Variance.Elliptical[i]);
01337     if (DistributionOK (RandomBuckets))
01338       continue;
01339 
01340     if (UniformBuckets == NULL)
01341       UniformBuckets =
01342         GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
01343     MakeDimUniform(i, Proto, Statistics);
01344     FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01345       Proto->Mean[i], Proto->Variance.Elliptical[i]);
01346     if (DistributionOK (UniformBuckets))
01347       continue;
01348     break;
01349   }
01350   // if any dimension failed to match a distribution, discard the proto
01351   if (i < Clusterer->SampleSize) {
01352     FreePrototype(Proto);
01353     Proto = NULL;
01354   }
01355   return (Proto);
01356 }                                // MakeMixedProto
01357 
01358 
01369 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
01370   Proto->Distrib[i] = D_random;
01371   Proto->Mean[i] = ParamDesc->MidRange;
01372   Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
01373 
01374   // subtract out the previous magnitude of this dimension from the total
01375   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
01376   Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
01377   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01378   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01379 
01380   // note that the proto Weight is irrelevant for D_random protos
01381 }                                // MakeDimRandom
01382 
01383 
01394 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
01395   Proto->Distrib[i] = uniform;
01396   Proto->Mean[i] = Proto->Cluster->Mean[i] +
01397     (Statistics->Min[i] + Statistics->Max[i]) / 2;
01398   Proto->Variance.Elliptical[i] =
01399     (Statistics->Max[i] - Statistics->Min[i]) / 2;
01400   if (Proto->Variance.Elliptical[i] < MINVARIANCE)
01401     Proto->Variance.Elliptical[i] = MINVARIANCE;
01402 
01403   // subtract out the previous magnitude of this dimension from the total
01404   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
01405   Proto->Magnitude.Elliptical[i] =
01406     1.0 / (2.0 * Proto->Variance.Elliptical[i]);
01407   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01408   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01409 
01410   // note that the proto Weight is irrelevant for uniform protos
01411 }                                // MakeDimUniform
01412 
01413 
01430 STATISTICS *
01431 ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
01432   STATISTICS *Statistics;
01433   int i, j;
01434   FLOAT32 *CoVariance;
01435   FLOAT32 *Distance;
01436   LIST SearchState;
01437   SAMPLE *Sample;
01438   uinT32 SampleCountAdjustedForBias;
01439 
01440   // allocate memory to hold the statistics results
01441   Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
01442   Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
01443   Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01444   Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01445 
01446   // allocate temporary memory to hold the sample to mean distances
01447   Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01448 
01449   // initialize the statistics
01450   Statistics->AvgVariance = 1.0;
01451   CoVariance = Statistics->CoVariance;
01452   for (i = 0; i < N; i++) {
01453     Statistics->Min[i] = 0.0;
01454     Statistics->Max[i] = 0.0;
01455     for (j = 0; j < N; j++, CoVariance++)
01456       *CoVariance = 0;
01457   }
01458   // find each sample in the cluster and merge it into the statistics
01459   InitSampleSearch(SearchState, Cluster);
01460   while ((Sample = NextSample (&SearchState)) != NULL) {
01461     for (i = 0; i < N; i++) {
01462       Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
01463       if (ParamDesc[i].Circular) {
01464         if (Distance[i] > ParamDesc[i].HalfRange)
01465           Distance[i] -= ParamDesc[i].Range;
01466         if (Distance[i] < -ParamDesc[i].HalfRange)
01467           Distance[i] += ParamDesc[i].Range;
01468       }
01469       if (Distance[i] < Statistics->Min[i])
01470         Statistics->Min[i] = Distance[i];
01471       if (Distance[i] > Statistics->Max[i])
01472         Statistics->Max[i] = Distance[i];
01473     }
01474     CoVariance = Statistics->CoVariance;
01475     for (i = 0; i < N; i++)
01476       for (j = 0; j < N; j++, CoVariance++)
01477         *CoVariance += Distance[i] * Distance[j];
01478   }
01479   // normalize the variances by the total number of samples
01480   // use SampleCount-1 instead of SampleCount to get an unbiased estimate
01481   // also compute the geometic mean of the diagonal variances
01482   // ensure that clusters with only 1 sample are handled correctly
01483   if (Cluster->SampleCount > 1)
01484     SampleCountAdjustedForBias = Cluster->SampleCount - 1;
01485   else
01486     SampleCountAdjustedForBias = 1;
01487   CoVariance = Statistics->CoVariance;
01488   for (i = 0; i < N; i++)
01489   for (j = 0; j < N; j++, CoVariance++) {
01490     *CoVariance /= SampleCountAdjustedForBias;
01491     if (j == i) {
01492       if (*CoVariance < MINVARIANCE)
01493         *CoVariance = MINVARIANCE;
01494       Statistics->AvgVariance *= *CoVariance;
01495     }
01496   }
01497   Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
01498                                        1.0 / N);
01499 
01500   // release temporary memory and return
01501   memfree(Distance);
01502   return (Statistics);
01503 }                                // ComputeStatistics
01504 
01505 
01519 PROTOTYPE *NewSphericalProto(uinT16 N,
01520                              CLUSTER *Cluster,
01521                              STATISTICS *Statistics) {
01522   PROTOTYPE *Proto;
01523 
01524   Proto = NewSimpleProto (N, Cluster);
01525 
01526   Proto->Variance.Spherical = Statistics->AvgVariance;
01527   if (Proto->Variance.Spherical < MINVARIANCE)
01528     Proto->Variance.Spherical = MINVARIANCE;
01529 
01530   Proto->Magnitude.Spherical =
01531     1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
01532   Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
01533                                      (double) N);
01534   Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
01535   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01536 
01537   return (Proto);
01538 }                                // NewSphericalProto
01539 
01540 
01553 PROTOTYPE *NewEllipticalProto(inT16 N,
01554                               CLUSTER *Cluster,
01555                               STATISTICS *Statistics) {
01556   PROTOTYPE *Proto;
01557   FLOAT32 *CoVariance;
01558   int i;
01559 
01560   Proto = NewSimpleProto (N, Cluster);
01561   Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01562   Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01563   Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01564 
01565   CoVariance = Statistics->CoVariance;
01566   Proto->TotalMagnitude = 1.0;
01567   for (i = 0; i < N; i++, CoVariance += N + 1) {
01568     Proto->Variance.Elliptical[i] = *CoVariance;
01569     if (Proto->Variance.Elliptical[i] < MINVARIANCE)
01570       Proto->Variance.Elliptical[i] = MINVARIANCE;
01571 
01572     Proto->Magnitude.Elliptical[i] =
01573       1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
01574     Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
01575     Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01576   }
01577   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01578   Proto->Style = elliptical;
01579   return (Proto);
01580 }                                // NewEllipticalProto
01581 
01582 
01598 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
01599   PROTOTYPE *Proto;
01600   int i;
01601 
01602   Proto = NewEllipticalProto (N, Cluster, Statistics);
01603   Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
01604 
01605   for (i = 0; i < N; i++) {
01606     Proto->Distrib[i] = normal;
01607   }
01608   Proto->Style = mixed;
01609   return (Proto);
01610 }                                // NewMixedProto
01611 
01612 
01623 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
01624   PROTOTYPE *Proto;
01625   int i;
01626 
01627   Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
01628   Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01629 
01630   for (i = 0; i < N; i++)
01631     Proto->Mean[i] = Cluster->Mean[i];
01632   Proto->Distrib = NULL;
01633 
01634   Proto->Significant = TRUE;
01635   Proto->Merged = FALSE;
01636   Proto->Style = spherical;
01637   Proto->NumSamples = Cluster->SampleCount;
01638   Proto->Cluster = Cluster;
01639   Proto->Cluster->Prototype = TRUE;
01640   return (Proto);
01641 }                                // NewSimpleProto
01642 
01643 
01664 BOOL8
01665 Independent (PARAM_DESC ParamDesc[],
01666 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
01667   int i, j;
01668   FLOAT32 *VARii;                // points to ith on-diagonal element
01669   FLOAT32 *VARjj;                // points to jth on-diagonal element
01670   FLOAT32 CorrelationCoeff;
01671 
01672   VARii = CoVariance;
01673   for (i = 0; i < N; i++, VARii += N + 1) {
01674     if (ParamDesc[i].NonEssential)
01675       continue;
01676 
01677     VARjj = VARii + N + 1;
01678     CoVariance = VARii + 1;
01679     for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
01680       if (ParamDesc[j].NonEssential)
01681         continue;
01682 
01683       if ((*VARii == 0.0) || (*VARjj == 0.0))
01684         CorrelationCoeff = 0.0;
01685       else
01686         CorrelationCoeff =
01687           sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
01688       if (CorrelationCoeff > Independence)
01689         return (FALSE);
01690     }
01691   }
01692   return (TRUE);
01693 }                                // Independent
01694 
01695 
01713 BUCKETS *GetBuckets(CLUSTERER* clusterer,
01714                     DISTRIBUTION Distribution,
01715                     uinT32 SampleCount,
01716                     FLOAT64 Confidence) {
01717   // Get an old bucket structure with the same number of buckets.
01718   uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
01719   BUCKETS *Buckets =
01720       clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
01721 
01722   // If a matching bucket structure is not found, make one and save it.
01723   if (Buckets == NULL) {
01724     Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
01725     clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
01726         Buckets;
01727   } else {
01728     // Just adjust the existing buckets.
01729     if (SampleCount != Buckets->SampleCount)
01730       AdjustBuckets(Buckets, SampleCount);
01731     if (Confidence != Buckets->Confidence) {
01732       Buckets->Confidence = Confidence;
01733       Buckets->ChiSquared = ComputeChiSquared(
01734           DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
01735           Confidence);
01736     }
01737     InitBuckets(Buckets);
01738   }
01739   return Buckets;
01740 }                                // GetBuckets
01741 
01742 
01761 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
01762                      uinT32 SampleCount,
01763                      FLOAT64 Confidence) {
01764   const DENSITYFUNC DensityFunction[] =
01765     { NormalDensity, UniformDensity, UniformDensity };
01766   int i, j;
01767   BUCKETS *Buckets;
01768   FLOAT64 BucketProbability;
01769   FLOAT64 NextBucketBoundary;
01770   FLOAT64 Probability;
01771   FLOAT64 ProbabilityDelta;
01772   FLOAT64 LastProbDensity;
01773   FLOAT64 ProbDensity;
01774   uinT16 CurrentBucket;
01775   BOOL8 Symmetrical;
01776 
01777   // allocate memory needed for data structure
01778   Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
01779   Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
01780   Buckets->SampleCount = SampleCount;
01781   Buckets->Confidence = Confidence;
01782   Buckets->Count = reinterpret_cast<uinT32*>(
01783       Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
01784   Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
01785       Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
01786 
01787   // initialize simple fields
01788   Buckets->Distribution = Distribution;
01789   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
01790     Buckets->Count[i] = 0;
01791     Buckets->ExpectedCount[i] = 0.0;
01792   }
01793 
01794   // all currently defined distributions are symmetrical
01795   Symmetrical = TRUE;
01796   Buckets->ChiSquared = ComputeChiSquared(
01797       DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
01798 
01799   if (Symmetrical) {
01800     // allocate buckets so that all have approx. equal probability
01801     BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
01802 
01803     // distribution is symmetric so fill in upper half then copy
01804     CurrentBucket = Buckets->NumberOfBuckets / 2;
01805     if (Odd (Buckets->NumberOfBuckets))
01806       NextBucketBoundary = BucketProbability / 2;
01807     else
01808       NextBucketBoundary = BucketProbability;
01809 
01810     Probability = 0.0;
01811     LastProbDensity =
01812       (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
01813     for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
01814       ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
01815       ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
01816       Probability += ProbabilityDelta;
01817       if (Probability > NextBucketBoundary) {
01818         if (CurrentBucket < Buckets->NumberOfBuckets - 1)
01819           CurrentBucket++;
01820         NextBucketBoundary += BucketProbability;
01821       }
01822       Buckets->Bucket[i] = CurrentBucket;
01823       Buckets->ExpectedCount[CurrentBucket] +=
01824         (FLOAT32) (ProbabilityDelta * SampleCount);
01825       LastProbDensity = ProbDensity;
01826     }
01827     // place any leftover probability into the last bucket
01828     Buckets->ExpectedCount[CurrentBucket] +=
01829       (FLOAT32) ((0.5 - Probability) * SampleCount);
01830 
01831     // copy upper half of distribution to lower half
01832     for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
01833       Buckets->Bucket[i] =
01834         Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
01835 
01836     // copy upper half of expected counts to lower half
01837     for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
01838       Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
01839   }
01840   return Buckets;
01841 }                                // MakeBuckets
01842 
01843 
01859 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) {
01860   uinT8 Last, Next;
01861   FLOAT32 Slope;
01862 
01863   if (SampleCount < kCountTable[0])
01864     return kBucketsTable[0];
01865 
01866   for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
01867     if (SampleCount <= kCountTable[Next]) {
01868       Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
01869           (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
01870       return ((uinT16) (kBucketsTable[Last] +
01871           Slope * (SampleCount - kCountTable[Last])));
01872     }
01873   }
01874   return kBucketsTable[Last];
01875 }                                // OptimumNumberOfBuckets
01876 
01877 
01896 FLOAT64
01897 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
01898 #define CHIACCURACY     0.01
01899 #define MINALPHA  (1e-200)
01900 {
01901   static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
01902 
01903   CHISTRUCT *OldChiSquared;
01904   CHISTRUCT SearchKey;
01905 
01906   // limit the minimum alpha that can be used - if alpha is too small
01907   //      it may not be possible to compute chi-squared.
01908   Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
01909   if (Odd (DegreesOfFreedom))
01910     DegreesOfFreedom++;
01911 
01912   /* find the list of chi-squared values which have already been computed
01913      for the specified number of degrees of freedom.  Search the list for
01914      the desired chi-squared. */
01915   SearchKey.Alpha = Alpha;
01916   OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
01917     &SearchKey, AlphaMatch));
01918 
01919   if (OldChiSquared == NULL) {
01920     OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
01921     OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
01922       (FLOAT64) DegreesOfFreedom,
01923       (FLOAT64) CHIACCURACY);
01924     ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
01925       OldChiSquared);
01926   }
01927   else {
01928     // further optimization might move OldChiSquared to front of list
01929   }
01930 
01931   return (OldChiSquared->ChiSquared);
01932 
01933 }                                // ComputeChiSquared
01934 
01935 
01951 FLOAT64 NormalDensity(inT32 x) {
01952   FLOAT64 Distance;
01953 
01954   Distance = x - kNormalMean;
01955   return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
01956 }                                // NormalDensity
01957 
01958 
01968 FLOAT64 UniformDensity(inT32 x) {
01969   static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
01970 
01971   if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
01972     return UniformDistributionDensity;
01973   else
01974     return (FLOAT64) 0.0;
01975 }                                // UniformDensity
01976 
01977 
01988 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) {
01989   return (f1 + f2) * Dx / 2.0;
01990 }                                // Integral
01991 
01992 
02015 void FillBuckets(BUCKETS *Buckets,
02016                  CLUSTER *Cluster,
02017                  uinT16 Dim,
02018                  PARAM_DESC *ParamDesc,
02019                  FLOAT32 Mean,
02020                  FLOAT32 StdDev) {
02021   uinT16 BucketID;
02022   int i;
02023   LIST SearchState;
02024   SAMPLE *Sample;
02025 
02026   // initialize the histogram bucket counts to 0
02027   for (i = 0; i < Buckets->NumberOfBuckets; i++)
02028     Buckets->Count[i] = 0;
02029 
02030   if (StdDev == 0.0) {
02031     /* if the standard deviation is zero, then we can't statistically
02032        analyze the cluster.  Use a pseudo-analysis: samples exactly on
02033        the mean are distributed evenly across all buckets.  Samples greater
02034        than the mean are placed in the last bucket; samples less than the
02035        mean are placed in the first bucket. */
02036 
02037     InitSampleSearch(SearchState, Cluster);
02038     i = 0;
02039     while ((Sample = NextSample (&SearchState)) != NULL) {
02040       if (Sample->Mean[Dim] > Mean)
02041         BucketID = Buckets->NumberOfBuckets - 1;
02042       else if (Sample->Mean[Dim] < Mean)
02043         BucketID = 0;
02044       else
02045         BucketID = i;
02046       Buckets->Count[BucketID] += 1;
02047       i++;
02048       if (i >= Buckets->NumberOfBuckets)
02049         i = 0;
02050     }
02051   }
02052   else {
02053     // search for all samples in the cluster and add to histogram buckets
02054     InitSampleSearch(SearchState, Cluster);
02055     while ((Sample = NextSample (&SearchState)) != NULL) {
02056       switch (Buckets->Distribution) {
02057         case normal:
02058           BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
02059             Mean, StdDev);
02060           break;
02061         case D_random:
02062         case uniform:
02063           BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
02064             Mean, StdDev);
02065           break;
02066         default:
02067           BucketID = 0;
02068       }
02069       Buckets->Count[Buckets->Bucket[BucketID]] += 1;
02070     }
02071   }
02072 }                                // FillBuckets
02073 
02074 
02088 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
02089                     FLOAT32 x,
02090                     FLOAT32 Mean,
02091                     FLOAT32 StdDev) {
02092   FLOAT32 X;
02093 
02094   // wraparound circular parameters if necessary
02095   if (ParamDesc->Circular) {
02096     if (x - Mean > ParamDesc->HalfRange)
02097       x -= ParamDesc->Range;
02098     else if (x - Mean < -ParamDesc->HalfRange)
02099       x += ParamDesc->Range;
02100   }
02101 
02102   X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
02103   if (X < 0)
02104     return 0;
02105   if (X > BUCKETTABLESIZE - 1)
02106     return ((uinT16) (BUCKETTABLESIZE - 1));
02107   return (uinT16) floor((FLOAT64) X);
02108 }                                // NormalBucket
02109 
02110 
02124 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
02125                      FLOAT32 x,
02126                      FLOAT32 Mean,
02127                      FLOAT32 StdDev) {
02128   FLOAT32 X;
02129 
02130   // wraparound circular parameters if necessary
02131   if (ParamDesc->Circular) {
02132     if (x - Mean > ParamDesc->HalfRange)
02133       x -= ParamDesc->Range;
02134     else if (x - Mean < -ParamDesc->HalfRange)
02135       x += ParamDesc->Range;
02136   }
02137 
02138   X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
02139   if (X < 0)
02140     return 0;
02141   if (X > BUCKETTABLESIZE - 1)
02142     return (uinT16) (BUCKETTABLESIZE - 1);
02143   return (uinT16) floor((FLOAT64) X);
02144 }                                // UniformBucket
02145 
02146 
02159 BOOL8 DistributionOK(BUCKETS *Buckets) {
02160   FLOAT32 FrequencyDifference;
02161   FLOAT32 TotalDifference;
02162   int i;
02163 
02164   // compute how well the histogram matches the expected histogram
02165   TotalDifference = 0.0;
02166   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02167     FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
02168     TotalDifference += (FrequencyDifference * FrequencyDifference) /
02169       Buckets->ExpectedCount[i];
02170   }
02171 
02172   // test to see if the difference is more than expected
02173   if (TotalDifference > Buckets->ChiSquared)
02174     return FALSE;
02175   else
02176     return TRUE;
02177 }                                // DistributionOK
02178 
02179 
02188 void FreeStatistics(STATISTICS *Statistics) {
02189   memfree (Statistics->CoVariance);
02190   memfree (Statistics->Min);
02191   memfree (Statistics->Max);
02192   memfree(Statistics);
02193 }                                // FreeStatistics
02194 
02195 
02201 void FreeBuckets(BUCKETS *buckets) {
02202   Efree(buckets->Count);
02203   Efree(buckets->ExpectedCount);
02204   Efree(buckets);
02205 }                                // FreeBuckets
02206 
02207 
02220 void FreeCluster(CLUSTER *Cluster) {
02221   if (Cluster != NULL) {
02222     FreeCluster (Cluster->Left);
02223     FreeCluster (Cluster->Right);
02224     memfree(Cluster);
02225   }
02226 }                                // FreeCluster
02227 
02228 
02243 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
02244   static uinT8 DegreeOffsets[] = { 3, 3, 1 };
02245 
02246   uinT16 AdjustedNumBuckets;
02247 
02248   AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
02249   if (Odd (AdjustedNumBuckets))
02250     AdjustedNumBuckets++;
02251   return (AdjustedNumBuckets);
02252 
02253 }                                // DegreesOfFreedom
02254 
02255 
02266 int NumBucketsMatch(void *arg1,    // BUCKETS *Histogram,
02267                     void *arg2) {  // uinT16 *DesiredNumberOfBuckets)
02268   BUCKETS *Histogram = (BUCKETS *) arg1;
02269   uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
02270 
02271   return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
02272 
02273 }                                // NumBucketsMatch
02274 
02275 
02284 int ListEntryMatch(void *arg1,    //ListNode
02285                    void *arg2) {  //Key
02286   return (arg1 == arg2);
02287 
02288 }                                // ListEntryMatch
02289 
02290 
02301 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
02302   int i;
02303   FLOAT64 AdjustFactor;
02304 
02305   AdjustFactor = (((FLOAT64) NewSampleCount) /
02306     ((FLOAT64) Buckets->SampleCount));
02307 
02308   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02309     Buckets->ExpectedCount[i] *= AdjustFactor;
02310   }
02311 
02312   Buckets->SampleCount = NewSampleCount;
02313 
02314 }                                // AdjustBuckets
02315 
02316 
02325 void InitBuckets(BUCKETS *Buckets) {
02326   int i;
02327 
02328   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02329     Buckets->Count[i] = 0;
02330   }
02331 
02332 }                                // InitBuckets
02333 
02334 
02349 int AlphaMatch(void *arg1,    //CHISTRUCT                             *ChiStruct,
02350                void *arg2) {  //CHISTRUCT                             *SearchKey)
02351   CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
02352   CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
02353 
02354   return (ChiStruct->Alpha == SearchKey->Alpha);
02355 
02356 }                                // AlphaMatch
02357 
02358 
02370 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
02371   CHISTRUCT *NewChiStruct;
02372 
02373   NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
02374   NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
02375   NewChiStruct->Alpha = Alpha;
02376   return (NewChiStruct);
02377 
02378 }                                // NewChiStruct
02379 
02380 
02396 FLOAT64
02397 Solve (SOLVEFUNC Function,
02398 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
02399 #define INITIALDELTA    0.1
02400 #define  DELTARATIO     0.1
02401 {
02402   FLOAT64 x;
02403   FLOAT64 f;
02404   FLOAT64 Slope;
02405   FLOAT64 Delta;
02406   FLOAT64 NewDelta;
02407   FLOAT64 xDelta;
02408   FLOAT64 LastPosX, LastNegX;
02409 
02410   x = InitialGuess;
02411   Delta = INITIALDELTA;
02412   LastPosX = MAX_FLOAT32;
02413   LastNegX = -MAX_FLOAT32;
02414   f = (*Function) ((CHISTRUCT *) FunctionParams, x);
02415   while (Abs (LastPosX - LastNegX) > Accuracy) {
02416     // keep track of outer bounds of current estimate
02417     if (f < 0)
02418       LastNegX = x;
02419     else
02420       LastPosX = x;
02421 
02422     // compute the approx. slope of f(x) at the current point
02423     Slope =
02424       ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
02425 
02426     // compute the next solution guess */
02427     xDelta = f / Slope;
02428     x -= xDelta;
02429 
02430     // reduce the delta used for computing slope to be a fraction of
02431     //the amount moved to get to the new guess
02432     NewDelta = Abs (xDelta) * DELTARATIO;
02433     if (NewDelta < Delta)
02434       Delta = NewDelta;
02435 
02436     // compute the value of the function at the new guess
02437     f = (*Function) ((CHISTRUCT *) FunctionParams, x);
02438   }
02439   return (x);
02440 
02441 }                                // Solve
02442 
02443 
02464 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) {
02465   int i, N;
02466   FLOAT64 SeriesTotal;
02467   FLOAT64 Denominator;
02468   FLOAT64 PowerOfx;
02469 
02470   N = ChiParams->DegreesOfFreedom / 2 - 1;
02471   SeriesTotal = 1;
02472   Denominator = 1;
02473   PowerOfx = 1;
02474   for (i = 1; i <= N; i++) {
02475     Denominator *= 2 * i;
02476     PowerOfx *= x;
02477     SeriesTotal += PowerOfx / Denominator;
02478   }
02479   return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
02480 
02481 }                                // ChiArea
02482 
02483 
02511 BOOL8
02512 MultipleCharSamples (CLUSTERER * Clusterer,
02513 CLUSTER * Cluster, FLOAT32 MaxIllegal)
02514 #define ILLEGAL_CHAR    2
02515 {
02516   static BOOL8 *CharFlags = NULL;
02517   static inT32 NumFlags = 0;
02518   int i;
02519   LIST SearchState;
02520   SAMPLE *Sample;
02521   inT32 CharID;
02522   inT32 NumCharInCluster;
02523   inT32 NumIllegalInCluster;
02524   FLOAT32 PercentIllegal;
02525 
02526   // initial estimate assumes that no illegal chars exist in the cluster
02527   NumCharInCluster = Cluster->SampleCount;
02528   NumIllegalInCluster = 0;
02529 
02530   if (Clusterer->NumChar > NumFlags) {
02531     if (CharFlags != NULL)
02532       memfree(CharFlags);
02533     NumFlags = Clusterer->NumChar;
02534     CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
02535   }
02536 
02537   for (i = 0; i < NumFlags; i++)
02538     CharFlags[i] = FALSE;
02539 
02540   // find each sample in the cluster and check if we have seen it before
02541   InitSampleSearch(SearchState, Cluster);
02542   while ((Sample = NextSample (&SearchState)) != NULL) {
02543     CharID = Sample->CharID;
02544     if (CharFlags[CharID] == FALSE) {
02545       CharFlags[CharID] = TRUE;
02546     }
02547     else {
02548       if (CharFlags[CharID] == TRUE) {
02549         NumIllegalInCluster++;
02550         CharFlags[CharID] = ILLEGAL_CHAR;
02551       }
02552       NumCharInCluster--;
02553       PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
02554       if (PercentIllegal > MaxIllegal) {
02555         destroy(SearchState);
02556         return (TRUE);
02557       }
02558     }
02559   }
02560   return (FALSE);
02561 
02562 }                                // MultipleCharSamples
02563 
02569 double InvertMatrix(const float* input, int size, float* inv) {
02570   // Allocate memory for the 2D arrays.
02571   GENERIC_2D_ARRAY<double> U(size, size, 0.0);
02572   GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
02573   GENERIC_2D_ARRAY<double> L(size, size, 0.0);
02574 
02575   // Initialize the working matrices. U starts as input, L as I and U_inv as O.
02576   int row;
02577   int col;
02578   for (row = 0; row < size; row++) {
02579     for (col = 0; col < size; col++) {
02580       U[row][col] = input[row*size + col];
02581       L[row][col] = row == col ? 1.0 : 0.0;
02582       U_inv[row][col] = 0.0;
02583     }
02584   }
02585 
02586   // Compute forward matrix by inversion by LU decomposition of input.
02587   for (col = 0; col < size; ++col) {
02588     // Find best pivot
02589     int best_row = 0;
02590     double best_pivot = -1.0;
02591     for (row = col; row < size; ++row) {
02592       if (Abs(U[row][col]) > best_pivot) {
02593         best_pivot = Abs(U[row][col]);
02594         best_row = row;
02595       }
02596     }
02597     // Exchange pivot rows.
02598     if (best_row != col) {
02599       for (int k = 0; k < size; ++k) {
02600         double tmp = U[best_row][k];
02601         U[best_row][k] = U[col][k];
02602         U[col][k] = tmp;
02603         tmp = L[best_row][k];
02604         L[best_row][k] = L[col][k];
02605         L[col][k] = tmp;
02606       }
02607     }
02608     // Now do the pivot itself.
02609     for (row = col + 1; row < size; ++row) {
02610       double ratio = -U[row][col] / U[col][col];
02611       for (int j = col; j < size; ++j) {
02612         U[row][j] += U[col][j] * ratio;
02613       }
02614       for (int k = 0; k < size; ++k) {
02615         L[row][k] += L[col][k] * ratio;
02616       }
02617     }
02618   }
02619   // Next invert U.
02620   for (col = 0; col < size; ++col) {
02621     U_inv[col][col] = 1.0 / U[col][col];
02622     for (row = col - 1; row >= 0; --row) {
02623       double total = 0.0;
02624       for (int k = col; k > row; --k) {
02625         total += U[row][k] * U_inv[k][col];
02626       }
02627       U_inv[row][col] = -total / U[row][row];
02628     }
02629   }
02630   // Now the answer is U_inv.L.
02631   for (row = 0; row < size; row++) {
02632     for (col = 0; col < size; col++) {
02633       double sum = 0.0;
02634       for (int k = row; k < size; ++k) {
02635         sum += U_inv[row][k] * L[k][col];
02636       }
02637       inv[row*size + col] = sum;
02638     }
02639   }
02640   // Check matrix product.
02641   double error_sum = 0.0;
02642   for (row = 0; row < size; row++) {
02643     for (col = 0; col < size; col++) {
02644       double sum = 0.0;
02645       for (int k = 0; k < size; ++k) {
02646         sum += input[row*size + k] * inv[k *size + col];
02647       }
02648       if (row != col) {
02649         error_sum += Abs(sum);
02650       }
02651     }
02652   }
02653   return error_sum;
02654 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines