001/*-
002 * Copyright 2016 Diamond Light Source Ltd.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the Eclipse Public License v1.0
006 * which accompanies this distribution, and is available at
007 * http://www.eclipse.org/legal/epl-v10.html
008 */
009
010package org.eclipse.january.dataset;
011
012/**
013 * Estimators of the scale of a Dataset.
014 * <p>
015 * A class of static methods to produce estimations of the scale of variation within a Dataset.
016 * The available estimators are:
017 * <ul>
018 * <li> Median Absolute Deviation </li>
019 * <li> S<sub>n</sub> of Croux and Rousseeuw (1992).</li>
020 * </ul> 
021 * <p>
022 * Croux, C. and P. J. Rousseeuw, "Time-efficient algorithms for two highly robust estimators of scale", Computational Statistics, Volume 1, eds. Y. Dodge and J.Whittaker, Physica-Verlag, Heidelberg, pp411--428 (1992).
023 */
024public class Outliers {
025
026        private final static double MADSCALEFACTOR = 1.4826;
027        private final static double SNSCALEFACTOR = 1.1926;
028        
029        /**
030         * Returns the Median Absolute Deviation (MAD) and the median. 
031         * @param data
032         *                      The data for which the median and the MAD are to be calculated
033         * @return A two-element array of doubles, consisting of the MAD and the median of the data
034         */
035        public static double[] medianAbsoluteDeviation(Dataset data) {
036                
037                double median = (Double)Stats.median(data);
038                data = Maths.subtract(data, median);
039                data = Maths.abs(data);
040                double median2 = (Double)Stats.median(data);
041                double mad = MADSCALEFACTOR * median2;
042                
043                return new double[]{mad, median};
044        }
045        
046        /**
047         * Returns the Sn estimator of Croux and Rousseeuw.
048         * <p>
049         * This is the simple O(n²) version of the calculation algorithm.
050         * @param data
051         *                      The data for which the estimator is to be calculated.
052         * @return The value of the Sn estimator for the data
053         */
054        public static double snNaive(Dataset data) {
055                
056                Dataset medAbs = DatasetFactory.zeros(data);
057                Dataset dif = DatasetFactory.zeros(data);
058                
059                IndexIterator it = data.getIterator();
060                int count = 0;
061                while (it.hasNext()) {
062                        double val = data.getElementDoubleAbs(it.index);
063                        Maths.subtract(data, val, dif);
064                        Maths.abs(dif,dif);
065                        //Lower median - Math.floor((n/2)+1) of sorted
066                        dif.sort(null);
067                        medAbs.setObjectAbs(count++, lowMed(dif));
068                }
069                
070                
071                //Higher median - Math.floor((n+1)/2) of sorted
072                medAbs.sort(null);
073                double median = highMed(medAbs);
074                
075                return median * SNSCALEFACTOR;
076        }
077        
078        /**
079         * Returns the Sn estimator of Croux and Rousseeuw.
080         * <p>
081         * This is the complex O(nlog n) version of the calculation algorithm.
082         * @param data
083         *                      The data for which the estimator is to be calculated.
084         * @return The value of the Sn estimator for the data
085         */
086        public static double snFast(Dataset data) {
087                
088                Dataset sorted = data.clone();
089                sorted.sort(null);
090                
091                Dataset medAbs = DatasetFactory.zeros(data);
092                
093                IndexIterator it = data.getIterator();
094                int count = 0;
095                while (it.hasNext()) {
096                        MedianOfTwoSortedSets snuff = new MedianForSn(sorted, it.index);
097                        medAbs.setObjectAbs(count++, snuff.get());
098                }
099                
100                
101                //Higher median - Math.floor((n+1)/2) of sorted
102                medAbs.sort(null);
103                double median = highMed(medAbs);
104                
105                return median * SNSCALEFACTOR;
106        }
107        
108        /**
109         * Returns the lomed
110         * <p>
111         * Returns the lomed (low median) of a sorted Dataset.
112         * @param data
113         *                      A sorted Dataset for which the low median is to be calculated.  
114         * @return
115         *              The value of the lomed of the data
116         */
117        public static double lowMed(Dataset data) {
118                return data.getElementDoubleAbs((int)Math.floor((data.getSize()/2)));
119        }
120        
121        /**
122         * Returns the himed
123         * <p>
124         * Returns the himed (high median) of a sorted Dataset.
125         * @param data
126         *                      A sorted Dataset for which the low median is to be calculated.  
127         * @return
128         *              The value of the himed of the data
129         */
130        public static double highMed(Dataset data) {
131                return data.getElementDoubleAbs((int)Math.floor((data.getSize()+1)/2-1));
132        }
133
134        /**
135         * Calculates the overall median of two double arrays
136         * @param a
137         * @param b
138         *                      the two arrays for which the overall median is desired. 
139         * @return the overall median of the two arrays
140         */
141        public static double medianOFTwoPrimitiveArrays (double[] a, double[] b) {
142                MedianOfTwoArrays medio = new MedianOfTwoArrays(a, b);
143                return medio.get();
144        }
145}
146
147/**
148 * Allows the calculation of the median of two arrays.
149 * <p>
150 * Subclasses must implement getA() and getB() to return the elements of A or B
151 * at the given index. The length of a must be less than or equal to that of b.
152 * The constructor must set the sizes nA and nB.
153 */
154abstract class MedianOfTwoSortedSets{
155        int nB, nA, diff, diffLeft;
156        
157        public final double get() {
158                // Initialize the left and right markers for the set of candidate 
159                // values. These are inclusive on both left and right.
160                int leftA = 0, leftB = 0;
161                @SuppressWarnings("unused") // keep rightA for symmetry
162                int rightA = nB-1, rightB = nB-1;
163                
164                while (nB > 1) {
165                        // For 0-based indexing, the lomed is the element at floor((n+1)/2)-1
166                        int medianIndex = (int) Math.floor((nB+1)/2)-1;
167                        int medianAIndex = leftA + medianIndex,
168                                        medianBIndex = leftB + medianIndex;
169                        double medA = getAm(medianAIndex),
170                                        medB = getBm(medianBIndex);
171
172                        int smallerShift = 0;
173                        if (nB % 2 == 0) {
174                                // N even: the smaller lomed, as well as anything smaller than it, cannot be the overall median
175                                smallerShift = +1;
176                        }
177                        
178                        if (medA >= medB) {
179                                rightA = medianAIndex;
180                                leftB = medianBIndex + smallerShift;
181                        } else {
182                                rightB = medianBIndex;
183                                leftA = medianAIndex + smallerShift;
184                        }
185                        
186                        // Different lengths
187                        // It should be floor((l_m-1 + 1)/2))
188                        // this is newLength, defined above
189                        // Difference between left and right
190                        nB = rightB - leftB + 1;
191                }
192
193                // when the array is length 1, right and left will be the same.
194                // The lomed of a two element array is the smaller of the two
195                return Math.min(getAm(leftA), getBm(leftB));
196        }
197        
198        // Get the value in the expanded array
199        private double getAm(int i) {
200                int firstElement = diffLeft,
201                                lastElement = diffLeft + nA - 1;
202                if (i < firstElement) {
203                        return Double.NEGATIVE_INFINITY;
204                } else if (i > lastElement) {
205                        return Double.POSITIVE_INFINITY;
206                } else {
207                        return getA(i - diffLeft);
208                }
209        }
210        
211        private double getBm(int i) {
212                return getB(i);
213        }
214
215        // Get the values in the original arrays
216        protected abstract double getA(int i);  
217        protected abstract double getB(int i);
218        
219        // Call this to set up the length difference variables. 
220        protected void setDiffs() {
221                diff = nB - nA;
222                diffLeft = diff/2;
223        }
224}
225
226class MedianOfTwoArrays extends MedianOfTwoSortedSets {
227
228        double[] a, b;
229
230        public MedianOfTwoArrays(double[] ain, double[] bin) {
231                if (bin.length >= ain.length) {
232                        this.a = ain;
233                        nA = ain.length;
234                        this.b = bin;
235                        nB = bin.length;
236                } else {
237                        this.a = bin;
238                        nA = bin.length;
239                        this.b = ain;
240                        nB = ain.length;
241                }
242                setDiffs();
243        }
244        @Override
245        protected double getA(int i) {
246                return a[i];
247        }
248        @Override
249        protected double getB(int i) {
250                return b[i];
251        }
252}
253
254class MedianForSn extends MedianOfTwoSortedSets {
255        Dataset xj;
256        int referenceIndex;
257        boolean lowerIsBigger;
258
259        public MedianForSn(Dataset xj, int referenceIndex) {
260                this.xj = xj;
261                this.referenceIndex = referenceIndex;
262
263                // determine which of the two halves of the array is larger
264                int lowerSize = referenceIndex, upperSize = xj.getSize() - referenceIndex - 1;
265                lowerIsBigger = lowerSize > upperSize;
266
267                // Set the array sizes
268                if (lowerIsBigger) {
269                        nA = upperSize;
270                        nB = lowerSize;
271                } else {
272                        nA = lowerSize;
273                        nB = upperSize;
274                }
275                setDiffs();
276        }
277
278        @Override
279        protected double getA(int i) {
280                return (!lowerIsBigger) ? getLower(i) : getUpper(i);
281        }
282        @Override
283        protected double getB(int i) {
284                return (!lowerIsBigger) ? getUpper(i) : getLower(i);
285        }
286        
287        private double getLower(int i) {
288                return xj.getDouble(referenceIndex) - xj.getDouble(referenceIndex - 1 - i);
289        }
290        private double getUpper(int i) {
291                return xj.getDouble(i + referenceIndex + 1) - xj.getDouble(referenceIndex);
292        }
293}