001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.math3.stat.inference; 018 019 import org.apache.commons.math3.distribution.FDistribution; 020 import org.apache.commons.math3.exception.ConvergenceException; 021 import org.apache.commons.math3.exception.DimensionMismatchException; 022 import org.apache.commons.math3.exception.MaxCountExceededException; 023 import org.apache.commons.math3.exception.NullArgumentException; 024 import org.apache.commons.math3.exception.OutOfRangeException; 025 import org.apache.commons.math3.exception.util.LocalizedFormats; 026 import org.apache.commons.math3.stat.descriptive.summary.Sum; 027 import org.apache.commons.math3.stat.descriptive.summary.SumOfSquares; 028 029 import java.util.Collection; 030 031 /** 032 * Implements one-way ANOVA (analysis of variance) statistics. 033 * 034 * <p> Tests for differences between two or more categories of univariate data 035 * (for example, the body mass index of accountants, lawyers, doctors and 036 * computer programmers). When two categories are given, this is equivalent to 037 * the {@link org.apache.commons.math3.stat.inference.TTest}. 038 * </p><p> 039 * Uses the {@link org.apache.commons.math3.distribution.FDistribution 040 * commons-math F Distribution implementation} to estimate exact p-values.</p> 041 * <p>This implementation is based on a description at 042 * http://faculty.vassar.edu/lowry/ch13pt1.html</p> 043 * <pre> 044 * Abbreviations: bg = between groups, 045 * wg = within groups, 046 * ss = sum squared deviations 047 * </pre> 048 * 049 * @since 1.2 050 * @version $Id: OneWayAnova.java 1416643 2012-12-03 19:37:14Z tn $ 051 */ 052 public class OneWayAnova { 053 054 /** 055 * Default constructor. 056 */ 057 public OneWayAnova() { 058 } 059 060 /** 061 * Computes the ANOVA F-value for a collection of <code>double[]</code> 062 * arrays. 063 * 064 * <p><strong>Preconditions</strong>: <ul> 065 * <li>The categoryData <code>Collection</code> must contain 066 * <code>double[]</code> arrays.</li> 067 * <li> There must be at least two <code>double[]</code> arrays in the 068 * <code>categoryData</code> collection and each of these arrays must 069 * contain at least two values.</li></ul></p><p> 070 * This implementation computes the F statistic using the definitional 071 * formula<pre> 072 * F = msbg/mswg</pre> 073 * where<pre> 074 * msbg = between group mean square 075 * mswg = within group mean square</pre> 076 * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html"> 077 * here</a></p> 078 * 079 * @param categoryData <code>Collection</code> of <code>double[]</code> 080 * arrays each containing data for one category 081 * @return Fvalue 082 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 083 * @throws DimensionMismatchException if the length of the <code>categoryData</code> 084 * array is less than 2 or a contained <code>double[]</code> array does not have 085 * at least two values 086 */ 087 public double anovaFValue(final Collection<double[]> categoryData) 088 throws NullArgumentException, DimensionMismatchException { 089 090 AnovaStats a = anovaStats(categoryData); 091 return a.F; 092 093 } 094 095 /** 096 * Computes the ANOVA P-value for a collection of <code>double[]</code> 097 * arrays. 098 * 099 * <p><strong>Preconditions</strong>: <ul> 100 * <li>The categoryData <code>Collection</code> must contain 101 * <code>double[]</code> arrays.</li> 102 * <li> There must be at least two <code>double[]</code> arrays in the 103 * <code>categoryData</code> collection and each of these arrays must 104 * contain at least two values.</li></ul></p><p> 105 * This implementation uses the 106 * {@link org.apache.commons.math3.distribution.FDistribution 107 * commons-math F Distribution implementation} to estimate the exact 108 * p-value, using the formula<pre> 109 * p = 1 - cumulativeProbability(F)</pre> 110 * where <code>F</code> is the F value and <code>cumulativeProbability</code> 111 * is the commons-math implementation of the F distribution.</p> 112 * 113 * @param categoryData <code>Collection</code> of <code>double[]</code> 114 * arrays each containing data for one category 115 * @return Pvalue 116 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 117 * @throws DimensionMismatchException if the length of the <code>categoryData</code> 118 * array is less than 2 or a contained <code>double[]</code> array does not have 119 * at least two values 120 * @throws ConvergenceException if the p-value can not be computed due to a convergence error 121 * @throws MaxCountExceededException if the maximum number of iterations is exceeded 122 */ 123 public double anovaPValue(final Collection<double[]> categoryData) 124 throws NullArgumentException, DimensionMismatchException, 125 ConvergenceException, MaxCountExceededException { 126 127 AnovaStats a = anovaStats(categoryData); 128 // No try-catch or advertised exception because args are valid 129 FDistribution fdist = new FDistribution(a.dfbg, a.dfwg); 130 return 1.0 - fdist.cumulativeProbability(a.F); 131 132 } 133 134 /** 135 * Performs an ANOVA test, evaluating the null hypothesis that there 136 * is no difference among the means of the data categories. 137 * 138 * <p><strong>Preconditions</strong>: <ul> 139 * <li>The categoryData <code>Collection</code> must contain 140 * <code>double[]</code> arrays.</li> 141 * <li> There must be at least two <code>double[]</code> arrays in the 142 * <code>categoryData</code> collection and each of these arrays must 143 * contain at least two values.</li> 144 * <li>alpha must be strictly greater than 0 and less than or equal to 0.5. 145 * </li></ul></p><p> 146 * This implementation uses the 147 * {@link org.apache.commons.math3.distribution.FDistribution 148 * commons-math F Distribution implementation} to estimate the exact 149 * p-value, using the formula<pre> 150 * p = 1 - cumulativeProbability(F)</pre> 151 * where <code>F</code> is the F value and <code>cumulativeProbability</code> 152 * is the commons-math implementation of the F distribution.</p> 153 * <p>True is returned iff the estimated p-value is less than alpha.</p> 154 * 155 * @param categoryData <code>Collection</code> of <code>double[]</code> 156 * arrays each containing data for one category 157 * @param alpha significance level of the test 158 * @return true if the null hypothesis can be rejected with 159 * confidence 1 - alpha 160 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 161 * @throws DimensionMismatchException if the length of the <code>categoryData</code> 162 * array is less than 2 or a contained <code>double[]</code> array does not have 163 * at least two values 164 * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5] 165 * @throws ConvergenceException if the p-value can not be computed due to a convergence error 166 * @throws MaxCountExceededException if the maximum number of iterations is exceeded 167 */ 168 public boolean anovaTest(final Collection<double[]> categoryData, 169 final double alpha) 170 throws NullArgumentException, DimensionMismatchException, 171 OutOfRangeException, ConvergenceException, MaxCountExceededException { 172 173 if ((alpha <= 0) || (alpha > 0.5)) { 174 throw new OutOfRangeException( 175 LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, 176 alpha, 0, 0.5); 177 } 178 return anovaPValue(categoryData) < alpha; 179 180 } 181 182 /** 183 * This method actually does the calculations (except P-value). 184 * 185 * @param categoryData <code>Collection</code> of <code>double[]</code> 186 * arrays each containing data for one category 187 * @return computed AnovaStats 188 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 189 * @throws DimensionMismatchException if the length of the <code>categoryData</code> 190 * array is less than 2 or a contained <code>double[]</code> array does not contain 191 * at least two values 192 */ 193 private AnovaStats anovaStats(final Collection<double[]> categoryData) 194 throws NullArgumentException, DimensionMismatchException { 195 196 if (categoryData == null) { 197 throw new NullArgumentException(); 198 } 199 200 // check if we have enough categories 201 if (categoryData.size() < 2) { 202 throw new DimensionMismatchException( 203 LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED, 204 categoryData.size(), 2); 205 } 206 207 // check if each category has enough data and all is double[] 208 for (double[] array : categoryData) { 209 if (array.length <= 1) { 210 throw new DimensionMismatchException( 211 LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED, 212 array.length, 2); 213 } 214 } 215 216 int dfwg = 0; 217 double sswg = 0; 218 Sum totsum = new Sum(); 219 SumOfSquares totsumsq = new SumOfSquares(); 220 int totnum = 0; 221 222 for (double[] data : categoryData) { 223 224 Sum sum = new Sum(); 225 SumOfSquares sumsq = new SumOfSquares(); 226 int num = 0; 227 228 for (int i = 0; i < data.length; i++) { 229 double val = data[i]; 230 231 // within category 232 num++; 233 sum.increment(val); 234 sumsq.increment(val); 235 236 // for all categories 237 totnum++; 238 totsum.increment(val); 239 totsumsq.increment(val); 240 } 241 dfwg += num - 1; 242 double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num; 243 sswg += ss; 244 } 245 double sst = totsumsq.getResult() - totsum.getResult() * 246 totsum.getResult()/totnum; 247 double ssbg = sst - sswg; 248 int dfbg = categoryData.size() - 1; 249 double msbg = ssbg/dfbg; 250 double mswg = sswg/dfwg; 251 double F = msbg/mswg; 252 253 return new AnovaStats(dfbg, dfwg, F); 254 } 255 256 /** 257 Convenience class to pass dfbg,dfwg,F values around within OneWayAnova. 258 No get/set methods provided. 259 */ 260 private static class AnovaStats { 261 262 /** Degrees of freedom in numerator (between groups). */ 263 private final int dfbg; 264 265 /** Degrees of freedom in denominator (within groups). */ 266 private final int dfwg; 267 268 /** Statistic. */ 269 private final double F; 270 271 /** 272 * Constructor 273 * @param dfbg degrees of freedom in numerator (between groups) 274 * @param dfwg degrees of freedom in denominator (within groups) 275 * @param F statistic 276 */ 277 private AnovaStats(int dfbg, int dfwg, double F) { 278 this.dfbg = dfbg; 279 this.dfwg = dfwg; 280 this.F = F; 281 } 282 } 283 284 }