001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.math3.distribution; 019 020 import org.apache.commons.math3.exception.NotStrictlyPositiveException; 021 import org.apache.commons.math3.exception.util.LocalizedFormats; 022 import org.apache.commons.math3.util.FastMath; 023 import org.apache.commons.math3.random.RandomGenerator; 024 import org.apache.commons.math3.random.Well19937c; 025 026 /** 027 * Implementation of the Zipf distribution. 028 * 029 * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a> 030 * @version $Id: ZipfDistribution.java 1416643 2012-12-03 19:37:14Z tn $ 031 */ 032 public class ZipfDistribution extends AbstractIntegerDistribution { 033 /** Serializable version identifier. */ 034 private static final long serialVersionUID = -140627372283420404L; 035 /** Number of elements. */ 036 private final int numberOfElements; 037 /** Exponent parameter of the distribution. */ 038 private final double exponent; 039 /** Cached numerical mean */ 040 private double numericalMean = Double.NaN; 041 /** Whether or not the numerical mean has been calculated */ 042 private boolean numericalMeanIsCalculated = false; 043 /** Cached numerical variance */ 044 private double numericalVariance = Double.NaN; 045 /** Whether or not the numerical variance has been calculated */ 046 private boolean numericalVarianceIsCalculated = false; 047 048 /** 049 * Create a new Zipf distribution with the given number of elements and 050 * exponent. 051 * 052 * @param numberOfElements Number of elements. 053 * @param exponent Exponent. 054 * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} 055 * or {@code exponent <= 0}. 056 */ 057 public ZipfDistribution(final int numberOfElements, final double exponent) { 058 this(new Well19937c(), numberOfElements, exponent); 059 } 060 061 /** 062 * Creates a Zipf distribution. 063 * 064 * @param rng Random number generator. 065 * @param numberOfElements Number of elements. 066 * @param exponent Exponent. 067 * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} 068 * or {@code exponent <= 0}. 069 * @since 3.1 070 */ 071 public ZipfDistribution(RandomGenerator rng, 072 int numberOfElements, 073 double exponent) 074 throws NotStrictlyPositiveException { 075 super(rng); 076 077 if (numberOfElements <= 0) { 078 throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION, 079 numberOfElements); 080 } 081 if (exponent <= 0) { 082 throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT, 083 exponent); 084 } 085 086 this.numberOfElements = numberOfElements; 087 this.exponent = exponent; 088 } 089 090 /** 091 * Get the number of elements (e.g. corpus size) for the distribution. 092 * 093 * @return the number of elements 094 */ 095 public int getNumberOfElements() { 096 return numberOfElements; 097 } 098 099 /** 100 * Get the exponent characterizing the distribution. 101 * 102 * @return the exponent 103 */ 104 public double getExponent() { 105 return exponent; 106 } 107 108 /** {@inheritDoc} */ 109 public double probability(final int x) { 110 if (x <= 0 || x > numberOfElements) { 111 return 0.0; 112 } 113 114 return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent); 115 } 116 117 /** {@inheritDoc} */ 118 public double cumulativeProbability(final int x) { 119 if (x <= 0) { 120 return 0.0; 121 } else if (x >= numberOfElements) { 122 return 1.0; 123 } 124 125 return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent); 126 } 127 128 /** 129 * {@inheritDoc} 130 * 131 * For number of elements {@code N} and exponent {@code s}, the mean is 132 * {@code Hs1 / Hs}, where 133 * <ul> 134 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> 135 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> 136 * </ul> 137 */ 138 public double getNumericalMean() { 139 if (!numericalMeanIsCalculated) { 140 numericalMean = calculateNumericalMean(); 141 numericalMeanIsCalculated = true; 142 } 143 return numericalMean; 144 } 145 146 /** 147 * Used by {@link #getNumericalMean()}. 148 * 149 * @return the mean of this distribution 150 */ 151 protected double calculateNumericalMean() { 152 final int N = getNumberOfElements(); 153 final double s = getExponent(); 154 155 final double Hs1 = generalizedHarmonic(N, s - 1); 156 final double Hs = generalizedHarmonic(N, s); 157 158 return Hs1 / Hs; 159 } 160 161 /** 162 * {@inheritDoc} 163 * 164 * For number of elements {@code N} and exponent {@code s}, the mean is 165 * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where 166 * <ul> 167 * <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li> 168 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> 169 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> 170 * </ul> 171 */ 172 public double getNumericalVariance() { 173 if (!numericalVarianceIsCalculated) { 174 numericalVariance = calculateNumericalVariance(); 175 numericalVarianceIsCalculated = true; 176 } 177 return numericalVariance; 178 } 179 180 /** 181 * Used by {@link #getNumericalVariance()}. 182 * 183 * @return the variance of this distribution 184 */ 185 protected double calculateNumericalVariance() { 186 final int N = getNumberOfElements(); 187 final double s = getExponent(); 188 189 final double Hs2 = generalizedHarmonic(N, s - 2); 190 final double Hs1 = generalizedHarmonic(N, s - 1); 191 final double Hs = generalizedHarmonic(N, s); 192 193 return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs)); 194 } 195 196 /** 197 * Calculates the Nth generalized harmonic number. See 198 * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic 199 * Series</a>. 200 * 201 * @param n Term in the series to calculate (must be larger than 1) 202 * @param m Exponent (special case {@code m = 1} is the harmonic series). 203 * @return the n<sup>th</sup> generalized harmonic number. 204 */ 205 private double generalizedHarmonic(final int n, final double m) { 206 double value = 0; 207 for (int k = n; k > 0; --k) { 208 value += 1.0 / FastMath.pow(k, m); 209 } 210 return value; 211 } 212 213 /** 214 * {@inheritDoc} 215 * 216 * The lower bound of the support is always 1 no matter the parameters. 217 * 218 * @return lower bound of the support (always 1) 219 */ 220 public int getSupportLowerBound() { 221 return 1; 222 } 223 224 /** 225 * {@inheritDoc} 226 * 227 * The upper bound of the support is the number of elements. 228 * 229 * @return upper bound of the support 230 */ 231 public int getSupportUpperBound() { 232 return getNumberOfElements(); 233 } 234 235 /** 236 * {@inheritDoc} 237 * 238 * The support of this distribution is connected. 239 * 240 * @return {@code true} 241 */ 242 public boolean isSupportConnected() { 243 return true; 244 } 245 } 246