histogram/
config.rs

1use crate::Error;
2use core::ops::RangeInclusive;
3
4#[cfg(feature = "serde")]
5use serde::{Deserialize, Serialize};
6
7/// The configuration of a histogram which determines the bucketing strategy and
8/// therefore the relative error and memory utilization of a histogram.
9/// * `grouping_power` - controls the number of buckets that are used to span
10///   consecutive powers of two. Lower values result in less memory usage since
11///   fewer buckets will be created. However, this will result in larger
12///   relative error as each bucket represents a wider range of values.
13/// * `max_value_power` - controls the largest value which can be stored in the
14///   histogram. `2^(max_value_power) - 1` is the inclusive upper bound for the
15///   representable range of values.
16///
17/// # How to choose parameters for your data
18/// Please see <https://observablehq.com/@iopsystems/h2histogram> for an
19/// in-depth discussion about the bucketing strategy and an interactive
20/// calculator that lets you explore how these parameters result in histograms
21/// with varying error guarantees and memory utilization requirements.
22///
23/// # The short version
24/// ## Grouping Power
25/// `grouping_power` should be set such that `2^(-1 * grouping_power)` is an
26/// acceptable relative error. Rephrased, we can plug-in the acceptable
27/// relative error into `grouping_power = ceil(log2(1/e))`. For example, if we
28/// want to limit the error to 0.1% (0.001) we should set `grouping_power = 7`.
29///
30/// ## Max Value Power
31/// `max_value_power` should be the closest power of 2 that is larger than the
32/// largest value you expect in your data. If your only guarantee is that the
33/// values are all `u64`, then setting this to `64` may be reasonable if you
34/// can tolerate a bit of relative error.
35///
36/// ## Resulting size
37///
38/// If we want to allow any value in a range of unsigned types, the amount of
39/// memory for the histogram is approximately:
40///
41/// | power | error |     u16 |     u32 |     u64 |
42/// |-------|-------|---------|---------|---------|
43/// |     2 |   25% | 0.6 KiB |   1 KiB |   2 KiB |
44/// |     3 | 12.5% |   1 KiB |   2 KiB |   4 KiB |
45/// |     4 | 6.25% |   2 KiB |   4 KiB |   8 KiB |
46/// |     5 | 3.13% |   3 KiB |   7 KiB |  15 KiB |
47/// |     6 | 1.56% |   6 KiB |  14 KiB |  30 KiB |
48/// |     7 | .781% |  10 KiB |  26 KiB |  58 KiB |
49/// |     8 | .391% |  18 KiB |  50 KiB | 114 KiB |
50/// |     9 | .195% |  32 KiB |  96 KiB | 224 KiB |
51/// |    10 | .098% |  56 KiB | 184 KiB | 440 KiB |
52/// |    11 | .049% |  96 KiB | 352 KiB | 864 KiB |
53/// |    12 | .025% | 160 KiB | 672 KiB | 1.7 MiB |
54///
55/// # Constraints:
56/// * `max_value_power` must be in the range `0..=64`
57/// * `max_value_power` must be greater than `grouping_power
58#[derive(Clone, Copy, Debug, PartialEq, Eq)]
59#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
60#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
61pub struct Config {
62    max: u64,
63    grouping_power: u8,
64    max_value_power: u8,
65    cutoff_power: u8,
66    cutoff_value: u64,
67    lower_bin_count: u32,
68    upper_bin_divisions: u32,
69    upper_bin_count: u32,
70}
71
72impl Config {
73    /// Create a new histogram `Config` from the parameters. See the struct
74    /// documentation [`crate::Config`] for the meaning of the parameters and
75    /// their constraints.
76    pub const fn new(grouping_power: u8, max_value_power: u8) -> Result<Self, Error> {
77        // we only allow values up to 2^64
78        if max_value_power > 64 {
79            return Err(Error::MaxPowerTooHigh);
80        }
81
82        // check that the other parameters make sense together
83        if grouping_power >= max_value_power {
84            return Err(Error::MaxPowerTooLow);
85        }
86
87        // the cutoff is the point at which the linear range divisions and the
88        // logarithmic range subdivisions diverge.
89        //
90        // for example:
91        // when a = 0, the linear range has bins with width 1.
92        // if b = 7 the logarithmic range has 128 subdivisions.
93        // this means that for 0..128 we must be representing the values exactly
94        // but we also represent 128..256 exactly since the subdivisions divide
95        // that range into bins with the same width as the linear portion.
96        //
97        // therefore our cutoff power = a + b + 1
98
99        // note: because a + b must be less than n which is a u8, a + b + 1 must
100        // be less than or equal to u8::MAX. This means our cutoff power will
101        // always fit in a u8
102        let cutoff_power = grouping_power + 1;
103        let cutoff_value = 2_u64.pow(cutoff_power as u32);
104        let lower_bin_width = 2_u32.pow(0);
105        let upper_bin_divisions = 2_u32.pow(grouping_power as u32);
106
107        let max = if max_value_power == 64 {
108            u64::MAX
109        } else {
110            2_u64.pow(max_value_power as u32)
111        };
112
113        let lower_bin_count = (cutoff_value / lower_bin_width as u64) as u32;
114        let upper_bin_count = (max_value_power - cutoff_power) as u32 * upper_bin_divisions;
115
116        Ok(Self {
117            max,
118            grouping_power,
119            max_value_power,
120            cutoff_power,
121            cutoff_value,
122            lower_bin_count,
123            upper_bin_divisions,
124            upper_bin_count,
125        })
126    }
127
128    /// Returns the grouping power that was used to create this configuration.
129    pub const fn grouping_power(&self) -> u8 {
130        self.grouping_power
131    }
132
133    /// Returns the max value power that was used to create this configuration.
134    pub const fn max_value_power(&self) -> u8 {
135        self.max_value_power
136    }
137
138    /// Returns the relative error (in percentage) of this configuration. This
139    /// only applies to the logarithmic bins of the histogram (linear bins have
140    /// a width of 1 and no error). For histograms with no logarithmic bins,
141    /// error for the entire histogram is zero.
142    pub fn error(&self) -> f64 {
143        match self.grouping_power == self.max_value_power - 1 {
144            true => 0.0,
145            false => 100.0 / 2_u64.pow(self.grouping_power as u32) as f64,
146        }
147    }
148
149    /// Return the total number of buckets needed for this config.
150    pub const fn total_buckets(&self) -> usize {
151        (self.lower_bin_count + self.upper_bin_count) as usize
152    }
153
154    /// Converts a value to a bucket index. Returns an error if the value is
155    /// outside of the range for the config.
156    pub(crate) fn value_to_index(&self, value: u64) -> Result<usize, Error> {
157        if value < self.cutoff_value {
158            return Ok(value as usize);
159        }
160
161        if value > self.max {
162            return Err(Error::OutOfRange);
163        }
164
165        let power = 63 - value.leading_zeros();
166        let log_bin = power - self.cutoff_power as u32;
167        let offset = (value - (1 << power)) >> (power - self.grouping_power as u32);
168
169        Ok((self.lower_bin_count + log_bin * self.upper_bin_divisions + offset as u32) as usize)
170    }
171
172    /// Convert a bucket index to a lower bound.
173    pub(crate) fn index_to_lower_bound(&self, index: usize) -> u64 {
174        let g = index as u64 >> self.grouping_power;
175        let h = index as u64 - g * (1 << self.grouping_power);
176
177        if g < 1 {
178            h
179        } else {
180            (1 << (self.grouping_power as u64 + g - 1)) + (1 << (g - 1)) * h
181        }
182    }
183
184    /// Convert a bucket index to a upper inclusive bound.
185    pub(crate) fn index_to_upper_bound(&self, index: usize) -> u64 {
186        if index as u32 == self.lower_bin_count + self.upper_bin_count - 1 {
187            return self.max;
188        }
189        let g = index as u64 >> self.grouping_power;
190        let h = index as u64 - g * (1 << self.grouping_power) + 1;
191
192        if g < 1 {
193            h - 1
194        } else {
195            (1 << (self.grouping_power as u64 + g - 1)) + (1 << (g - 1)) * h - 1
196        }
197    }
198
199    /// Convert a bucket index to a range.
200    pub(crate) fn index_to_range(&self, index: usize) -> RangeInclusive<u64> {
201        self.index_to_lower_bound(index)..=self.index_to_upper_bound(index)
202    }
203}
204
205#[cfg(test)]
206mod tests {
207    use super::*;
208
209    #[cfg(target_pointer_width = "64")]
210    #[test]
211    fn sizes() {
212        assert_eq!(std::mem::size_of::<Config>(), 32);
213    }
214
215    #[test]
216    // Test that the number of buckets matches the expected count
217    fn total_buckets() {
218        let config = Config::new(2, 64).unwrap();
219        assert_eq!(config.total_buckets(), 252);
220
221        let config = Config::new(7, 64).unwrap();
222        assert_eq!(config.total_buckets(), 7424);
223
224        let config = Config::new(14, 64).unwrap();
225        assert_eq!(config.total_buckets(), 835_584);
226
227        let config = Config::new(2, 4).unwrap();
228        assert_eq!(config.total_buckets(), 12);
229    }
230
231    #[test]
232    // Test value to index conversions
233    fn value_to_idx() {
234        let config = Config::new(7, 64).unwrap();
235        assert_eq!(config.value_to_index(0), Ok(0));
236        assert_eq!(config.value_to_index(1), Ok(1));
237        assert_eq!(config.value_to_index(256), Ok(256));
238        assert_eq!(config.value_to_index(257), Ok(256));
239        assert_eq!(config.value_to_index(258), Ok(257));
240        assert_eq!(config.value_to_index(512), Ok(384));
241        assert_eq!(config.value_to_index(515), Ok(384));
242        assert_eq!(config.value_to_index(516), Ok(385));
243        assert_eq!(config.value_to_index(1024), Ok(512));
244        assert_eq!(config.value_to_index(1031), Ok(512));
245        assert_eq!(config.value_to_index(1032), Ok(513));
246        assert_eq!(config.value_to_index(u64::MAX - 1), Ok(7423));
247        assert_eq!(config.value_to_index(u64::MAX), Ok(7423));
248    }
249
250    #[test]
251    // Test index to lower bound conversion
252    fn idx_to_lower_bound() {
253        let config = Config::new(7, 64).unwrap();
254        assert_eq!(config.index_to_lower_bound(0), 0);
255        assert_eq!(config.index_to_lower_bound(1), 1);
256        assert_eq!(config.index_to_lower_bound(256), 256);
257        assert_eq!(config.index_to_lower_bound(384), 512);
258        assert_eq!(config.index_to_lower_bound(512), 1024);
259        assert_eq!(
260            config.index_to_lower_bound(7423),
261            18_374_686_479_671_623_680
262        );
263    }
264
265    #[test]
266    // Test index to upper bound conversion
267    fn idx_to_upper_bound() {
268        let config = Config::new(7, 64).unwrap();
269        assert_eq!(config.index_to_upper_bound(0), 0);
270        assert_eq!(config.index_to_upper_bound(1), 1);
271        assert_eq!(config.index_to_upper_bound(256), 257);
272        assert_eq!(config.index_to_upper_bound(384), 515);
273        assert_eq!(config.index_to_upper_bound(512), 1031);
274        assert_eq!(config.index_to_upper_bound(7423), u64::MAX);
275    }
276
277    #[test]
278    // Test index to range conversion
279    fn idx_to_range() {
280        let config = Config::new(7, 64).unwrap();
281        assert_eq!(config.index_to_range(0), 0..=0);
282        assert_eq!(config.index_to_range(1), 1..=1);
283        assert_eq!(config.index_to_range(256), 256..=257);
284        assert_eq!(config.index_to_range(384), 512..=515);
285        assert_eq!(config.index_to_range(512), 1024..=1031);
286        assert_eq!(
287            config.index_to_range(7423),
288            18_374_686_479_671_623_680..=u64::MAX
289        );
290    }
291}