Basic Aggregates (24 Functions)
Single-column aggregate functions. Used in the form SELECT stat_xxx(column) FROM table. Available in all contexts where SQLite3 standard aggregate functions can be used, including GROUP BY, HAVING, subqueries, etc.
Basic Statistics
stat_mean
Computes the arithmetic mean.
$$\bar{x} = \frac{1}{n}\sum_{i=1}^{n} x_i$$
Syntax: stat_mean(column)
-- Basic usage
SELECT stat_mean(score) FROM students;
-- Average salary by department
SELECT department, stat_mean(salary) AS avg_salary
FROM employees
GROUP BY department;
-- Calculate difference from mean using subquery
SELECT name, score,
score - (SELECT stat_mean(score) FROM students) AS diff_from_mean
FROM students;
-- Extract groups with mean above threshold using HAVING
SELECT class, stat_mean(score) AS avg_score
FROM students
GROUP BY class
HAVING stat_mean(score) >= 70;
-- Comparison with SQLite3 built-in AVG()
SELECT stat_mean(val) AS stat_mean_result,
AVG(val) AS sqlite_avg_result
FROM data;
-- → Both return the same result
stat_median
Computes the median. The middle value when data is sorted. For an even number of elements, returns the average of the two middle values.
Syntax: stat_median(column)
-- Basic usage
SELECT stat_median(price) FROM products;
-- Compare mean and median to check distribution skewness
SELECT stat_mean(income) AS mean_income,
stat_median(income) AS median_income
FROM households;
-- mean > median → right-skewed distribution (influence of high earners)
-- mean ≈ median → approximately symmetric distribution
-- Median by category
SELECT region,
stat_median(sales) AS median_sales
FROM store_data
GROUP BY region
ORDER BY median_sales DESC;
-- Median as a representative value robust to outliers
-- Example: [1, 2, 3, 4, 1000] → mean=202, median=3
CREATE TABLE outlier_data (val REAL);
INSERT INTO outlier_data VALUES (1),(2),(3),(4),(1000);
SELECT stat_mean(val) AS mean, -- 202.0
stat_median(val) AS median -- 3.0
FROM outlier_data;
stat_mode
Returns the mode (the most frequently occurring value). If there are multiple modes, returns the smallest value.
Syntax: stat_mode(column)
-- Basic usage
SELECT stat_mode(shoe_size) FROM customers;
-- Compare mode with mean and median
SELECT stat_mode(rating) AS mode_rating,
stat_mean(rating) AS mean_rating,
stat_median(rating) AS median_rating
FROM reviews;
-- Mode by group
SELECT category, stat_mode(color) AS most_common_color
FROM products
GROUP BY category;
-- If all values are unique, returns the smallest value
CREATE TABLE unique_data (val REAL);
INSERT INTO unique_data VALUES (5),(3),(1),(4),(2);
SELECT stat_mode(val) FROM unique_data;
-- → 1.0 (all values appear once, so returns the smallest)
stat_geometric_mean
Computes the geometric mean. Suitable for averaging growth rates and ratios.
$$\bar{x}g = \left(\prod{i=1}^{n} x_i\right)^{1/n}$$
Syntax: stat_geometric_mean(column)
Constraint: All values must be positive.
-- Average annual growth rate of an investment
-- 3-year growth rates: 1.10 (10%), 1.05 (5%), 1.20 (20%)
CREATE TABLE growth (rate REAL);
INSERT INTO growth VALUES (1.10),(1.05),(1.20);
SELECT stat_geometric_mean(rate) AS avg_growth_rate;
-- → 1.11535... (average +11.5%)
-- Difference from arithmetic mean
SELECT stat_mean(rate) AS arithmetic_mean, -- 1.1167
stat_geometric_mean(rate) AS geometric_mean -- 1.1154
FROM growth;
-- Geometric mean is always ≤ arithmetic mean (AM-GM inequality)
-- Average multiplication factor of bacterial growth
SELECT stat_geometric_mean(multiplication_factor) AS avg_factor
FROM bacteria_growth;
stat_harmonic_mean
Computes the harmonic mean. Suitable for averaging speeds and ratios.
$$\bar{x}h = \frac{n}{\sum{i=1}^{n} \frac{1}{x_i}}$$
Syntax: stat_harmonic_mean(column)
Constraint: All values must be non-zero.
-- Average speed for a round trip
-- Outbound: 60 km/h, return: 40 km/h
CREATE TABLE speed (kmph REAL);
INSERT INTO speed VALUES (60),(40);
SELECT stat_harmonic_mean(kmph) AS avg_speed;
-- → 48.0 km/h (less than arithmetic mean of 50; this is the correct average speed)
-- Comparison of three types of means
SELECT stat_mean(val) AS arithmetic,
stat_geometric_mean(val) AS geometric,
stat_harmonic_mean(val) AS harmonic
FROM positive_data;
-- Always: harmonic ≤ geometric ≤ arithmetic
-- Average price-to-earnings ratio (P/E ratio)
SELECT stat_harmonic_mean(pe_ratio) AS avg_pe
FROM stocks
WHERE pe_ratio > 0;
Dispersion / Variability
stat_range
Computes the range (maximum - minimum).
Syntax: stat_range(column)
-- Basic usage
SELECT stat_range(temperature) AS temp_range FROM weather;
-- Compare range across groups
SELECT sensor_id,
MIN(reading) AS min_val,
MAX(reading) AS max_val,
stat_range(reading) AS range_val
FROM sensor_data
GROUP BY sensor_id;
-- Quality control: detect batches where range exceeds threshold
SELECT batch_id, stat_range(measurement) AS r
FROM quality_data
GROUP BY batch_id
HAVING stat_range(measurement) > 5.0;
stat_var
Computes the variance (population variance, ddof=0).
$$\sigma^2 = \frac{1}{n}\sum_{i=1}^{n}(x_i - \bar{x})^2$$
Syntax: stat_var(column)
Note:
stat_varreturns population variance (ddof=0). Usestat_sample_varianceif you need sample variance (unbiased).
-- Population variance
SELECT stat_var(score) AS variance FROM exam;
-- Comparison of population and sample variance
SELECT stat_var(score) AS pop_var, -- divides by N
stat_sample_variance(score) AS sample_var -- divides by N-1
FROM exam;
stat_population_variance
Computes the population variance. Equivalent to stat_var.
$$\sigma^2 = \frac{1}{N}\sum_{i=1}^{N}(x_i - \mu)^2$$
Syntax: stat_population_variance(column)
-- Use when working with complete population data (census)
SELECT stat_population_variance(height) FROM census_data;
-- Quality metric for all products on a production line
SELECT production_line,
stat_population_variance(weight) AS var_weight
FROM all_products
GROUP BY production_line;
stat_sample_variance
Computes the sample variance (unbiased variance).
$$s^2 = \frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})^2$$
Syntax: stat_sample_variance(column)
Minimum data count: 2
-- Use when estimating population variance from a sample
SELECT stat_sample_variance(weight) FROM sample_products;
-- Compare variance between two groups
SELECT group_name,
stat_sample_variance(score) AS var_score
FROM experiment
GROUP BY group_name;
-- Identify the category with the largest variance
SELECT category, stat_sample_variance(price) AS var_price
FROM products
GROUP BY category
ORDER BY var_price DESC
LIMIT 1;
stat_stdev
Computes the standard deviation (population standard deviation, ddof=0).
$$\sigma = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(x_i - \bar{x})^2}$$
Syntax: stat_stdev(column)
Note:
stat_stdevreturns population standard deviation (ddof=0). Usestat_sample_stddevif you need sample standard deviation.
-- Population standard deviation
SELECT stat_stdev(score) AS pop_stddev FROM exam;
-- Check the proportion within mean ± 1 sigma
SELECT
CAST(COUNT(CASE
WHEN score BETWEEN
(SELECT stat_mean(score) FROM exam) - (SELECT stat_stdev(score) FROM exam)
AND (SELECT stat_mean(score) FROM exam) + (SELECT stat_stdev(score) FROM exam)
THEN 1 END) AS REAL) / COUNT(*) * 100 AS pct_within_1sigma
FROM exam;
-- Approximately 68% for a normal distribution
stat_population_stddev
Computes the population standard deviation. Equivalent to stat_stdev.
Syntax: stat_population_stddev(column)
SELECT stat_population_stddev(measurement) FROM all_measurements;
stat_sample_stddev
Computes the sample standard deviation.
$$s = \sqrt{\frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})^2}$$
Syntax: stat_sample_stddev(column)
Minimum data count: 2
-- Sample standard deviation
SELECT stat_sample_stddev(weight) FROM sample_data;
-- Get mean and standard deviation at once
SELECT stat_mean(score) AS mean,
stat_sample_stddev(score) AS stddev
FROM test_scores;
-- Compute descriptive statistics by group in one query
SELECT department,
COUNT(salary) AS n,
stat_mean(salary) AS mean,
stat_sample_stddev(salary) AS stddev,
stat_median(salary) AS median,
MIN(salary) AS min,
MAX(salary) AS max
FROM employees
GROUP BY department;
stat_cv
Computes the coefficient of variation. The sample standard deviation divided by the absolute value of the mean, enabling comparison of variability across data with different units.
$$CV = \frac{s}{|\bar{x}|}$$
Syntax: stat_cv(column)
Minimum data count: 2
-- Coefficient of variation
SELECT stat_cv(height) AS cv_height,
stat_cv(weight) AS cv_weight
FROM people;
-- Enables comparison of which has relatively more variability: height or weight
-- Higher CV indicates greater variability
SELECT product_type,
stat_mean(price) AS mean_price,
stat_cv(price) AS cv_price
FROM products
GROUP BY product_type
ORDER BY cv_price DESC;
-- Used as a stability indicator
-- CV < 0.1: very stable, CV > 0.3: high variability
SELECT sensor_id, stat_cv(reading) AS stability
FROM readings
GROUP BY sensor_id
HAVING stat_cv(reading) > 0.3;
stat_iqr
Computes the interquartile range (IQR). Q3 - Q1.
Syntax: stat_iqr(column)
-- Interquartile range
SELECT stat_iqr(score) AS iqr FROM exam;
-- IQR as a criterion for outlier detection
-- Common rule: below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
-- * For Q1 and Q3 computation, see stat_quartile
-- Summarize distribution with median and IQR (descriptive statistics robust to outliers)
SELECT stat_median(price) AS median_price,
stat_iqr(price) AS iqr_price
FROM listings;
-- Compare IQR across groups
SELECT category,
stat_iqr(duration) AS iqr_duration
FROM events
GROUP BY category;
stat_mad_mean
Computes the mean absolute deviation (Mean Absolute Deviation from mean).
$$MAD_{mean} = \frac{1}{n}\sum_{i=1}^{n}|x_i - \bar{x}|$$
Syntax: stat_mad_mean(column)
-- Mean absolute deviation
SELECT stat_mad_mean(score) AS mad_from_mean FROM exam;
-- Comparison with standard deviation
-- Mean absolute deviation is less sensitive to outliers
SELECT stat_mad_mean(val) AS mad_mean,
stat_stdev(val) AS stdev
FROM data;
-- Compare variability across groups
SELECT region,
stat_mad_mean(delivery_time) AS avg_deviation
FROM orders
GROUP BY region;
stat_geometric_stddev
Computes the geometric standard deviation. A measure of variability for data following a log-normal distribution.
Syntax: stat_geometric_stddev(column)
Constraint: All values must be positive.
-- Geometric standard deviation
SELECT stat_geometric_mean(concentration) AS geo_mean,
stat_geometric_stddev(concentration) AS geo_stddev
FROM environmental_samples;
-- Useful for log-normally distributed data (particle sizes, concentrations, income, etc.)
-- Geometric mean × geometric stddev^±1 roughly corresponds to the 68% range
SELECT stat_geometric_mean(particle_size) AS gm,
stat_geometric_stddev(particle_size) AS gsd
FROM aerosol_data;
Distribution Shape
stat_population_skewness
Computes the population skewness. A measure of distribution asymmetry.
$$g_1 = \frac{\frac{1}{N}\sum_{i=1}^{N}(x_i - \mu)^3}{\sigma^3}$$
Syntax: stat_population_skewness(column)
Minimum data count: 3
-- Interpreting skewness
SELECT stat_population_skewness(income) AS skewness FROM survey;
-- skewness > 0: right-tailed (positive skew) — common in income distributions
-- skewness = 0: symmetric (close to normal distribution)
-- skewness < 0: left-tailed (negative skew)
-- Review distribution shape at a glance
SELECT stat_mean(val) AS mean,
stat_median(val) AS median,
stat_population_skewness(val) AS skewness,
stat_population_kurtosis(val) AS kurtosis
FROM measurements;
stat_skewness
Computes the sample skewness (with Fisher's correction).
$$G_1 = \frac{n}{(n-1)(n-2)} \cdot \frac{\sum_{i=1}^{n}(x_i - \bar{x})^3}{s^3}$$
Syntax: stat_skewness(column)
Minimum data count: 3
-- Sample skewness (estimate population skewness from a sample)
SELECT stat_skewness(response_time) AS skewness FROM api_logs;
-- Comparison of population and sample skewness
SELECT stat_population_skewness(val) AS pop_skew,
stat_skewness(val) AS sample_skew
FROM data;
-- The two converge as n increases
-- Quick normality check
-- If |skewness| < 2 and |kurtosis| < 7, roughly considered normal
SELECT stat_skewness(val) AS skew,
stat_kurtosis(val) AS kurt
FROM data;
stat_population_kurtosis
Computes the population kurtosis (excess kurtosis). A measure of peakedness relative to the normal distribution (baseline = 0).
$$g_2 = \frac{\frac{1}{N}\sum_{i=1}^{N}(x_i - \mu)^4}{\sigma^4} - 3$$
Syntax: stat_population_kurtosis(column)
Minimum data count: 4
-- Interpreting kurtosis
SELECT stat_population_kurtosis(val) AS kurtosis FROM data;
-- kurtosis > 0: peaked distribution (heavy tails) — leptokurtic
-- kurtosis = 0: same as normal distribution — mesokurtic
-- kurtosis < 0: flat distribution (light tails) — platykurtic
-- Risk assessment of financial data
-- High kurtosis → extreme values (crashes/spikes) are more likely
SELECT asset_name,
stat_population_kurtosis(daily_return) AS kurtosis
FROM returns
GROUP BY asset_name
ORDER BY kurtosis DESC;
stat_kurtosis
Computes the sample kurtosis (excess kurtosis, with Fisher's correction).
Syntax: stat_kurtosis(column)
Minimum data count: 4
-- Sample kurtosis
SELECT stat_kurtosis(val) AS kurtosis FROM sample_data;
-- Complete descriptive statistics of distribution shape
SELECT stat_mean(val) AS mean,
stat_median(val) AS median,
stat_stdev(val) AS stddev,
stat_skewness(val) AS skewness,
stat_kurtosis(val) AS kurtosis
FROM sample_data;
Estimation
stat_se
Computes the standard error (Standard Error of the Mean). A measure of the precision of the mean estimate.
$$SE = \frac{s}{\sqrt{n}}$$
Syntax: stat_se(column)
Minimum data count: 2
-- Standard error
SELECT stat_se(score) AS se FROM exam;
-- Approximate 95% confidence interval for the mean (95% CI ≈ mean ± 1.96 * SE)
SELECT stat_mean(val) AS mean,
stat_se(val) AS se,
stat_mean(val) - 1.96 * stat_se(val) AS ci_lower_95,
stat_mean(val) + 1.96 * stat_se(val) AS ci_upper_95
FROM large_sample;
-- Note: This is an approximation valid for sufficiently large n. For exact CI, see stat_ci_mean.
-- Relationship between sample size and standard error
SELECT COUNT(val) AS n,
stat_sample_stddev(val) AS stddev,
stat_se(val) AS se
FROM data;
-- When n quadruples, SE halves
-- Compare estimation precision across groups
SELECT group_name,
COUNT(score) AS n,
stat_mean(score) AS mean,
stat_se(score) AS se
FROM experiment
GROUP BY group_name;
Robust Statistics
stat_mad
Computes the median absolute deviation (MAD). A measure of variability that is highly robust to outliers.
$$MAD = \text{median}(|x_i - \tilde{x}|)$$
where $\tilde{x}$ is the median of the data.
Syntax: stat_mad(column)
-- MAD (variability measure robust to outliers)
SELECT stat_mad(val) AS mad FROM data;
-- Comparison with standard deviation (when outliers are present)
CREATE TABLE with_outlier (val REAL);
INSERT INTO with_outlier VALUES (1),(2),(3),(4),(5),(100);
SELECT stat_stdev(val) AS stdev, -- inflated by outlier
stat_mad(val) AS mad -- less affected by outlier
FROM with_outlier;
-- Using MAD as a criterion for outlier detection via Modified Z-score
-- Modified Z-score = 0.6745 * (x - median) / MAD
-- Consider as outlier if |Modified Z-score| > 3.5
SELECT val,
0.6745 * (val - (SELECT stat_median(val) FROM data))
/ (SELECT stat_mad(val) FROM data) AS modified_z
FROM data;
stat_mad_scaled
Computes the scaled MAD. MAD corrected to be on the same scale as the standard deviation under a normal distribution.
$$MAD_{scaled} = 1.4826 \times MAD$$
Syntax: stat_mad_scaled(column)
-- Under a normal distribution, close to stat_sample_stddev
SELECT stat_sample_stddev(val) AS stddev,
stat_mad_scaled(val) AS mad_scaled
FROM normal_data;
-- When outliers are present, MAD_scaled is more robust
SELECT stat_sample_stddev(val) AS stddev, -- inflated by outliers
stat_mad_scaled(val) AS mad_scaled -- robust
FROM data_with_outliers;
-- Use in quality control
SELECT batch_id,
stat_median(weight) AS center,
stat_mad_scaled(weight) AS spread
FROM production
GROUP BY batch_id;
stat_hodges_lehmann
Computes the Hodges-Lehmann estimator. The median of all pairwise means. A robust estimator of location.
$$\hat{\theta}_{HL} = \text{median}\left{\frac{x_i + x_j}{2} : 1 \le i \le j \le n\right}$$
Syntax: stat_hodges_lehmann(column)
-- Hodges-Lehmann estimator (robust location estimate)
SELECT stat_hodges_lehmann(val) AS hl_estimate FROM data;
-- Comparison with mean and median
SELECT stat_mean(val) AS mean,
stat_median(val) AS median,
stat_hodges_lehmann(val) AS hodges_lehmann
FROM data;
-- Comparison when outliers are present
CREATE TABLE hl_test (val REAL);
INSERT INTO hl_test VALUES (1),(2),(3),(4),(5),(100);
SELECT stat_mean(val) AS mean, -- 19.17 (pulled by outlier)
stat_median(val) AS median, -- 3.5
stat_hodges_lehmann(val) AS hodges_lehmann -- robustness between mean and median
FROM hl_test;
-- Also used as the point estimate for the Wilcoxon signed-rank test
SELECT stat_hodges_lehmann(val) AS location_estimate
FROM treatment_effects;