聚合直方图
来自 PostgreSQL Wiki
跳转到导航跳转到搜索一个 直方图 代表了一组值的分布。
histogram()
我们编写了一个函数,该函数对每个要聚合的值运行 width_bucket,并使用它来增加相应的桶。状态以整数数组的形式存储。数组有一个索引为零的元素,它保存小于 MIN 的值的计数,以及一个索引为 (nbuckets+1) 的元素,它保存大于或等于 MAX 的值的计数。
此聚合应该适用于 PostgreSQL > 9.0,但它只在 PostgreSQL 9.6 上进行了测试。
CREATE OR REPLACE FUNCTION hist_sfunc (state INTEGER[], val DOUBLE PRECISION,
MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER) RETURNS INTEGER[] AS $$
DECLARE
bucket INTEGER;
i INTEGER;
BEGIN
-- Do nothing if val is NULL
IF val IS NULL THEN
RETURN state;
END IF;
-- This will put values in buckets with a 0 bucket for <MIN and a (nbuckets+1) bucket for >=MAX
bucket := width_bucket(val, MIN, MAX, nbuckets);
-- Init the array with the correct number of 0's so the caller doesn't see NULLs
IF state[0] IS NULL THEN
state := array_fill(0,ARRAY[nbuckets+2],ARRAY[0]);
END IF;
state[bucket] := state[bucket] + 1;
RETURN state;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Tell Postgres how to use the new function
DROP AGGREGATE IF EXISTS histogram (DOUBLE PRECISION, DOUBLE PRECISION, DOUBLE PRECISION, INTEGER);
CREATE AGGREGATE histogram (val DOUBLE PRECISION, min DOUBLE PRECISION, max DOUBLE PRECISION, nbuckets INTEGER) (
SFUNC = hist_sfunc,
STYPE = INTEGER[],
PARALLEL = SAFE -- Remove line for compatibility with Postgresql < 9.6
);
辅助函数
我们还可以定义一些辅助函数,这些函数给出直方图中桶的中间点、分界点和范围。这些函数需要 PostgreSQL >= 9.5。
CREATE OR REPLACE FUNCTION histogram_ranges(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
RETURNS numrange[] AS
$$
DECLARE
res numrange[];
BEGIN
res := array_agg(numrange(l,u,'[)')) FROM
(SELECT generate_series(MIN::numeric,(MAX-(MAX-MIN)/nbuckets)::numeric,((MAX-MIN)/nbuckets)::numeric) AS l,
generate_series((MIN+(MAX-MIN)/nbuckets)::numeric,MAX::numeric,((MAX-MIN)/nbuckets)::numeric) AS u) t;
res[0] := numrange(NULL,MIN::numeric,'[)');
res[nbuckets+1] := numrange(MAX::numeric,NULL,'[)');
RETURN res;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
CREATE OR REPLACE FUNCTION histogram_breaks(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
RETURNS DOUBLE PRECISION[] AS
$$
SELECT array(SELECT generate_series(MIN::numeric,MAX::numeric,((MAX-MIN)/nbuckets)::numeric)::DOUBLE PRECISION)
;
$$ LANGUAGE sql IMMUTABLE;
CREATE OR REPLACE FUNCTION histogram_mids(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
RETURNS DOUBLE PRECISION[] AS
$$
SELECT array(SELECT generate_series((MIN + 0.5*((MAX-MIN)/nbuckets))::numeric,
MAX::numeric,
((MAX-MIN)/nbuckets)::numeric)::DOUBLE PRECISION);
$$ LANGUAGE sql IMMUTABLE;
示例用法
WITH a AS (
SELECT generate_series(-2,5,0.5) AS i
)
SELECT array_agg(i) AS values,
histogram_mids(0,3,3) AS mids,
histogram_breaks(0,3,3) AS breaks,
histogram_ranges(0,3,3) AS ranges,
histogram(i,0,3,3) AS counts,
(histogram_ranges(0,3,3))[1:3] AS ranges_in_limits,
(histogram(i,0,3,3))[1:3] AS counts_in_limits
FROM a;
注意事项
如果参数是 NULL 值的列,则返回 NULL。