聚合直方图

来自 PostgreSQL Wiki
跳转到导航跳转到搜索

一个 直方图 代表了一组值的分布。

histogram()

代码片段

聚合直方图

适用于 PostgreSQL

任何版本

编写语言

PLPGSQL

依赖于


我们编写了一个函数,该函数对每个要聚合的值运行 width_bucket,并使用它来增加相应的桶。状态以整数数组的形式存储。数组有一个索引为零的元素,它保存小于 MIN 的值的计数,以及一个索引为 (nbuckets+1) 的元素,它保存大于或等于 MAX 的值的计数。

此聚合应该适用于 PostgreSQL > 9.0,但它只在 PostgreSQL 9.6 上进行了测试。

CREATE OR REPLACE FUNCTION hist_sfunc (state INTEGER[], val DOUBLE PRECISION,
       MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER) RETURNS INTEGER[] AS $$
DECLARE
  bucket INTEGER;
  i INTEGER;
BEGIN
  -- Do nothing if val is NULL
  IF val IS NULL THEN
     RETURN state;
  END IF;

  -- This will put values in buckets with a 0 bucket for <MIN and a (nbuckets+1) bucket for >=MAX
  bucket := width_bucket(val, MIN, MAX, nbuckets);
 
  -- Init the array with the correct number of 0's so the caller doesn't see NULLs
  IF state[0] IS NULL THEN
    state := array_fill(0,ARRAY[nbuckets+2],ARRAY[0]);
  END IF;

  state[bucket] := state[bucket] + 1;
 
  RETURN state;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

-- Tell Postgres how to use the new function
DROP AGGREGATE IF EXISTS histogram (DOUBLE PRECISION, DOUBLE PRECISION, DOUBLE PRECISION, INTEGER);
CREATE AGGREGATE histogram (val DOUBLE PRECISION, min DOUBLE PRECISION, max DOUBLE PRECISION, nbuckets INTEGER) (
       SFUNC = hist_sfunc,
       STYPE = INTEGER[],
       PARALLEL = SAFE -- Remove line for compatibility with  Postgresql < 9.6
);

辅助函数

我们还可以定义一些辅助函数,这些函数给出直方图中桶的中间点、分界点和范围。这些函数需要 PostgreSQL >= 9.5。

CREATE OR REPLACE FUNCTION histogram_ranges(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
       RETURNS numrange[] AS
$$
DECLARE
	res numrange[];
BEGIN
	res := array_agg(numrange(l,u,'[)')) FROM
	(SELECT generate_series(MIN::numeric,(MAX-(MAX-MIN)/nbuckets)::numeric,((MAX-MIN)/nbuckets)::numeric) AS l,
       		generate_series((MIN+(MAX-MIN)/nbuckets)::numeric,MAX::numeric,((MAX-MIN)/nbuckets)::numeric) AS u) t;

	res[0] := numrange(NULL,MIN::numeric,'[)');
	res[nbuckets+1] := numrange(MAX::numeric,NULL,'[)');

	RETURN res;
END;
$$ LANGUAGE plpgsql IMMUTABLE;

CREATE OR REPLACE FUNCTION histogram_breaks(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
       RETURNS DOUBLE PRECISION[] AS
$$
SELECT array(SELECT generate_series(MIN::numeric,MAX::numeric,((MAX-MIN)/nbuckets)::numeric)::DOUBLE PRECISION)
;
$$ LANGUAGE sql IMMUTABLE;

CREATE OR REPLACE FUNCTION histogram_mids(MIN DOUBLE PRECISION, MAX DOUBLE PRECISION, nbuckets INTEGER)
       RETURNS DOUBLE PRECISION[] AS
$$
SELECT array(SELECT generate_series((MIN + 0.5*((MAX-MIN)/nbuckets))::numeric,
       		    	            MAX::numeric,
				    ((MAX-MIN)/nbuckets)::numeric)::DOUBLE PRECISION);
$$ LANGUAGE sql IMMUTABLE;


示例用法

WITH a AS (
SELECT generate_series(-2,5,0.5) AS i
)
SELECT array_agg(i) AS values,
       histogram_mids(0,3,3) AS mids,
       histogram_breaks(0,3,3) AS breaks,
       histogram_ranges(0,3,3) AS ranges,
       histogram(i,0,3,3) AS counts,
       (histogram_ranges(0,3,3))[1:3] AS ranges_in_limits,
       (histogram(i,0,3,3))[1:3] AS counts_in_limits
FROM a;

注意事项

如果参数是 NULL 值的列,则返回 NULL。

另请参阅