Dynamic Analyze Configuration and Functions

DROP TABLE IF EXISTS dynamic_analyze_config;
CREATE TABLE dynamic_analyze_config (
    schemaname TEXT NOT NULL,      -- Text representation of schema name
    tablename TEXT NOT NULL,       -- Text representation of table name
    columns TEXT[],                -- Array of column names represented as text
                                   --   objects listing the subset of columns
                                   --   to analyze; if this field is empty, all
                                   --   columns will be analyzed
    threshold INTERVAL NOT NULL    -- Interval of time after which statistics
                                   --   for this table are considered stale
)
    WITH (APPENDONLY=FALSE)
    DISTRIBUTED RANDOMLY
;
-- This code is provided on an "as is" basis as an example of pothential
-- functionality, without warntees or conditions of any kind, either express or
-- implied, including, without limitation, any warranties or conditions of
-- title, non-infringement, merchantability, or fitness for a particular
-- purpose. As the user of this code you are solely responsible for determining
-- the appropriateness of using this code and mitigating risks associated with
-- it. 


-- Usage forms:
--
--     SELECT * FROM dynamic_analyze_targeted(schema_name, max_analyzes);
--     - Run ANALYZE against at most max_analyzes tables belonging to the schema
--       named by schema_name that are configured in the dynamic_analyze_config
--       table.  If max_analyzes is not a positive integer, no such limit will
--       be observed.  Table analysis will be prioritized based upon how much
--       time has elapsed since each table has been analyzed relative to the
--       staleness threshold configured in dynamic_analyze_config.
--
--     SELECT * FROM dynamic_analyze_targeted(schema_name, max_analyzes, runtime_limit);
--     - Run ANALYZE against tables belonging to the schema named by schema_name
--       that are configured in the dynamic_analyze_config table until either:
--       1) the number of tables that have been analyzed equals/exceeds
--          max_analyzes (assuming max_analyzes is a positive integer) or
--       2) the total runtime after the completion of the most recent ANALYZE
--          has exceeded the runtime_limit parameter (interpreted as an
--          INTERVAL)
--       As with the other usage form, table analysis will be prioritized based
--       upon how much time has elapsed since each table has been analyzed
--       relative to the staleness threshold configured in
--       dynamic_analyze_config.

CREATE OR REPLACE FUNCTION dynamic_analyze_targeted (
schema_name TEXT, size int 
) RETURNS TABLE(
tablename TEXT,
analyze_start TIMESTAMP,
analyze_stop TIMESTAMP,
time_minutes NUMERIC
) AS $$
BEGIN
FOR tablename, analyze_start, analyze_stop, time_minutes IN
SELECT * FROM dynamic_analyze_targeted (
schema_name, size, '-1 SECONDS'
)
LOOP
RETURN NEXT;
END LOOP;
END;
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION dynamic_analyze_targeted (
schema_name TEXT, size int, runtime_limit_text TEXT
) RETURNS TABLE(
tablename TEXT,
analyze_start TIMESTAMP,
analyze_stop TIMESTAMP,
time_minutes NUMERIC
) AS $$
DECLARE
cmd TEXT; 
runtime_limit INTERVAL;
start_ts TIMESTAMP;
columns TEXT[];
columnstring TEXT;
tablesdone INTEGER;
BEGIN
tablesdone := 0;
start_ts := CLOCK_TIMESTAMP();
runtime_limit := runtime_limit_text::INTERVAL;
FOR tablename, columns IN
SELECT 
n.nspname||'.'||c.relname,
tgts.columns
FROM

-- Assemble a list of all objects known to the catalog
pg_class c INNER JOIN pg_namespace n
ON (c.relnamespace = n.oid)

-- Determine when each object has last been analyzed
LEFT OUTER JOIN (
SELECT DISTINCT
objid,
statime
FROM
pg_stat_last_operation 
WHERE
staactionname in ('ANALYZE')
) a
ON (a.objid = c.oid)

-- Get a list of objects explicitly named in the dynamic_analyze_config table
LEFT OUTER JOIN dynamic_analyze_config tgts ON (
n.nspname = tgts.schemaname AND
c.relname = tgts.tablename
)


-- Consider only tables in the named schema that have never been analyzed, or
-- for whom the timestamp of the last ANALYZE operation is outside of the
-- staleness interval set in dynamic_analyze_config
WHERE
n.nspname = schema_name
AND (
a.statime IS NULL
OR NOW()-a.statime-tgts.threshold > '0 SECONDS'::INTERVAL
)


-- Consider only tables explicitly named in dynamic_analyze_config
AND tgts.tablename IS NOT NULL

-- Prioritize tables whose statistics are more out-of-date
ORDER BY NOW()-a.statime-tgts.threshold DESC

-- Keep running until no tables remain to analyze, until the runtime
-- threshold has been exceeded, or until the table maximum has been hit.
LOOP
EXIT WHEN runtime_limit > '0 SECONDS'::INTERVAL AND CLOCK_TIMESTAMP() - start_ts > runtime_limit;
EXIT WHEN size > 0 AND tablesdone >= size;

-- Build and execute the analyze statement, taking into account whether a column
-- subset was configured in dynamic_analyze_config.
IF columns IS NULL OR columns = '{}' THEN
columnstring := '';
ELSE
columnstring := '('||array_to_string(columns, ',')||')';
END IF;
cmd := 'ANALYZE '||tablename||' '||columnstring||';';
analyze_start := CLOCK_TIMESTAMP();
EXECUTE cmd;
analyze_stop := CLOCK_TIMESTAMP();
tablesdone := tablesdone + 1;
time_minutes := EXTRACT(EPOCH FROM analyze_stop - analyze_start) / 60.0;
RETURN NEXT;
END LOOP;
END;
$$ LANGUAGE plpgsql;
Comments