Smart Analyze Greenplum Tables

Step 1: create config table

[gpadmin@sachi ~]$ psql -d sachi
psql (8.2.15)
Type "help" for help.

sachi=# 
CREATE TABLE dbadmin.smart_analyze_config (
schemaname TEXT NOT NULL,      -- schema name
tablename TEXT NOT NULL,          --  table name
columns TEXT,  -- Comma separated list of columns to be analyzed. Default is distribution key. If this field is RANDOM or empty, all columns will be analyzed
threshold INTERVAL NOT NULL     -- Interval (in hours) of time after which statistics for this table are considered stale
)
WITH (APPENDONLY=FALSE)
DISTRIBUTED RANDOMLY;

Version 2 -> Add relpages and reltuples in the table.

sachi=# 
CREATE TABLE dbadmin.smart_analyze_config (
schemaname TEXT NOT NULL,      -- schema name
tablename TEXT NOT NULL,          --  table name
columns TEXT,  -- Comma separated list of columns to be analyzed. Default is distribution key. If this field is RANDOM or empty, all columns will be analyzed
threshold INTERVAL NOT NULL,     -- Interval (in hours) of time after which statistics for this table are considered stale
relpages integer not null,
reltuples  real not null
)
WITH (APPENDONLY=FALSE)
DISTRIBUTED RANDOMLY;

Step 2: Insert meta data in the table

insert into dbadmin.smart_analyze_config (schemaname,tablename,columns,threshold)
SELECT pgn.nspname , 
pgc.relname , 
pga.attname ,
'24 hours'
FROM 
(SELECT gdp.localoid, 
CASE WHEN ( Array_upper(gdp.attrnums, 1) > 0 ) THEN Unnest(gdp.attrnums) ELSE NULL END AS attnum 
FROM gp_distribution_policy gdp 
ORDER BY gdp.localoid) AS distrokey 
INNER JOIN pg_class AS pgc ON distrokey.localoid = pgc.oid 
INNER JOIN pg_namespace pgn ON pgc.relnamespace = pgn.oid 
AND pgn.nspname NOT IN ('gp_toolkit', 'information_schema', 'pg_catalog') 
AND pgc.relkind='r'
AND pgc.relstorage!='x'
LEFT OUTER JOIN pg_attribute pga ON distrokey.attnum = pga.attnum 
AND distrokey.localoid = pga.attrelid 
ORDER BY pgn.nspname, pgc.relname;

Version 2 -> Add relpages and reltuples in the table.

insert into smart_analyze_config (schemaname,tablename,columns,threshold,relpages,reltuples) 
SELECT 
pgn.nspname , 
 pgc.relname , 
 pga.attname , 
 '72 hours', 
 pgc.relpages, 
 pgc.reltuples 
FROM 
 (SELECT gdp.localoid, 
 CASE WHEN ( Array_upper(gdp.attrnums, 1) > 0 ) THEN Unnest(gdp.attrnums) ELSE NULL END AS attnum 
 FROM gp_distribution_policy gdp 
 ORDER BY gdp.localoid) AS distrokey 
 INNER JOIN pg_class AS pgc ON distrokey.localoid = pgc.oid 
 INNER JOIN pg_namespace pgn ON pgc.relnamespace = pgn.oid 
 AND pgn.nspname NOT IN ('gp_toolkit', 'information_schema', 'pg_catalog') 
 AND pgc.relkind='r' AND pgc.relstorage!='x' 
 LEFT OUTER JOIN pg_attribute pga ON distrokey.attnum = pga.attnum 
 AND distrokey.localoid = pga.attrelid ORDER BY pgn.nspname, pgc.relname;

INSERT 0 275
Time: 103.499 ms


Step 3: Create smart analyze SQL and look at the output. make any corrections if you see something not right. 

select schemaname,tablename,lastnalyzed,
CASE
WHEN columns is NULL THEN 'ANALYZE '||'"'||schemaname||'"'||'.'||'"'||tablename||'"'||';'
ELSE 'ANALYZE '||'"'||schemaname||'"'||'.'||'"'||tablename||'"'||'('||columns||')'||';'  END as analyzecommand
 from
 (
 select pso.schemaname,pso.objname as tablename,columns,max(pso.statime) as lastnalyzed,max(sac.threshold) as thresholdinterval
 from pg_stat_operations pso ,smart_analyze_config sac
 where pso.schemaname=sac.schemaname
 and pso.objname=sac.tablename
 and pso.actionname='ANALYZE'
 group by 1,2,3
 order by 3 desc                                                                                                                                              
) as a
where now()-lastnalyzed>thresholdinterval
order by 3 asc;

Step 4: Save above SQL in a file smartanalyze.sql 

Step 6: From the gpadmin login run following command

[gpadmin@sachi scripts]$ psql -d sachi -Alt -f smartanalyze.sql|awk '{ print $8,$9 }'|psql -X -d sachi
Comments