diff --git a/config/grafana/dashboards/Dashboard_13_Lock_waits.json b/config/grafana/dashboards/Dashboard_13_Lock_waits.json new file mode 100644 index 0000000000000000000000000000000000000000..8ad70b8819070298d195ce697aa64ec5023b0b6e --- /dev/null +++ b/config/grafana/dashboards/Dashboard_13_Lock_waits.json @@ -0,0 +1,1113 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "title": "Lock blocks overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Number of active sessions waiting on heavyweight lock acquisition.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 1, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 
1 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "count(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"}) or vector(0)", + "legendFormat": "Lock blocks", + "range": true, + "refId": "A" + } + ], + "title": "Lock blocks count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Lock block duration over time, showing how long processes have been blocked by locks.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 5000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 
2, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "avg(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "Average wait time", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "max(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "Max wait time", + "range": true, + "refId": "B" + } + ], + "title": "Lock block duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Lock blocks by lock type, showing which types of locks are causing the most contention.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 1, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": 
[] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "count by (blocked_locktype) (pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "{{blocked_locktype}}", + "range": true, + "refId": "A" + } + ], + "title": "Lock blocks by lock type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Blocker transaction duration over time, showing how long blocking transactions have been running.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5000 + }, + { + "color": "red", + "value": 30000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + 
"lastNotNull", + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "avg(pgwatch_lock_waits_blocker_tx_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "Average blocker tx duration", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "max(pgwatch_lock_waits_blocker_tx_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "Max blocker tx duration", + "range": true, + "refId": "B" + } + ], + "title": "Blocker transaction duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Lock blocks by table, showing which tables have the most lock contention.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 1, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "count by (blocked_table) (pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", blocked_table!=\"\"})", + "legendFormat": "{{blocked_table}}", + "range": true, + "refId": "A" + } + ], + "title": "Lock blocks by table", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 11, + "panels": [], + "title": "Lock blocks details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Detailed lock block information showing blocked and blocking processes with their users, applications, lock types, tables, query IDs, and block durations.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 5000 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "blocked_pid" + }, + "properties": [ + { + "id": "custom.width", + "value": 90 + }, + { + "id": "displayName", + "value": "Blocked PID" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_pid" + }, + "properties": [ + { + "id": "custom.width", + "value": 90 + }, + { + "id": "displayName", + "value": "Blocker PID" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": 
"Time" + }, + "properties": [ + { + "id": "custom.width", + "value": 160 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + }, + { + "id": "unit", + "value": "ms" + }, + { + "id": "displayName", + "value": "Blocked ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_user" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + }, + { + "id": "displayName", + "value": "Blocked User" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_user" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + }, + { + "id": "displayName", + "value": "Blocker User" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_appname" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "displayName", + "value": "Blocked App" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_appname" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "displayName", + "value": "Blocker App" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_locktype" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocked Lock Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_locktype" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocker Lock Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_mode" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocked Mode" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_mode" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocker Mode" + } + ] + }, + { + "matcher": { + "id": "byName", + 
"options": "blocked_table" + }, + "properties": [ + { + "id": "custom.width", + "value": 130 + }, + { + "id": "displayName", + "value": "Blocked Table" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_table" + }, + "properties": [ + { + "id": "custom.width", + "value": 130 + }, + { + "id": "displayName", + "value": "Blocker Table" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "datname" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "displayName", + "value": "Database" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_query_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 180 + }, + { + "id": "displayName", + "value": "Blocked Query ID" + }, + { + "id": "links", + "value": [ + { + "title": "View query analysis", + "url": "/d/db52944d-b025-4e18-b70b-89c0af3e7e41/03-single-queryid-analysis?var-cluster_name=${cluster_name}&var-node_name=${node_name}&var-db_name=${db_name}&var-query_id=${__value.raw}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_query_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 180 + }, + { + "id": "displayName", + "value": "Blocker Query ID" + }, + { + "id": "links", + "value": [ + { + "title": "View query analysis", + "url": "/d/db52944d-b025-4e18-b70b-89c0af3e7e41/03-single-queryid-analysis?var-cluster_name=${cluster_name}&var-node_name=${node_name}&var-db_name=${db_name}&var-query_id=${__value.raw}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 16, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 4, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Blocked ms" + } + ] + }, + "pluginVersion": "10.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, 
+ "editorMode": "code", + "exemplar": false, + "expr": "pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"}", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Lock blocks details", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "cluster": true, + "dbname": true, + "env": true, + "instance": true, + "job": true, + "node_name": true, + "real_dbname": true, + "sink_type": true, + "sys_id": true + }, + "indexByName": { + "Time": 0, + "datname": 1, + "blocked_pid": 2, + "blocked_user": 3, + "blocked_appname": 4, + "blocked_locktype": 5, + "blocked_mode": 6, + "blocked_table": 7, + "blocked_query_id": 8, + "Value": 9, + "blocker_pid": 10, + "blocker_user": 11, + "blocker_appname": 12, + "blocker_locktype": 13, + "blocker_mode": 14, + "blocker_table": 15, + "blocker_query_id": 16 + }, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "definition": "label_values(pgwatch_db_size_size_b,cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_db_size_size_b,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "node_name", + 
"options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "db_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m" + ] + }, + "timezone": "", + "title": "13. 
Lock blocks details", + "uid": "lock-waits-details", + "version": 1, + "weekStart": "" +} diff --git a/config/pgwatch-postgres/metrics.yml b/config/pgwatch-postgres/metrics.yml index 912e740fcb1b21b02f5c024124dc669447b2b62d..cecda732e197f33e2798cb1b9916f9b8b3116c0d 100644 --- a/config/pgwatch-postgres/metrics.yml +++ b/config/pgwatch-postgres/metrics.yml @@ -12,7 +12,6 @@ metrics: queryid is not null and dbid = (select oid from pg_database where datname = current_database()) order by total_exec_time desc - limit 1000; gauges: - '*' @@ -198,5 +197,5 @@ presets: full: description: "Full metrics for PostgreSQL storage" metrics: - pgss_queryid_queries: 300 + pgss_queryid_queries: 30 index_definitions: 3600 \ No newline at end of file diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 7693c27f3d228ecc2c3e70fde96642bf3b3e75ba..1d6d12ff9da9f020f3e693608041a2478ded4bdb 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -819,6 +819,71 @@ metrics: gauges: - queries statement_timeout_seconds: 15 + lock_waits: + description: > + Retrieves detailed information about lock waits, including blocked and blocking processes with their queries, users, and application names. + It returns blocked and blocker process IDs, lock modes and types, affected tables, queries, and wait/transaction durations. + This metric helps administrators identify and diagnose lock contention issues in detail. 
+ sqls: + 14: |- + with sa_snapshot as ( /* pgwatch_generated */ + select * + from pg_stat_activity + where + datname = current_database() + and pid <> pg_backend_pid() + and state in ('active', 'idle in transaction', 'idle in transaction (aborted)') + ), + pid_tables as ( + select distinct on (pid) pid, relation::regclass::text as table_name + from pg_catalog.pg_locks + where relation is not null + and locktype in ('tuple', 'relation') + and relation::regclass::text not like '%_pkey' + and relation::regclass::text not like '%_idx' + order by pid, locktype + ) + select + blocked.pid as blocked_pid, + current_database() as tag_datname, + blocked_stm.usename::text as tag_blocked_user, + blocked_stm.application_name::text as tag_blocked_appname, + blocked.mode as blocked_mode, + blocked.locktype as blocked_locktype, + coalesce(blocked.relation::regclass::text, blocked_tbl.table_name, '') as tag_blocked_table, + blocked_stm.query_id::text as tag_blocked_query_id, + (extract(epoch from (clock_timestamp() - blocked_stm.state_change)) * 1000)::bigint as blocked_ms, + blocker.pid as blocker_pid, + blocker_stm.usename::text as tag_blocker_user, + blocker_stm.application_name::text as tag_blocker_appname, + blocker.mode as blocker_mode, + blocker.locktype as blocker_locktype, + coalesce(blocker.relation::regclass::text, blocker_tbl.table_name, '') as tag_blocker_table, + blocker_stm.query_id::text as tag_blocker_query_id, + (extract(epoch from (clock_timestamp() - blocker_stm.xact_start)) * 1000)::bigint as blocker_tx_ms + from pg_catalog.pg_locks as blocked + join sa_snapshot as blocked_stm on blocked_stm.pid = blocked.pid + join pg_catalog.pg_locks as blocker on + blocked.pid <> blocker.pid + and blocker.granted + and ( + (blocked.database = blocker.database) + or (blocked.database is null and blocker.database is null) + ) + and ( + blocked.relation = blocker.relation + or blocked.transactionid = blocker.transactionid + ) + join sa_snapshot as blocker_stm on 
blocker_stm.pid = blocker.pid + left join pid_tables as blocked_tbl on blocked_tbl.pid = blocked.pid + left join pid_tables as blocker_tbl on blocker_tbl.pid = blocker.pid + where not blocked.granted + order by blocked_ms desc + limit 10000 + gauges: + - blocked_ms + - blocker_tx_ms + statement_timeout_seconds: 15 pg_database_wraparound: sqls: 11: | @@ -2335,6 +2400,7 @@ presets: pg_statio_all_tables: 30 pg_statio_all_indexes: 30 pg_total_relation_size: 30 + lock_waits: 30 pg_blocked: 30 pg_long_running_transactions: 30 pg_stuck_idle_in_transaction: 30 diff --git a/tests/lock_waits/README.md b/tests/lock_waits/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3838b5c08414b6dd050eaf131029df3ff3c0cd94 --- /dev/null +++ b/tests/lock_waits/README.md @@ -0,0 +1,226 @@ +# Lock Waits Metric Testing + +This directory contains tests and scripts to verify that the `lock_waits` metric is working correctly. + +## Overview + +The `lock_waits` metric collects detailed information about lock waits in PostgreSQL, including: +- Waiting and blocking process IDs +- User names and application names +- Lock modes and types +- Affected tables +- Query IDs (PostgreSQL 14+) +- Wait durations and blocker transaction durations + +## Test Components + +### 1. Python Test Script (`test_lock_waits_metric.py`) + +Automated test that: +- Creates lock contention scenarios in the target database +- Waits for pgwatch to collect metrics +- Verifies the metric is collected in Prometheus/VictoriaMetrics +- Validates the metric structure and labels + +### 2. SQL Script (`create_lock_contention.sql`) + +Manual SQL script to create lock contention for testing. Can be run in multiple psql sessions. + +## Prerequisites + +1. Docker Compose stack running: + ```bash + docker-compose up -d + ``` + +2. Python dependencies: + ```bash + pip install psycopg requests + ``` + +3. 
Ensure `lock_waits` metric is enabled in pgwatch configuration: + - Check `config/pgwatch-prometheus/metrics.yml` includes `lock_waits` + - Verify pgwatch is collecting metrics from the target database + +## Running the Automated Test + +### Basic Usage + +```bash +# From the project root +python tests/lock_waits/test_lock_waits_metric.py +``` + +### With Custom Configuration + +```bash +python tests/lock_waits/test_lock_waits_metric.py \ + --target-db-url "postgresql://postgres:postgres@localhost:55432/target_database" \ + --prometheus-url "http://localhost:59090" \ + --test-dbname "target_database" \ + --collection-wait 90 +``` + +### Environment Variables + +You can also set these via environment variables: + +```bash +export TARGET_DB_URL="postgresql://postgres:postgres@localhost:55432/target_database" +export PROMETHEUS_URL="http://localhost:59090" +export TEST_DBNAME="target_database" +export COLLECTION_WAIT_SECONDS=90 + +python tests/lock_waits/test_lock_waits_metric.py +``` + +## Manual Testing + +### Step 1: Create Lock Contention + +Open two psql sessions to the target database: + +**Session 1 (Blocker):** +```sql +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +-- Keep this transaction open +``` + +**Session 2 (Waiter):** +```sql +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +-- This will wait for Session 1 to release the lock +``` + +### Step 2: Verify Metric Collection + +Wait for pgwatch to collect metrics (check collection interval in pgwatch config, typically 15-30 seconds), then query Prometheus: + +```bash +# Query Prometheus API for lock_waits metrics +curl "http://localhost:59090/api/v1/query?query=pgwatch_lock_waits_blocked_ms{datname=\"target_database\"}" + +# Or use PromQL in Grafana Explore +pgwatch_lock_waits_blocked_ms{datname="target_database"} +pgwatch_lock_waits_blocker_tx_ms{datname="target_database"} +``` + +### Step 3: Check Grafana Dashboard + +1. Open Grafana: http://localhost:3000 +2.
Navigate to "13. Lock blocks details" dashboard +3. Select the database from the dropdown +4. Verify that lock wait events appear in the panels + +## Expected Results + +### Successful Test Output + +``` +Setting up test environment... +✓ Test table created + +Creating lock contention for 30 seconds... +✓ Blocker transaction started (holding lock on row id=1) +✓ Waiter transaction started (waiting for lock on row id=1) + Holding locks for 30 seconds... +✓ Lock contention ended + +Verifying metric collection... + Waiting 60 seconds for pgwatch to collect metrics... + ✓ Found 5 lock_waits records + +Validating metric structure... + + Record 1: + ✓ All required data fields present + ✓ blocked_ms is numeric: 25000 ms + ✓ blocker_tx_ms is numeric: 30000 ms + +✅ Test PASSED: lock_waits metric is working correctly +``` + +## Troubleshooting + +### No Records Found + +- **Check pgwatch is running**: `docker ps | grep pgwatch-prometheus` +- **Check pgwatch logs**: `docker logs pgwatch-prometheus` +- **Verify metric is enabled**: Check `config/pgwatch-prometheus/metrics.yml` +- **Check Prometheus is accessible**: `curl http://localhost:59090/api/v1/status/config` +- **Increase wait time**: Use `--collection-wait 120` to wait longer +- **Check database name**: Ensure `--test-dbname` matches the monitored database +- **Verify metrics exist**: `curl "http://localhost:59090/api/v1/label/__name__/values" | grep lock_waits` + +### Invalid Data Structure + +- **Check PostgreSQL version**: Metric requires PostgreSQL 14+ for query_id support +- **Verify metric SQL**: Check the SQL query in `metrics.yml` is correct +- **Check pgwatch version**: Ensure pgwatch version supports the metric format +- **Check Prometheus labels**: Verify metrics have expected labels (datname, blocked_pid, blocker_pid, etc.)
+ +### Connection Errors + +- **Verify Docker containers**: `docker-compose ps` +- **Check connection strings**: Verify URLs match your docker-compose configuration +- **Check Prometheus URL**: Ensure Prometheus/VictoriaMetrics is accessible at the specified URL +- **Check network**: Ensure containers can communicate (same Docker network) + +## Integration with CI/CD + +The test can be integrated into CI/CD pipelines: + +```yaml +# Example GitLab CI +test_lock_waits: + stage: test + script: + - docker-compose up -d + - sleep 30 # Wait for services to start + - pip install psycopg + - python tests/lock_waits/test_lock_waits_metric.py + --target-db-url "$TARGET_DB_URL" + --prometheus-url "$PROMETHEUS_URL" + --collection-wait 90 + only: + - merge_requests + - main +``` + +## Additional Test Scenarios + +### Test Different Lock Types + +Modify the test to create different types of locks: + +```sql +-- Table-level lock +LOCK TABLE lock_test_table IN EXCLUSIVE MODE; + +-- Advisory lock +SELECT pg_advisory_lock(12345); +``` + +### Test Multiple Concurrent Waits + +Create multiple waiting transactions to test the LIMIT clause: + +```sql +-- Session 1: Blocker +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; + +-- Sessions 2-10: Multiple waiters +-- Each in separate psql session +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +``` + +## Related Files + +- `config/pgwatch-prometheus/metrics.yml` - Metric definition +- `config/grafana/dashboards/Dashboard_13_Lock_waits.json` - Grafana dashboard +- `workload_examples/lock_wait_test.sql` - Basic lock test SQL + diff --git a/tests/lock_waits/__init__.py b/tests/lock_waits/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..228403c0df6b58a3285c604a1db3525b351d7ee9 --- /dev/null +++ b/tests/lock_waits/__init__.py @@ -0,0 +1,2 @@ +# Lock waits metric testing package + diff --git a/tests/lock_waits/create_lock_contention.sql b/tests/lock_waits/create_lock_contention.sql new file
mode 100644 index 0000000000000000000000000000000000000000..5e5da7a1f8eeefb5bb784a2c33b2dd63b177a62f --- /dev/null +++ b/tests/lock_waits/create_lock_contention.sql @@ -0,0 +1,73 @@ +-- SQL script to manually create lock contention for testing lock_waits metric +-- +-- Usage: +-- 1. Run this script in Session 1 (blocker) +-- 2. Run the same script in Session 2 (waiter) - it will wait +-- 3. Check the sink database for lock_waits records +-- 4. Commit or rollback Session 1 to release the lock + +-- Create test table if it doesn't exist +drop table if exists lock_test_table cascade; +create table lock_test_table ( + id int8 generated always as identity primary key, + name text not null, + value numeric(10, 2), + created_at timestamptz default now() +); + +insert into lock_test_table (name, value) +values + ('Item 1', 100.50), + ('Item 2', 200.75), + ('Item 3', 300.25); + +-- ============================================ +-- SESSION 1 (BLOCKER) - Run this first +-- ============================================ +begin; + +-- Acquire exclusive lock on row id=1 +-- Keep this transaction open to hold the lock +select * from lock_test_table where id = 1 for update; + +-- Transaction is now holding the lock +-- DO NOT COMMIT YET - keep this session open + +-- ============================================ +-- SESSION 2 (WAITER) - Run this in another psql session +-- ============================================ +begin; + +-- This will wait for Session 1 to release the lock +select * from lock_test_table where id = 1 for update; + +-- This query will block until Session 1 commits or rolls back +-- You should see it waiting in pg_stat_activity + +-- ============================================ +-- To release the lock, commit or rollback Session 1: +-- ============================================ +-- commit; -- or rollback; + +-- ============================================ +-- Alternative: Test with different lock types +-- ============================================ + +-- Test 
with table-level lock +-- SESSION 1: +-- begin; +-- lock table lock_test_table in exclusive mode; + +-- SESSION 2: +-- begin; +-- select * from lock_test_table; -- Will wait + +-- Test with advisory lock +-- SESSION 1: +-- begin; +-- select pg_advisory_lock(12345); + +-- SESSION 2: +-- begin; +-- select pg_advisory_lock(12345); -- Will wait + diff --git a/tests/lock_waits/run_test.sh b/tests/lock_waits/run_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..de45803b0f5757504ad924113f1c1b0f3792fe0b --- /dev/null +++ b/tests/lock_waits/run_test.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Simple wrapper script to run the lock_waits metric test + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +# Default values (can be overridden by environment variables) +TARGET_DB_URL="${TARGET_DB_URL:-postgresql://postgres:postgres@localhost:55432/target_database}" +PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:59090}" +TEST_DBNAME="${TEST_DBNAME:-target_database}" +COLLECTION_WAIT="${COLLECTION_WAIT_SECONDS:-60}" + +echo "==========================================" +echo "Lock Waits Metric Test" +echo "==========================================" +echo "" +echo "Configuration:" +echo " Target DB: $TARGET_DB_URL" +echo " Prometheus URL: $PROMETHEUS_URL" +echo " Test DB Name: $TEST_DBNAME" +echo " Collection Wait: ${COLLECTION_WAIT}s" +echo "" + +# Check if required packages are installed +if ! python3 -c "import psycopg" 2>/dev/null; then + echo "Installing psycopg..." + pip3 install psycopg +fi + +if ! python3 -c "import requests" 2>/dev/null; then + echo "Installing requests..." 
+ pip3 install requests +fi + +# Run the test +cd "$PROJECT_ROOT" +python3 tests/lock_waits/test_lock_waits_metric.py \ + --target-db-url "$TARGET_DB_URL" \ + --prometheus-url "$PROMETHEUS_URL" \ + --test-dbname "$TEST_DBNAME" \ + --collection-wait "$COLLECTION_WAIT" + diff --git a/tests/lock_waits/test_lock_waits_metric.py b/tests/lock_waits/test_lock_waits_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..b4bbaca64804ff4c1a9dbb6abffc7be1748a6a31 --- /dev/null +++ b/tests/lock_waits/test_lock_waits_metric.py @@ -0,0 +1,426 @@ +""" +Test script to verify lock_waits metric collection. + +This script: +1. Creates lock contention scenarios in the target database +2. Waits for pgwatch to collect metrics +3. Verifies the lock_waits metric is collected in Prometheus +4. Validates the data structure and content +""" + +import json +import os +import threading +import time +from datetime import datetime, timezone, timedelta +from typing import Dict, List, Optional + +import psycopg +import requests + + +class LockWaitsTest: + def __init__( + self, + target_db_url: str, + prometheus_url: str, + test_dbname: str = "target_database", + collection_wait_seconds: int = 60, + ): + """ + Initialize the test. 
+ + Args: + target_db_url: Connection string for the target database being monitored + prometheus_url: URL for Prometheus/VictoriaMetrics API + test_dbname: Name of the database being monitored + collection_wait_seconds: How long to wait for pgwatch to collect metrics + """ + self.target_db_url = target_db_url + self.prometheus_url = prometheus_url.rstrip("/") + self.test_dbname = test_dbname + self.collection_wait_seconds = collection_wait_seconds + self.target_conn: Optional[psycopg.Connection] = None + self.blocker_conn: Optional[psycopg.Connection] = None + + def setup(self): + """Set up database connections and test table.""" + print("Setting up test environment...") + + # Connect to target database + self.target_conn = psycopg.connect(self.target_db_url) + self.target_conn.autocommit = True + + # Verify Prometheus is accessible + try: + response = requests.get(f"{self.prometheus_url}/api/v1/status/config", timeout=5) + response.raise_for_status() + print("✓ Prometheus connection verified") + except Exception as e: + print(f"⚠ Warning: Could not verify Prometheus connection: {e}") + + # Create test table + with self.target_conn.cursor() as cur: + cur.execute( + """ + drop table if exists lock_test_table cascade; + create table lock_test_table ( + id int8 generated always as identity primary key, + name text not null, + value numeric(10, 2), + created_at timestamptz default now() + ); + insert into lock_test_table (name, value) + values + ('Item 1', 100.50), + ('Item 2', 200.75), + ('Item 3', 300.25); + """ + ) + print("✓ Test table created") + + def create_lock_contention(self, duration_seconds: int = 30): + """ + Create lock contention by: + 1. Starting a transaction that locks a row + 2. Starting another transaction that tries to lock the same row (will wait) + 3. 
Keeping both transactions open for the specified duration + """ + print(f"\nCreating lock contention for {duration_seconds} seconds...") + + # Connection 1: Blocker - acquires lock and holds it + self.blocker_conn = psycopg.connect(self.target_db_url) + self.blocker_conn.autocommit = False + blocker_cur = self.blocker_conn.cursor() + blocker_cur.execute("begin") + blocker_cur.execute( + "select * from lock_test_table where id = 1 for update" + ) + blocker_cur.fetchone() + print("✓ Blocker transaction started (holding lock on row id=1)") + + # Small delay to ensure blocker has the lock + time.sleep(1) + + # Connection 2: Waiter - tries to acquire same lock (will wait) + waiter_conn = psycopg.connect(self.target_db_url) + waiter_conn.autocommit = False + waiter_cur = waiter_conn.cursor() + waiter_cur.execute("begin") + print("✓ Waiter transaction started (waiting for lock on row id=1)") + + # Execute the waiting query in a separate thread so it can block + waiter_error = [] + waiter_done = threading.Event() + + def run_waiter(): + try: + # This will block until blocker releases the lock + waiter_cur.execute( + "select * from lock_test_table where id = 1 for update" + ) + waiter_cur.fetchone() + print(" ✓ Waiter acquired lock (blocker released)") + except Exception as e: + waiter_error.append(str(e)) + print(f" Waiter error: {e}") + finally: + waiter_done.set() + + waiter_thread = threading.Thread(target=run_waiter, daemon=True) + waiter_thread.start() + + # Give waiter time to start waiting + time.sleep(2) + + # Verify waiter is actually waiting + with self.target_conn.cursor() as check_cur: + check_cur.execute( + """ + select pid, state, wait_event_type, wait_event + from pg_stat_activity + where datname = current_database() + and pid <> pg_backend_pid() + and wait_event_type = 'Lock' + """ + ) + waiting_pids = check_cur.fetchall() + if waiting_pids: + print(f" ✓ Confirmed {len(waiting_pids)} process(es) waiting for locks") + for pid, state, wait_type, wait_event in 
waiting_pids: + print(f" PID {pid}: state={state}, wait_event={wait_event}") + else: + print(" ⚠ No processes found waiting for locks") + + # Keep locks held for the duration + print(f" Holding locks for {duration_seconds} seconds...") + time.sleep(duration_seconds) + + # Cleanup: commit blocker first, then waiter + print(" Releasing blocker lock...") + blocker_cur.execute("commit") + blocker_cur.close() + self.blocker_conn.close() + self.blocker_conn = None + + # Wait for waiter to complete + waiter_done.wait(timeout=5) + try: + waiter_cur.execute("commit") + except Exception: + pass + waiter_cur.close() + waiter_conn.close() + + print("✓ Lock contention ended") + + def verify_metric_collected(self) -> List[Dict]: + """ + Verify that lock_waits metric was collected in Prometheus. + + Returns: + List of lock_waits metric samples found + """ + print("\nVerifying metric collection...") + + # Wait for pgwatch to collect metrics + print(f" Waiting {self.collection_wait_seconds} seconds for pgwatch to collect metrics...") + time.sleep(self.collection_wait_seconds) + + # Query Prometheus for lock_waits metrics + # pgwatch exports metrics with prefix pgwatch__ + metrics_to_check = [ + "pgwatch_lock_waits_waiting_ms", + "pgwatch_lock_waits_blocker_tx_ms", + ] + + records = [] + cutoff_time = datetime.now(timezone.utc) - timedelta(minutes=5) + + for metric_name in metrics_to_check: + try: + # Query for recent samples + query = f'{metric_name}{{datname="{self.test_dbname}"}}' + response = requests.get( + f"{self.prometheus_url}/api/v1/query", + params={ + "query": query, + "time": datetime.now(timezone.utc).timestamp(), + }, + timeout=10, + ) + response.raise_for_status() + data = response.json() + + if data.get("status") == "success" and data.get("data", {}).get("result"): + for result in data["data"]["result"]: + metric = result.get("metric", {}) + value = result.get("value", [None, None]) + + # Convert timestamp + timestamp = float(value[0]) if value[0] else None + if 
timestamp: + metric_time = datetime.fromtimestamp(timestamp, tz=timezone.utc) + if metric_time >= cutoff_time: + records.append( + { + "time": metric_time, + "metric": metric_name, + "labels": metric, + "value": float(value[1]) if value[1] else None, + } + ) + except Exception as e: + print(f" ⚠ Error querying {metric_name}: {e}") + + print(f" ✓ Found {len(records)} lock_waits metric samples") + + return records + + def validate_metric_structure(self, records: List[Dict]) -> bool: + """ + Validate that the metric records have the expected structure. + + Args: + records: List of metric samples to validate + + Returns: + True if validation passes, False otherwise + """ + if not records: + print(" ⚠ No records to validate") + return False + + print("\nValidating metric structure...") + + # Expected labels in Prometheus metrics + expected_labels = [ + "datname", + "waiting_user", + "waiting_appname", + "waiting_table", + "waiting_query_id", + "waiting_mode", + "waiting_locktype", + "waiting_pid", + "blocker_user", + "blocker_appname", + "blocker_table", + "blocker_query_id", + "blocker_mode", + "blocker_locktype", + "blocker_pid", + ] + + all_valid = True + unique_samples = {} + + # Group samples by their label combination + for record in records: + labels = record.get("labels", {}) + # Create a key from relevant labels + key = ( + labels.get("waiting_pid"), + labels.get("blocker_pid"), + labels.get("waiting_table"), + ) + if key not in unique_samples: + unique_samples[key] = record + + print(f" Found {len(unique_samples)} unique lock wait samples") + + for i, (key, record) in enumerate(list(unique_samples.items())[:5]): # Validate first 5 + print(f"\n Sample {i+1}:") + labels = record.get("labels", {}) + metric_name = record.get("metric", "") + value = record.get("value") + + # Check datname matches + if labels.get("datname") != self.test_dbname: + print(f" ⚠ datname mismatch: {labels.get('datname')} != {self.test_dbname}") + else: + print(f" ✓ datname matches: 
{labels.get('datname')}") + + # Check key labels are present + key_labels = ["waiting_pid", "blocker_pid", "waiting_mode", "blocker_mode"] + missing_labels = [label for label in key_labels if not labels.get(label)] + if missing_labels: + print(f" ⚠ Missing key labels: {missing_labels}") + else: + print(f" ✓ Key labels present") + + # Validate metric value + if value is not None: + try: + float(value) + print(f" ✓ Metric value is numeric: {value}") + if "waiting_ms" in metric_name or "blocker_tx_ms" in metric_name: + print(f" Value: {value} ms") + except (ValueError, TypeError): + print(f" ✗ Metric value is not numeric: {value}") + all_valid = False + else: + print(f" ⚠ Metric value is None") + + return all_valid + + def cleanup(self): + """Clean up test resources.""" + print("\nCleaning up...") + + if self.blocker_conn: + try: + self.blocker_conn.close() + except Exception: + pass + + if self.target_conn: + try: + with self.target_conn.cursor() as cur: + cur.execute("drop table if exists lock_test_table cascade") + self.target_conn.close() + except Exception: + pass + + print("✓ Cleanup complete") + + def run(self) -> bool: + """ + Run the complete test. 
+ + Returns: + True if test passes, False otherwise + """ + try: + self.setup() + self.create_lock_contention(duration_seconds=30) + records = self.verify_metric_collected() + is_valid = self.validate_metric_structure(records) + + if is_valid and records: + print("\n✅ Test PASSED: lock_waits metric is working correctly") + return True + else: + print("\n❌ Test FAILED: lock_waits metric validation failed") + return False + + except Exception as e: + print(f"\n❌ Test ERROR: {e}") + import traceback + + traceback.print_exc() + return False + finally: + self.cleanup() + + +def main(): + """Main entry point for the test.""" + import argparse + + parser = argparse.ArgumentParser( + description="Test lock_waits metric collection" + ) + parser.add_argument( + "--target-db-url", + default=os.getenv( + "TARGET_DB_URL", "postgresql://postgres:postgres@localhost:55432/target_database" + ), + help="Target database connection URL", + ) + parser.add_argument( + "--prometheus-url", + default=os.getenv( + "PROMETHEUS_URL", + "http://localhost:59090", + ), + help="Prometheus/VictoriaMetrics API URL", + ) + parser.add_argument( + "--test-dbname", + default=os.getenv("TEST_DBNAME", "target_database"), + help="Name of the database being monitored", + ) + parser.add_argument( + "--collection-wait", + type=int, + default=int(os.getenv("COLLECTION_WAIT_SECONDS", "60")), + help="Seconds to wait for pgwatch to collect metrics", + ) + + args = parser.parse_args() + + test = LockWaitsTest( + target_db_url=args.target_db_url, + prometheus_url=args.prometheus_url, + test_dbname=args.test_dbname, + collection_wait_seconds=args.collection_wait, + ) + + success = test.run() + exit(0 if success else 1) + + +if __name__ == "__main__": + main() +