From 3726066b8fe97265e6b0a7b19a2f953eb63c1269 Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Mon, 1 Dec 2025 23:58:52 +0200 Subject: [PATCH 1/7] feat(monitoring): add lock_waits metric and dashboard Add lock_waits metric to pgwatch-prometheus that provides detailed information about lock waits, including waiting and blocking processes with their queries, users, application names, and wait durations. - Add lock_waits metric with PostgreSQL 14+ support using query_id - Create Grafana dashboard with panels for lock wait counts, durations, lock types, and detailed lock wait information table --- .../dashboards/Dashboard_13_Lock_waits.json | 768 ++++++++++++++++++ config/pgwatch-prometheus/metrics.yml | 50 ++ 2 files changed, 818 insertions(+) create mode 100644 config/grafana/dashboards/Dashboard_13_Lock_waits.json diff --git a/config/grafana/dashboards/Dashboard_13_Lock_waits.json b/config/grafana/dashboards/Dashboard_13_Lock_waits.json new file mode 100644 index 0000000..58e918d --- /dev/null +++ b/config/grafana/dashboards/Dashboard_13_Lock_waits.json @@ -0,0 +1,768 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "postgres", + "uid": "P031DD592934B2F1F" + }, + "description": "Number of active lock waits over time, showing how many processes are waiting for locks.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": -1, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "P031DD592934B2F1F" + }, + "editorMode": "code", + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": "select\n $__timeGroup(time, $agg_interval),\n count(*) as \"Lock waits\"\nfrom lock_waits\nwhere dbname = '$dbname' and $__timeFilter(time)\ngroup by 1\norder by 1", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + }, + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Lock waits count", + "type": "timeseries" + }, + { + "datasource": { + "type": "postgres", + "uid": 
"P031DD592934B2F1F" + }, + "description": "Lock wait duration over time, showing how long processes have been waiting for locks.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 5000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "P031DD592934B2F1F" + }, + "editorMode": "code", + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": "select\n $__timeGroup(time, $agg_interval),\n avg((data->>'waiting_ms')::bigint) as \"Average wait time\",\n max((data->>'waiting_ms')::bigint) as \"Max wait time\"\nfrom lock_waits\nwhere dbname = '$dbname' and $__timeFilter(time)\ngroup by 1\norder by 1", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "sql": { + "columns": [ + { + "parameters": [], + 
"type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + }, + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Lock wait duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "postgres", + "uid": "P031DD592934B2F1F" + }, + "description": "Lock waits by lock type, showing which types of locks are causing the most contention.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "P031DD592934B2F1F" + }, + "editorMode": "code", + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": "select\n $__timeGroup(time, $agg_interval),\n coalesce((data->>'waiting_locktype')::text, 'unknown') as locktype,\n count(*) as 
count\nfrom lock_waits\nwhere dbname = '$dbname' and $__timeFilter(time)\ngroup by 1, 2\norder by 1, 3 desc", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + }, + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Lock waits by lock type", + "type": "timeseries" + }, + { + "datasource": { + "type": "postgres", + "uid": "P031DD592934B2F1F" + }, + "description": "Detailed lock wait information showing waiting and blocking processes, their queries, users, and wait durations.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 5000 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "blocker_pid" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "waiting_pid" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "time" + }, + "properties": [ + { + "id": "custom.width", + "value": 170 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "waiting_ms" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "unit", + "value": "ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_tx_ms" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "unit", + "value": "ms" 
+ } + ] + }, + { + "matcher": { + "id": "byName", + "options": "tag_waiting_query_id" + }, + "properties": [ + { + "id": "custom.inspect", + "value": true + }, + { + "id": "custom.width", + "value": 200 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "tag_blocker_query_id" + }, + "properties": [ + { + "id": "custom.inspect", + "value": true + }, + { + "id": "custom.width", + "value": 200 + } + ] + } + ] + }, + "gridPos": { + "h": 16, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "waiting_ms" + } + ] + }, + "pluginVersion": "10.4.7", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "P031DD592934B2F1F" + }, + "editorMode": "code", + "format": "table", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": "select\n time,\n (data->>'waiting_pid')::text as waiting_pid,\n (tag_data->>'waiting_user')::text as waiting_user,\n (tag_data->>'waiting_appname')::text as waiting_appname,\n (data->>'waiting_mode')::text as waiting_mode,\n (data->>'waiting_locktype')::text as waiting_locktype,\n (tag_data->>'waiting_table')::text as waiting_table,\n (tag_data->>'waiting_query_id')::text as tag_waiting_query_id,\n (data->>'waiting_ms')::bigint as waiting_ms,\n (data->>'blocker_pid')::text as blocker_pid,\n (tag_data->>'blocker_user')::text as blocker_user,\n (tag_data->>'blocker_appname')::text as blocker_appname,\n (data->>'blocker_mode')::text as blocker_mode,\n (data->>'blocker_locktype')::text as blocker_locktype,\n (tag_data->>'blocker_table')::text as blocker_table,\n (tag_data->>'blocker_query_id')::text as tag_blocker_query_id,\n (data->>'blocker_tx_ms')::bigint as blocker_tx_ms\nfrom lock_waits\nwhere dbname = '$dbname' and $__timeFilter(time)\norder by waiting_ms desc\nlimit 1000;", + 
"refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + }, + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Lock waits details", + "type": "table" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [ + "locks", + "performance" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "postgres", + "uid": "P031DD592934B2F1F" + }, + "definition": "SELECT DISTINCT dbname FROM lock_waits ORDER BY 1;", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "dbname", + "options": [], + "query": "SELECT DISTINCT dbname FROM lock_waits ORDER BY 1;", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "auto": true, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "1h", + "value": "1h" + }, + "hide": 0, + "name": "agg_interval", + "options": [ + { + "selected": false, + "text": "auto", + "value": "$__auto_interval_agg_interval" + }, + { + "selected": false, + "text": "10s", + "value": "10s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": true, + "text": "1h", + "value": "1h" + } + ], + "query": "10s, 1m, 5m, 1h", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m" + ] + }, + "timezone": "", + "title": "Lock waits details", + "uid": "lock-waits-details", + "version": 1, + "weekStart": "" +} + diff --git 
a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index c052b18..375f9d0 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -817,6 +817,56 @@ metrics: gauges: - queries statement_timeout_seconds: 15 + lock_waits: + description: > + Retrieves detailed information about lock waits, including waiting and blocking processes with their queries, users, and application names. + It returns waiting and blocker process IDs, lock modes and types, affected tables, queries, and wait/transaction durations. + This metric helps administrators identify and diagnose lock contention issues in detail. + sqls: + 14: |- + with sa_snapshot as ( /* pgwatch_generated */ + select * + from pg_stat_activity + where + datname = current_database() + and pid <> pg_backend_pid() + and state <> 'idle' + ) + select + waiting.pid as waiting_pid, + waiting_stm.usename::text as tag_waiting_user, + waiting_stm.application_name::text as tag_waiting_appname, + waiting.mode as waiting_mode, + waiting.locktype as waiting_locktype, + waiting.relation::regclass::text as tag_waiting_table, + waiting_stm.query_id::text as tag_waiting_query_id, + (extract(epoch from (now() - waiting_stm.state_change)) * 1000)::bigint as waiting_ms, + blocker.pid as blocker_pid, + blocker_stm.usename::text as tag_blocker_user, + blocker_stm.application_name::text as tag_blocker_appname, + blocker.mode as blocker_mode, + blocker.locktype as blocker_locktype, + blocker.relation::regclass::text as tag_blocker_table, + blocker_stm.query_id::text as tag_blocker_query_id, + (extract(epoch from (now() - blocker_stm.xact_start)) * 1000)::bigint as blocker_tx_ms + from pg_catalog.pg_locks as waiting + join sa_snapshot as waiting_stm on waiting_stm.pid = waiting.pid + join pg_catalog.pg_locks as blocker on + waiting.pid <> blocker.pid + and blocker.granted + and waiting.database = blocker.database + and ( + waiting.relation = blocker.relation + or 
waiting.transactionid = blocker.transactionid + ) + join sa_snapshot as blocker_stm on blocker_stm.pid = blocker.pid + where not waiting.granted + order by waiting_ms desc + limit 10000 + gauges: + - waiting_ms + - blocker_tx_ms + statement_timeout_seconds: 15 pg_database_wraparound: sqls: 11: | -- GitLab From 95cb28462bbcfa1badd5bc05186ca9b3a807427d Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Tue, 2 Dec 2025 00:52:08 +0200 Subject: [PATCH 2/7] Added lock_waits to full metrics preset --- config/pgwatch-prometheus/metrics.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 375f9d0..b5917dd 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -2374,6 +2374,7 @@ presets: pg_statio_all_tables: 30 pg_statio_all_indexes: 30 pg_total_relation_size: 30 + lock_waits: 30 pg_blocked: 30 pg_long_running_transactions: 30 pg_stuck_idle_in_transaction: 30 -- GitLab From 0f721850027688daf5e28c601cb39f56eb67cc98 Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Tue, 2 Dec 2025 14:54:27 +0200 Subject: [PATCH 3/7] feat(lock_waits): improve metric and dashboard with table info and query links - Fix SQL to handle NULL database fields for transactionid locks - Add CTE to derive table names from tuple/relation locks when direct relation is NULL - Rename fields from waiting_* to blocked_* for clarity - Update dashboard to use Prometheus datasource with PromQL queries - Add datname column to details table - Add clickable links from query_id columns to Single Query Analysis dashboard - Include blocked_table and blocker_table in metric labels - Add automated test script for lock_waits metric validation --- .../dashboards/Dashboard_13_Lock_waits.json | 863 ++++++++++++------ config/pgwatch-prometheus/metrics.yml | 59 +- tests/lock_waits/README.md | 226 +++++ 
tests/lock_waits/__init__.py | 2 + tests/lock_waits/create_lock_contention.sql | 73 ++ tests/lock_waits/run_test.sh | 44 + tests/lock_waits/test_lock_waits_metric.py | 426 +++++++++ 7 files changed, 1414 insertions(+), 279 deletions(-) create mode 100644 tests/lock_waits/README.md create mode 100644 tests/lock_waits/__init__.py create mode 100644 tests/lock_waits/create_lock_contention.sql create mode 100755 tests/lock_waits/run_test.sh create mode 100644 tests/lock_waits/test_lock_waits_metric.py diff --git a/config/grafana/dashboards/Dashboard_13_Lock_waits.json b/config/grafana/dashboards/Dashboard_13_Lock_waits.json index 58e918d..ba6cd84 100644 --- a/config/grafana/dashboards/Dashboard_13_Lock_waits.json +++ b/config/grafana/dashboards/Dashboard_13_Lock_waits.json @@ -11,27 +11,34 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, "type": "dashboard" } ] }, - "editable": false, + "editable": true, "fiscalYearStartMonth": 0, - "graphTooltip": 0, + "graphTooltip": 1, "id": null, "links": [], "liveNow": false, "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "title": "Lock waits overview", + "type": "row" + }, { "datasource": { - "type": "postgres", - "uid": "P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, "description": "Number of active lock waits over time, showing how many processes are waiting for locks.", "fieldConfig": { @@ -45,7 +52,9 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "barAlignment": -1, + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 1, "drawStyle": "bars", "fillOpacity": 100, "gradientMode": "opacity", @@ -101,7 +110,7 @@ "h": 8, "w": 24, "x": 0, - "y": 0 + "y": 1 }, "id": 1, "options": { @@ -122,51 +131,14 @@ "targets": [ { "datasource": { - "type": "postgres", - "uid": 
"P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, "editorMode": "code", - "format": "time_series", - "group": [], - "metricColumn": "none", - "rawQuery": true, - "rawSql": "select\n $__timeGroup(time, $agg_interval),\n count(*) as \"Lock waits\"\nfrom lock_waits\nwhere dbname = '$dbname' and $__timeFilter(time)\ngroup by 1\norder by 1", - "refId": "A", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "column" - } - ] - ], - "sql": { - "columns": [ - { - "parameters": [], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" - }, - "type": "groupBy" - } - ], - "limit": 50 - }, - "timeColumn": "time", - "where": [ - { - "name": "$__timeFilter", - "params": [], - "type": "macro" - } - ] + "expr": "count(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"}) or vector(0)", + "legendFormat": "Lock waits", + "range": true, + "refId": "A" } ], "title": "Lock waits count", @@ -174,8 +146,8 @@ }, { "datasource": { - "type": "postgres", - "uid": "P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, "description": "Lock wait duration over time, showing how long processes have been waiting for locks.", "fieldConfig": { @@ -189,9 +161,10 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 20, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, @@ -200,7 +173,7 @@ }, "insertNulls": false, "lineInterpolation": "linear", - "lineWidth": 2, + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -242,7 +215,7 @@ "h": 8, "w": 12, "x": 0, - "y": 8 + "y": 9 }, "id": 2, "options": { @@ -264,51 +237,25 @@ "targets": [ { "datasource": { - "type": "postgres", - "uid": "P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, "editorMode": "code", - "format": "time_series", - "group": [], - 
"metricColumn": "none", - "rawQuery": true, - "rawSql": "select\n $__timeGroup(time, $agg_interval),\n avg((data->>'waiting_ms')::bigint) as \"Average wait time\",\n max((data->>'waiting_ms')::bigint) as \"Max wait time\"\nfrom lock_waits\nwhere dbname = '$dbname' and $__timeFilter(time)\ngroup by 1\norder by 1", - "refId": "A", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "column" - } - ] - ], - "sql": { - "columns": [ - { - "parameters": [], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" - }, - "type": "groupBy" - } - ], - "limit": 50 - }, - "timeColumn": "time", - "where": [ - { - "name": "$__timeFilter", - "params": [], - "type": "macro" - } - ] + "expr": "avg(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "Average wait time", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "max(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "Max wait time", + "range": true, + "refId": "B" } ], "title": "Lock wait duration", @@ -316,8 +263,8 @@ }, { "datasource": { - "type": "postgres", - "uid": "P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, "description": "Lock waits by lock type, showing which types of locks are causing the most contention.", "fieldConfig": { @@ -331,7 +278,9 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "axisSoftMin": 0, "barAlignment": 0, + "barWidthFactor": 1, "drawStyle": "bars", "fillOpacity": 100, "gradientMode": "opacity", @@ -376,7 +325,7 @@ "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 9 }, "id": 3, "options": { @@ -397,62 +346,253 @@ "targets": [ { "datasource": { - "type": "postgres", - "uid": "P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, "editorMode": "code", 
- "format": "time_series", - "group": [], - "metricColumn": "none", - "rawQuery": true, - "rawSql": "select\n $__timeGroup(time, $agg_interval),\n coalesce((data->>'waiting_locktype')::text, 'unknown') as locktype,\n count(*) as count\nfrom lock_waits\nwhere dbname = '$dbname' and $__timeFilter(time)\ngroup by 1, 2\norder by 1, 3 desc", - "refId": "A", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "column" + "expr": "count by (blocked_locktype) (pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "{{blocked_locktype}}", + "range": true, + "refId": "A" + } + ], + "title": "Lock waits by lock type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Blocker transaction duration over time, showing how long blocking transactions have been running.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5000 + }, + { + "color": "red", + "value": 30000 } ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 5, + "options": { + "legend": { + 
"calcs": [ + "lastNotNull", + "max", + "mean" ], - "sql": { - "columns": [ - { - "parameters": [], - "type": "function" - } - ], - "groupBy": [ + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "avg(pgwatch_lock_waits_blocker_tx_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "Average blocker tx duration", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "max(pgwatch_lock_waits_blocker_tx_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"})", + "legendFormat": "Max blocker tx duration", + "range": true, + "refId": "B" + } + ], + "title": "Blocker transaction duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "description": "Lock waits by blocked table, showing which tables have the most lock contention.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 1, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - 
"property": { - "type": "string" - }, - "type": "groupBy" + "color": "green", + "value": null } - ], - "limit": 50 - }, - "timeColumn": "time", - "where": [ - { - "name": "$__timeFilter", - "params": [], - "type": "macro" - } - ] + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "count by (blocked_table) (pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", blocked_table!=\"\"})", + "legendFormat": "{{blocked_table}}", + "range": true, + "refId": "A" } ], - "title": "Lock waits by lock type", + "title": "Lock waits by table", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 11, + "panels": [], + "title": "Lock waits details", + "type": "row" + }, { "datasource": { - "type": "postgres", - "uid": "P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, - "description": "Detailed lock wait information showing waiting and blocking processes, their queries, users, and wait durations.", + "description": "Detailed lock wait information showing blocked and blocking processes with their users, applications, lock types, tables, query IDs, and wait durations.", "fieldConfig": { "defaults": { "color": { @@ -489,59 +629,103 @@ { "matcher": { "id": "byName", - "options": "blocker_pid" + "options": "blocked_pid" }, "properties": [ { "id": "custom.width", - "value": 110 + "value": 90 + }, + { + "id": "displayName", + "value": "Blocked PID" } ] }, { "matcher": { "id": "byName", - "options": "waiting_pid" + "options": "blocker_pid" }, 
"properties": [ { "id": "custom.width", - "value": 110 + "value": 90 + }, + { + "id": "displayName", + "value": "Blocker PID" } ] }, { "matcher": { "id": "byName", - "options": "time" + "options": "Time" }, "properties": [ { "id": "custom.width", - "value": 170 + "value": 160 } ] }, { "matcher": { "id": "byName", - "options": "waiting_ms" + "options": "Value" }, "properties": [ { "id": "custom.width", - "value": 120 + "value": 100 }, { "id": "unit", "value": "ms" + }, + { + "id": "displayName", + "value": "Blocked ms" } ] }, { "matcher": { "id": "byName", - "options": "blocker_tx_ms" + "options": "blocked_user" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + }, + { + "id": "displayName", + "value": "Blocked User" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_user" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + }, + { + "id": "displayName", + "value": "Blocker User" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_appname" }, "properties": [ { @@ -549,40 +733,186 @@ "value": 120 }, { - "id": "unit", - "value": "ms" + "id": "displayName", + "value": "Blocked App" } ] }, { "matcher": { "id": "byName", - "options": "tag_waiting_query_id" + "options": "blocker_appname" }, "properties": [ { - "id": "custom.inspect", - "value": true + "id": "custom.width", + "value": 120 }, + { + "id": "displayName", + "value": "Blocker App" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_locktype" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocked Lock Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_locktype" + }, + "properties": [ + { + "id": "custom.width", + "value": 110 + }, + { + "id": "displayName", + "value": "Blocker Lock Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_mode" + }, + "properties": [ { "id": "custom.width", - "value": 200 + 
"value": 110 + }, + { + "id": "displayName", + "value": "Blocked Mode" } ] }, { "matcher": { "id": "byName", - "options": "tag_blocker_query_id" + "options": "blocker_mode" }, "properties": [ { - "id": "custom.inspect", - "value": true + "id": "custom.width", + "value": 110 }, + { + "id": "displayName", + "value": "Blocker Mode" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_table" + }, + "properties": [ { "id": "custom.width", - "value": 200 + "value": 130 + }, + { + "id": "displayName", + "value": "Blocked Table" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_table" + }, + "properties": [ + { + "id": "custom.width", + "value": 130 + }, + { + "id": "displayName", + "value": "Blocker Table" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "datname" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + }, + { + "id": "displayName", + "value": "Database" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocked_query_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 180 + }, + { + "id": "displayName", + "value": "Blocked Query ID" + }, + { + "id": "links", + "value": [ + { + "title": "View query analysis", + "url": "/d/db52944d-b025-4e18-b70b-89c0af3e7e41/03-single-queryid-analysis?var-cluster_name=${cluster_name}&var-node_name=${node_name}&var-db_name=${db_name}&var-query_id=${__value.raw}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "blocker_query_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 180 + }, + { + "id": "displayName", + "value": "Blocker Query ID" + }, + { + "id": "links", + "value": [ + { + "title": "View query analysis", + "url": "/d/db52944d-b025-4e18-b70b-89c0af3e7e41/03-single-queryid-analysis?var-cluster_name=${cluster_name}&var-node_name=${node_name}&var-db_name=${db_name}&var-query_id=${__value.raw}" + } + ] } ] } @@ -592,7 +922,7 @@ "h": 16, "w": 24, "x": 0, - "y": 16 + "y": 26 }, "id": 4, 
"options": { @@ -610,7 +940,7 @@ "sortBy": [ { "desc": true, - "displayName": "waiting_ms" + "displayName": "Blocked ms" } ] }, @@ -618,62 +948,68 @@ "targets": [ { "datasource": { - "type": "postgres", - "uid": "P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, "editorMode": "code", + "exemplar": false, + "expr": "pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"}", "format": "table", - "group": [], - "metricColumn": "none", - "rawQuery": true, - "rawSql": "select\n time,\n (data->>'waiting_pid')::text as waiting_pid,\n (tag_data->>'waiting_user')::text as waiting_user,\n (tag_data->>'waiting_appname')::text as waiting_appname,\n (data->>'waiting_mode')::text as waiting_mode,\n (data->>'waiting_locktype')::text as waiting_locktype,\n (tag_data->>'waiting_table')::text as waiting_table,\n (tag_data->>'waiting_query_id')::text as tag_waiting_query_id,\n (data->>'waiting_ms')::bigint as waiting_ms,\n (data->>'blocker_pid')::text as blocker_pid,\n (tag_data->>'blocker_user')::text as blocker_user,\n (tag_data->>'blocker_appname')::text as blocker_appname,\n (data->>'blocker_mode')::text as blocker_mode,\n (data->>'blocker_locktype')::text as blocker_locktype,\n (tag_data->>'blocker_table')::text as blocker_table,\n (tag_data->>'blocker_query_id')::text as tag_blocker_query_id,\n (data->>'blocker_tx_ms')::bigint as blocker_tx_ms\nfrom lock_waits\nwhere dbname = '$dbname' and $__timeFilter(time)\norder by waiting_ms desc\nlimit 1000;", - "refId": "A", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "column" - } - ] - ], - "sql": { - "columns": [ - { - "parameters": [], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" - }, - "type": "groupBy" - } - ], - "limit": 50 - }, - "timeColumn": "time", - "where": [ - { - "name": "$__timeFilter", - "params": [], - "type": "macro" - } - ] + "instant": true, + "legendFormat": "__auto", + "range": false, + 
"refId": "A" } ], "title": "Lock waits details", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "cluster": true, + "dbname": true, + "env": true, + "instance": true, + "job": true, + "node_name": true, + "real_dbname": true, + "sink_type": true, + "sys_id": true + }, + "indexByName": { + "Time": 0, + "datname": 1, + "blocked_pid": 2, + "blocked_user": 3, + "blocked_appname": 4, + "blocked_locktype": 5, + "blocked_mode": 6, + "blocked_table": 7, + "blocked_query_id": 8, + "Value": 9, + "blocker_pid": 10, + "blocker_user": 11, + "blocker_appname": 12, + "blocker_locktype": 13, + "blocker_mode": 14, + "blocker_table": 15, + "blocker_query_id": 16 + }, + "renameByName": {} + } + } + ], "type": "table" } ], - "refresh": "", + "refresh": "30s", "schemaVersion": 39, "tags": [ "locks", - "performance" + "performance", + "postgres" ], "templating": { "list": [ @@ -684,16 +1020,20 @@ "value": "" }, "datasource": { - "type": "postgres", - "uid": "P031DD592934B2F1F" + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" }, - "definition": "SELECT DISTINCT dbname FROM lock_waits ORDER BY 1;", + "definition": "label_values(pgwatch_db_size_size_b,cluster)", "hide": 0, "includeAll": false, "multi": false, - "name": "dbname", + "name": "cluster_name", "options": [], - "query": "SELECT DISTINCT dbname FROM lock_waits ORDER BY 1;", + "query": { + "qryType": 1, + "query": "label_values(pgwatch_db_size_size_b,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, "refresh": 1, "regex": "", "skipUrlSync": false, @@ -701,53 +1041,63 @@ "type": "query" }, { - "auto": true, - "auto_count": 30, - "auto_min": "10s", "current": { "selected": false, - "text": "1h", - "value": "1h" + "text": "", + "value": "" }, + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", "hide": 0, - "name": "agg_interval", - 
"options": [ - { - "selected": false, - "text": "auto", - "value": "$__auto_interval_agg_interval" - }, - { - "selected": false, - "text": "10s", - "value": "10s" - }, - { - "selected": false, - "text": "1m", - "value": "1m" - }, - { - "selected": false, - "text": "5m", - "value": "5m" - }, - { - "selected": true, - "text": "1h", - "value": "1h" - } - ], - "query": "10s, 1m, 5m, 1h", - "queryValue": "", - "refresh": 2, + "includeAll": false, + "multi": false, + "name": "node_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\"},node_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "definition": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "db_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_db_size_size_b{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", "skipUrlSync": false, - "type": "interval" + "sort": 0, + "type": "query" } ] }, "time": { - "from": "now-2d", + "from": "now-1h", "to": "now" }, "timepicker": { @@ -765,4 +1115,3 @@ "version": 1, "weekStart": "" } - diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index b5917dd..0c550a6 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -819,8 +819,8 @@ metrics: statement_timeout_seconds: 15 lock_waits: description: > - Retrieves detailed information about lock waits, including waiting and 
blocking processes with their queries, users, and application names. - It returns waiting and blocker process IDs, lock modes and types, affected tables, queries, and wait/transaction durations. + Retrieves detailed information about lock waits, including blocked and blocking processes with their queries, users, and application names. + It returns blocked and blocker process IDs, lock modes and types, affected tables, queries, and wait/transaction durations. This metric helps administrators identify and diagnose lock contention issues in detail. sqls: 14: |- @@ -830,41 +830,56 @@ metrics: where datname = current_database() and pid <> pg_backend_pid() - and state <> 'idle' + and state in ('active', 'idle in transaction', 'idle in transaction (aborted)') + ), + pid_tables as ( + select distinct on (pid) pid, relation::regclass::text as table_name + from pg_catalog.pg_locks + where relation is not null + and locktype in ('tuple', 'relation') + and relation::regclass::text not like '%_pkey' + and relation::regclass::text not like '%_idx' + order by pid, locktype ) select - waiting.pid as waiting_pid, - waiting_stm.usename::text as tag_waiting_user, - waiting_stm.application_name::text as tag_waiting_appname, - waiting.mode as waiting_mode, - waiting.locktype as waiting_locktype, - waiting.relation::regclass::text as tag_waiting_table, - waiting_stm.query_id::text as tag_waiting_query_id, - (extract(epoch from (now() - waiting_stm.state_change)) * 1000)::bigint as waiting_ms, + blocked.pid as blocked_pid, + current_database() as tag_datname, + blocked_stm.usename::text as tag_blocked_user, + blocked_stm.application_name::text as tag_blocked_appname, + blocked.mode as blocked_mode, + blocked.locktype as blocked_locktype, + coalesce(blocked.relation::regclass::text, blocked_tbl.table_name, '') as tag_blocked_table, + blocked_stm.query_id::text as tag_blocked_query_id, + (extract(epoch from (clock_timestamp() - blocked_stm.state_change)) * 1000)::bigint as blocked_ms, 
blocker.pid as blocker_pid, blocker_stm.usename::text as tag_blocker_user, blocker_stm.application_name::text as tag_blocker_appname, blocker.mode as blocker_mode, blocker.locktype as blocker_locktype, - blocker.relation::regclass::text as tag_blocker_table, + coalesce(blocker.relation::regclass::text, blocker_tbl.table_name, '') as tag_blocker_table, blocker_stm.query_id::text as tag_blocker_query_id, - (extract(epoch from (now() - blocker_stm.xact_start)) * 1000)::bigint as blocker_tx_ms - from pg_catalog.pg_locks as waiting - join sa_snapshot as waiting_stm on waiting_stm.pid = waiting.pid + (extract(epoch from (clock_timestamp() - blocker_stm.xact_start)) * 1000)::bigint as blocker_tx_ms + from pg_catalog.pg_locks as blocked + join sa_snapshot as blocked_stm on blocked_stm.pid = blocked.pid join pg_catalog.pg_locks as blocker on - waiting.pid <> blocker.pid + blocked.pid <> blocker.pid and blocker.granted - and waiting.database = blocker.database and ( - waiting.relation = blocker.relation - or waiting.transactionid = blocker.transactionid + (blocked.database = blocker.database) + or (blocked.database is null and blocker.database is null) + ) + and ( + blocked.relation = blocker.relation + or blocked.transactionid = blocker.transactionid ) join sa_snapshot as blocker_stm on blocker_stm.pid = blocker.pid - where not waiting.granted - order by waiting_ms desc + left join pid_tables as blocked_tbl on blocked_tbl.pid = blocked.pid + left join pid_tables as blocker_tbl on blocker_tbl.pid = blocker.pid + where not blocked.granted + order by blocked_ms desc limit 10000 gauges: - - waiting_ms + - blocked_ms - blocker_tx_ms statement_timeout_seconds: 15 pg_database_wraparound: diff --git a/tests/lock_waits/README.md b/tests/lock_waits/README.md new file mode 100644 index 0000000..3838b5c --- /dev/null +++ b/tests/lock_waits/README.md @@ -0,0 +1,226 @@ +# Lock Waits Metric Testing + +This directory contains tests and scripts to verify that the `lock_waits` metric is 
working correctly. + +## Overview + +The `lock_waits` metric collects detailed information about lock waits in PostgreSQL, including: +- Waiting and blocking process IDs +- User names and application names +- Lock modes and types +- Affected tables +- Query IDs (PostgreSQL 14+) +- Wait durations and blocker transaction durations + +## Test Components + +### 1. Python Test Script (`test_lock_waits_metric.py`) + +Automated test that: +- Creates lock contention scenarios in the target database +- Waits for pgwatch to collect metrics +- Verifies the metric is collected in Prometheus/VictoriaMetrics +- Validates the metric structure and labels + +### 2. SQL Script (`create_lock_contention.sql`) + +Manual SQL script to create lock contention for testing. Can be run in multiple psql sessions. + +## Prerequisites + +1. Docker Compose stack running: + ```bash + docker-compose up -d + ``` + +2. Python dependencies: + ```bash + pip install psycopg requests + ``` + +3. Ensure `lock_waits` metric is enabled in pgwatch configuration: + - Check `config/pgwatch-prometheus/metrics.yml` includes `lock_waits` + - Verify pgwatch is collecting metrics from the target database + +## Running the Automated Test + +### Basic Usage + +```bash +# From the project root +python tests/lock_waits/test_lock_waits_metric.py +``` + +### With Custom Configuration + +```bash +python tests/lock_waits/test_lock_waits_metric.py \ + --target-db-url "postgresql://postgres:postgres@localhost:55432/target_database" \ + --prometheus-url "http://localhost:59090" \ + --test-dbname "target_database" \ + --collection-wait 90 +``` + +### Environment Variables + +You can also set these via environment variables: + +```bash +export TARGET_DB_URL="postgresql://postgres:postgres@localhost:55432/target_database" +export PROMETHEUS_URL="http://localhost:59090" +export TEST_DBNAME="target_database" +export COLLECTION_WAIT_SECONDS=90 + +python tests/lock_waits/test_lock_waits_metric.py +``` + +## Manual Testing + +### 
Step 1: Create Lock Contention + +Open two psql sessions to the target database: + +**Session 1 (Blocker):** +```sql +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +-- Keep this transaction open +``` + +**Session 2 (Waiter):** +```sql +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +-- This will wait for Session 1 to release the lock +``` + +### Step 2: Verify Metric Collection + +Wait for pgwatch to collect metrics (check collection interval in pgwatch config, typically 15-30 seconds), then query Prometheus: + +```bash +# Query Prometheus API for lock_waits metrics +curl -G "http://localhost:59090/api/v1/query" --data-urlencode 'query=pgwatch_lock_waits_blocked_ms{datname="target_database"}' + +# Or use PromQL in Grafana Explore +pgwatch_lock_waits_blocked_ms{datname="target_database"} +pgwatch_lock_waits_blocker_tx_ms{datname="target_database"} +``` + +### Step 3: Check Grafana Dashboard + +1. Open Grafana: http://localhost:3000 +2. Navigate to "Lock waits details" dashboard +3. Select the cluster, node, and database from the dropdowns +4. Verify that lock wait events appear in the panels + +## Expected Results + +### Successful Test Output + +``` +Setting up test environment... +✓ Test table created + +Creating lock contention for 30 seconds... +✓ Blocker transaction started (holding lock on row id=1) +✓ Waiter transaction started (waiting for lock on row id=1) + Holding locks for 30 seconds... +✓ Lock contention ended + +Verifying metric collection... + Waiting 60 seconds for pgwatch to collect metrics... + ✓ Found 5 lock_waits records + +Validating metric structure... 
+ + Record 1: + ✓ All required data fields present + ✓ blocked_ms is numeric: 25000 ms + ✓ blocker_tx_ms is numeric: 30000 ms + +✅ Test PASSED: lock_waits metric is working correctly +``` + +## Troubleshooting + +### No Records Found + +- **Check pgwatch is running**: `docker ps | grep pgwatch-prometheus` +- **Check pgwatch logs**: `docker logs pgwatch-prometheus` +- **Verify metric is enabled**: Check `config/pgwatch-prometheus/metrics.yml` +- **Check Prometheus is accessible**: `curl http://localhost:59090/api/v1/status/config` +- **Increase wait time**: Use `--collection-wait 120` to wait longer +- **Check database name**: Ensure `--test-dbname` matches the monitored database +- **Verify metrics exist**: `curl "http://localhost:59090/api/v1/label/__name__/values" | grep lock_waits` + +### Invalid Data Structure + +- **Check PostgreSQL version**: Metric requires PostgreSQL 14+ for query_id support +- **Verify metric SQL**: Check the SQL query in `metrics.yml` is correct +- **Check pgwatch version**: Ensure pgwatch version supports the metric format +- **Check Prometheus labels**: Verify metrics have expected labels (datname, blocked_user, blocker_user, etc.) 
+ +### Connection Errors + +- **Verify Docker containers**: `docker-compose ps` +- **Check connection strings**: Verify URLs match your docker-compose configuration +- **Check Prometheus URL**: Ensure Prometheus/VictoriaMetrics is accessible at the specified URL +- **Check network**: Ensure containers can communicate (same Docker network) + +## Integration with CI/CD + +The test can be integrated into CI/CD pipelines: + +```yaml +# Example GitLab CI +test_lock_waits: + stage: test + script: + - docker-compose up -d + - sleep 30 # Wait for services to start + - pip install psycopg requests + - python tests/lock_waits/test_lock_waits_metric.py + --target-db-url "$TARGET_DB_URL" + --prometheus-url "$PROMETHEUS_URL" + --collection-wait 90 + only: + - merge_requests + - main +``` + +## Additional Test Scenarios + +### Test Different Lock Types + +Modify the test to create different types of locks: + +```sql +-- Table-level lock +LOCK TABLE lock_test_table IN EXCLUSIVE MODE; + +-- Advisory lock +SELECT pg_advisory_lock(12345); +``` + +### Test Multiple Concurrent Waits + +Create multiple waiting transactions to test the LIMIT clause: + +```sql +-- Session 1: Blocker +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; + +-- Sessions 2-10: Multiple waiters +-- Each in separate psql session +BEGIN; +SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE; +``` + +## Related Files + +- `config/pgwatch-prometheus/metrics.yml` - Metric definition +- `config/grafana/dashboards/Dashboard_13_Lock_waits.json` - Grafana dashboard +- `workload_examples/lock_wait_test.sql` - Basic lock test SQL + diff --git a/tests/lock_waits/__init__.py b/tests/lock_waits/__init__.py new file mode 100644 index 0000000..228403c --- /dev/null +++ b/tests/lock_waits/__init__.py @@ -0,0 +1,2 @@ +# Lock waits metric testing package + diff --git a/tests/lock_waits/create_lock_contention.sql b/tests/lock_waits/create_lock_contention.sql new file mode 100644 index 0000000..5e5da7a --- /dev/null +++ 
b/tests/lock_waits/create_lock_contention.sql @@ -0,0 +1,73 @@ +-- SQL script to manually create lock contention for testing lock_waits metric +-- +-- Usage: +-- 1. Run this script in Session 1 (blocker) +-- 2. Run the same script in Session 2 (waiter) - it will wait +-- 3. Check the sink database for lock_waits records +-- 4. Commit or rollback Session 1 to release the lock + +-- Create test table if it doesn't exist +drop table if exists lock_test_table cascade; +create table lock_test_table ( + id int8 generated always as identity primary key, + name text not null, + value numeric(10, 2), + created_at timestamptz default now() +); + +insert into lock_test_table (name, value) +values + ('Item 1', 100.50), + ('Item 2', 200.75), + ('Item 3', 300.25); + +-- ============================================ +-- SESSION 1 (BLOCKER) - Run this first +-- ============================================ +begin; + +-- Acquire exclusive lock on row id=1 +-- Keep this transaction open to hold the lock +select * from lock_test_table where id = 1 for update; + +-- Transaction is now holding the lock +-- DO NOT COMMIT YET - keep this session open + +-- ============================================ +-- SESSION 2 (WAITER) - Run this in another psql session +-- ============================================ +begin; + +-- This will wait for Session 1 to release the lock +select * from lock_test_table where id = 1 for update; + +-- This query will block until Session 1 commits or rolls back +-- You should see it waiting in pg_stat_activity + +-- ============================================ +-- To release the lock, commit or rollback Session 1: +-- ============================================ +-- commit; -- or rollback; + +-- ============================================ +-- Alternative: Test with different lock types +-- ============================================ + +-- Test with table-level lock +-- SESSION 1: +-- begin; +-- lock table lock_test_table in exclusive mode; + +-- SESSION 2: +-- 
begin; +-- select * from lock_test_table; -- Will wait + +-- Test with advisory lock +-- SESSION 1: +-- begin; +-- select pg_advisory_lock(12345); + +-- SESSION 2: +-- begin; +-- select pg_advisory_lock(12345); -- Will wait + diff --git a/tests/lock_waits/run_test.sh b/tests/lock_waits/run_test.sh new file mode 100755 index 0000000..de45803 --- /dev/null +++ b/tests/lock_waits/run_test.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Simple wrapper script to run the lock_waits metric test + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +# Default values (can be overridden by environment variables) +TARGET_DB_URL="${TARGET_DB_URL:-postgresql://postgres:postgres@localhost:55432/target_database}" +PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:59090}" +TEST_DBNAME="${TEST_DBNAME:-target_database}" +COLLECTION_WAIT="${COLLECTION_WAIT_SECONDS:-60}" + +echo "==========================================" +echo "Lock Waits Metric Test" +echo "==========================================" +echo "" +echo "Configuration:" +echo " Target DB: $TARGET_DB_URL" +echo " Prometheus URL: $PROMETHEUS_URL" +echo " Test DB Name: $TEST_DBNAME" +echo " Collection Wait: ${COLLECTION_WAIT}s" +echo "" + +# Check if required packages are installed +if ! python3 -c "import psycopg" 2>/dev/null; then + echo "Installing psycopg..." + pip3 install psycopg +fi + +if ! python3 -c "import requests" 2>/dev/null; then + echo "Installing requests..." 
+ pip3 install requests +fi + +# Run the test +cd "$PROJECT_ROOT" +python3 tests/lock_waits/test_lock_waits_metric.py \ + --target-db-url "$TARGET_DB_URL" \ + --prometheus-url "$PROMETHEUS_URL" \ + --test-dbname "$TEST_DBNAME" \ + --collection-wait "$COLLECTION_WAIT" + diff --git a/tests/lock_waits/test_lock_waits_metric.py b/tests/lock_waits/test_lock_waits_metric.py new file mode 100644 index 0000000..b4bbaca --- /dev/null +++ b/tests/lock_waits/test_lock_waits_metric.py @@ -0,0 +1,426 @@ +""" +Test script to verify lock_waits metric collection. + +This script: +1. Creates lock contention scenarios in the target database +2. Waits for pgwatch to collect metrics +3. Verifies the lock_waits metric is collected in Prometheus +4. Validates the data structure and content +""" + +import json +import os +import threading +import time +from datetime import datetime, timezone, timedelta +from typing import Dict, List, Optional + +import psycopg +import requests + + +class LockWaitsTest: + def __init__( + self, + target_db_url: str, + prometheus_url: str, + test_dbname: str = "target_database", + collection_wait_seconds: int = 60, + ): + """ + Initialize the test. 
+ + Args: + target_db_url: Connection string for the target database being monitored + prometheus_url: URL for Prometheus/VictoriaMetrics API + test_dbname: Name of the database being monitored + collection_wait_seconds: How long to wait for pgwatch to collect metrics + """ + self.target_db_url = target_db_url + self.prometheus_url = prometheus_url.rstrip("/") + self.test_dbname = test_dbname + self.collection_wait_seconds = collection_wait_seconds + self.target_conn: Optional[psycopg.Connection] = None + self.blocker_conn: Optional[psycopg.Connection] = None + + def setup(self): + """Set up database connections and test table.""" + print("Setting up test environment...") + + # Connect to target database + self.target_conn = psycopg.connect(self.target_db_url) + self.target_conn.autocommit = True + + # Verify Prometheus is accessible + try: + response = requests.get(f"{self.prometheus_url}/api/v1/status/config", timeout=5) + response.raise_for_status() + print("✓ Prometheus connection verified") + except Exception as e: + print(f"⚠ Warning: Could not verify Prometheus connection: {e}") + + # Create test table + with self.target_conn.cursor() as cur: + cur.execute( + """ + drop table if exists lock_test_table cascade; + create table lock_test_table ( + id int8 generated always as identity primary key, + name text not null, + value numeric(10, 2), + created_at timestamptz default now() + ); + insert into lock_test_table (name, value) + values + ('Item 1', 100.50), + ('Item 2', 200.75), + ('Item 3', 300.25); + """ + ) + print("✓ Test table created") + + def create_lock_contention(self, duration_seconds: int = 30): + """ + Create lock contention by: + 1. Starting a transaction that locks a row + 2. Starting another transaction that tries to lock the same row (will wait) + 3. 
Keeping both transactions open for the specified duration + """ + print(f"\nCreating lock contention for {duration_seconds} seconds...") + + # Connection 1: Blocker - acquires lock and holds it + self.blocker_conn = psycopg.connect(self.target_db_url) + self.blocker_conn.autocommit = False + blocker_cur = self.blocker_conn.cursor() + blocker_cur.execute("begin") + blocker_cur.execute( + "select * from lock_test_table where id = 1 for update" + ) + blocker_cur.fetchone() + print("✓ Blocker transaction started (holding lock on row id=1)") + + # Small delay to ensure blocker has the lock + time.sleep(1) + + # Connection 2: Waiter - tries to acquire same lock (will wait) + waiter_conn = psycopg.connect(self.target_db_url) + waiter_conn.autocommit = False + waiter_cur = waiter_conn.cursor() + waiter_cur.execute("begin") + print("✓ Waiter transaction started (waiting for lock on row id=1)") + + # Execute the waiting query in a separate thread so it can block + waiter_error = [] + waiter_done = threading.Event() + + def run_waiter(): + try: + # This will block until blocker releases the lock + waiter_cur.execute( + "select * from lock_test_table where id = 1 for update" + ) + waiter_cur.fetchone() + print(" ✓ Waiter acquired lock (blocker released)") + except Exception as e: + waiter_error.append(str(e)) + print(f" Waiter error: {e}") + finally: + waiter_done.set() + + waiter_thread = threading.Thread(target=run_waiter, daemon=True) + waiter_thread.start() + + # Give waiter time to start waiting + time.sleep(2) + + # Verify waiter is actually waiting + with self.target_conn.cursor() as check_cur: + check_cur.execute( + """ + select pid, state, wait_event_type, wait_event + from pg_stat_activity + where datname = current_database() + and pid <> pg_backend_pid() + and wait_event_type = 'Lock' + """ + ) + waiting_pids = check_cur.fetchall() + if waiting_pids: + print(f" ✓ Confirmed {len(waiting_pids)} process(es) waiting for locks") + for pid, state, wait_type, wait_event in 
waiting_pids: + print(f" PID {pid}: state={state}, wait_event={wait_event}") + else: + print(" ⚠ No processes found waiting for locks") + + # Keep locks held for the duration + print(f" Holding locks for {duration_seconds} seconds...") + time.sleep(duration_seconds) + + # Cleanup: commit blocker first, then waiter + print(" Releasing blocker lock...") + blocker_cur.execute("commit") + blocker_cur.close() + self.blocker_conn.close() + self.blocker_conn = None + + # Wait for waiter to complete + waiter_done.wait(timeout=5) + try: + waiter_cur.execute("commit") + except Exception: + pass + waiter_cur.close() + waiter_conn.close() + + print("✓ Lock contention ended") + + def verify_metric_collected(self) -> List[Dict]: + """ + Verify that lock_waits metric was collected in Prometheus. + + Returns: + List of lock_waits metric samples found + """ + print("\nVerifying metric collection...") + + # Wait for pgwatch to collect metrics + print(f" Waiting {self.collection_wait_seconds} seconds for pgwatch to collect metrics...") + time.sleep(self.collection_wait_seconds) + + # Query Prometheus for lock_waits metrics + # pgwatch exports metrics with prefix pgwatch__ + metrics_to_check = [ + "pgwatch_lock_waits_waiting_ms", + "pgwatch_lock_waits_blocker_tx_ms", + ] + + records = [] + cutoff_time = datetime.now(timezone.utc) - timedelta(minutes=5) + + for metric_name in metrics_to_check: + try: + # Query for recent samples + query = f'{metric_name}{{datname="{self.test_dbname}"}}' + response = requests.get( + f"{self.prometheus_url}/api/v1/query", + params={ + "query": query, + "time": datetime.now(timezone.utc).timestamp(), + }, + timeout=10, + ) + response.raise_for_status() + data = response.json() + + if data.get("status") == "success" and data.get("data", {}).get("result"): + for result in data["data"]["result"]: + metric = result.get("metric", {}) + value = result.get("value", [None, None]) + + # Convert timestamp + timestamp = float(value[0]) if value[0] else None + if 
timestamp: + metric_time = datetime.fromtimestamp(timestamp, tz=timezone.utc) + if metric_time >= cutoff_time: + records.append( + { + "time": metric_time, + "metric": metric_name, + "labels": metric, + "value": float(value[1]) if value[1] else None, + } + ) + except Exception as e: + print(f" ⚠ Error querying {metric_name}: {e}") + + print(f" ✓ Found {len(records)} lock_waits metric samples") + + return records + + def validate_metric_structure(self, records: List[Dict]) -> bool: + """ + Validate that the metric records have the expected structure. + + Args: + records: List of metric samples to validate + + Returns: + True if validation passes, False otherwise + """ + if not records: + print(" ⚠ No records to validate") + return False + + print("\nValidating metric structure...") + + # Expected labels in Prometheus metrics + expected_labels = [ + "datname", + "waiting_user", + "waiting_appname", + "waiting_table", + "waiting_query_id", + "waiting_mode", + "waiting_locktype", + "waiting_pid", + "blocker_user", + "blocker_appname", + "blocker_table", + "blocker_query_id", + "blocker_mode", + "blocker_locktype", + "blocker_pid", + ] + + all_valid = True + unique_samples = {} + + # Group samples by their label combination + for record in records: + labels = record.get("labels", {}) + # Create a key from relevant labels + key = ( + labels.get("waiting_pid"), + labels.get("blocker_pid"), + labels.get("waiting_table"), + ) + if key not in unique_samples: + unique_samples[key] = record + + print(f" Found {len(unique_samples)} unique lock wait samples") + + for i, (key, record) in enumerate(list(unique_samples.items())[:5]): # Validate first 5 + print(f"\n Sample {i+1}:") + labels = record.get("labels", {}) + metric_name = record.get("metric", "") + value = record.get("value") + + # Check datname matches + if labels.get("datname") != self.test_dbname: + print(f" ⚠ datname mismatch: {labels.get('datname')} != {self.test_dbname}") + else: + print(f" ✓ datname matches: 
{labels.get('datname')}") + + # Check key labels are present + key_labels = ["waiting_pid", "blocker_pid", "waiting_mode", "blocker_mode"] + missing_labels = [label for label in key_labels if not labels.get(label)] + if missing_labels: + print(f" ⚠ Missing key labels: {missing_labels}") + else: + print(f" ✓ Key labels present") + + # Validate metric value + if value is not None: + try: + float(value) + print(f" ✓ Metric value is numeric: {value}") + if "waiting_ms" in metric_name or "blocker_tx_ms" in metric_name: + print(f" Value: {value} ms") + except (ValueError, TypeError): + print(f" ✗ Metric value is not numeric: {value}") + all_valid = False + else: + print(f" ⚠ Metric value is None") + + return all_valid + + def cleanup(self): + """Clean up test resources.""" + print("\nCleaning up...") + + if self.blocker_conn: + try: + self.blocker_conn.close() + except Exception: + pass + + if self.target_conn: + try: + with self.target_conn.cursor() as cur: + cur.execute("drop table if exists lock_test_table cascade") + self.target_conn.close() + except Exception: + pass + + print("✓ Cleanup complete") + + def run(self) -> bool: + """ + Run the complete test. 
+ + Returns: + True if test passes, False otherwise + """ + try: + self.setup() + self.create_lock_contention(duration_seconds=30) + records = self.verify_metric_collected() + is_valid = self.validate_metric_structure(records) + + if is_valid and records: + print("\n✅ Test PASSED: lock_waits metric is working correctly") + return True + else: + print("\n❌ Test FAILED: lock_waits metric validation failed") + return False + + except Exception as e: + print(f"\n❌ Test ERROR: {e}") + import traceback + + traceback.print_exc() + return False + finally: + self.cleanup() + + +def main(): + """Main entry point for the test.""" + import argparse + + parser = argparse.ArgumentParser( + description="Test lock_waits metric collection" + ) + parser.add_argument( + "--target-db-url", + default=os.getenv( + "TARGET_DB_URL", "postgresql://postgres:postgres@localhost:55432/target_database" + ), + help="Target database connection URL", + ) + parser.add_argument( + "--prometheus-url", + default=os.getenv( + "PROMETHEUS_URL", + "http://localhost:59090", + ), + help="Prometheus/VictoriaMetrics API URL", + ) + parser.add_argument( + "--test-dbname", + default=os.getenv("TEST_DBNAME", "target_database"), + help="Name of the database being monitored", + ) + parser.add_argument( + "--collection-wait", + type=int, + default=int(os.getenv("COLLECTION_WAIT_SECONDS", "60")), + help="Seconds to wait for pgwatch to collect metrics", + ) + + args = parser.parse_args() + + test = LockWaitsTest( + target_db_url=args.target_db_url, + prometheus_url=args.prometheus_url, + test_dbname=args.test_dbname, + collection_wait_seconds=args.collection_wait, + ) + + success = test.run() + exit(0 if success else 1) + + +if __name__ == "__main__": + main() + -- GitLab From 1ce08744c25f613d02781d2f2e999c2ddd1071db Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Tue, 2 Dec 2025 14:57:25 +0200 Subject: [PATCH 4/7] refactor(dashboard): rename waits to blocks and 
update title --- .../dashboards/Dashboard_13_Lock_waits.json | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/config/grafana/dashboards/Dashboard_13_Lock_waits.json b/config/grafana/dashboards/Dashboard_13_Lock_waits.json index ba6cd84..7b8d319 100644 --- a/config/grafana/dashboards/Dashboard_13_Lock_waits.json +++ b/config/grafana/dashboards/Dashboard_13_Lock_waits.json @@ -32,7 +32,7 @@ }, "id": 10, "panels": [], - "title": "Lock waits overview", + "title": "Lock blocks overview", "type": "row" }, { @@ -40,7 +40,7 @@ "type": "prometheus", "uid": "P7A0D6631BB10B34F" }, - "description": "Number of active lock waits over time, showing how many processes are waiting for locks.", + "description": "Number of active lock blocks over time, showing how many processes are blocked by locks.", "fieldConfig": { "defaults": { "color": { @@ -136,12 +136,12 @@ }, "editorMode": "code", "expr": "count(pgwatch_lock_waits_blocked_ms{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\"}) or vector(0)", - "legendFormat": "Lock waits", + "legendFormat": "Lock blocks", "range": true, "refId": "A" } ], - "title": "Lock waits count", + "title": "Lock blocks count", "type": "timeseries" }, { @@ -149,7 +149,7 @@ "type": "prometheus", "uid": "P7A0D6631BB10B34F" }, - "description": "Lock wait duration over time, showing how long processes have been waiting for locks.", + "description": "Lock block duration over time, showing how long processes have been blocked by locks.", "fieldConfig": { "defaults": { "color": { @@ -258,7 +258,7 @@ "refId": "B" } ], - "title": "Lock wait duration", + "title": "Lock block duration", "type": "timeseries" }, { @@ -266,7 +266,7 @@ "type": "prometheus", "uid": "P7A0D6631BB10B34F" }, - "description": "Lock waits by lock type, showing which types of locks are causing the most contention.", + "description": "Lock blocks by lock type, showing which types of locks are causing the most contention.", 
"fieldConfig": { "defaults": { "color": { @@ -356,7 +356,7 @@ "refId": "A" } ], - "title": "Lock waits by lock type", + "title": "Lock blocks by lock type", "type": "timeseries" }, { @@ -481,7 +481,7 @@ "type": "prometheus", "uid": "P7A0D6631BB10B34F" }, - "description": "Lock waits by blocked table, showing which tables have the most lock contention.", + "description": "Lock blocks by table, showing which tables have the most lock contention.", "fieldConfig": { "defaults": { "color": { @@ -571,7 +571,7 @@ "refId": "A" } ], - "title": "Lock waits by table", + "title": "Lock blocks by table", "type": "timeseries" }, { @@ -584,7 +584,7 @@ }, "id": 11, "panels": [], - "title": "Lock waits details", + "title": "Lock blocks details", "type": "row" }, { @@ -592,7 +592,7 @@ "type": "prometheus", "uid": "P7A0D6631BB10B34F" }, - "description": "Detailed lock wait information showing blocked and blocking processes with their users, applications, lock types, tables, query IDs, and wait durations.", + "description": "Detailed lock block information showing blocked and blocking processes with their users, applications, lock types, tables, query IDs, and block durations.", "fieldConfig": { "defaults": { "color": { @@ -961,7 +961,7 @@ "refId": "A" } ], - "title": "Lock waits details", + "title": "Lock blocks details", "transformations": [ { "id": "organize", @@ -1006,11 +1006,7 @@ ], "refresh": "30s", "schemaVersion": 39, - "tags": [ - "locks", - "performance", - "postgres" - ], + "tags": [], "templating": { "list": [ { @@ -1110,7 +1106,7 @@ ] }, "timezone": "", - "title": "Lock waits details", + "title": "13. 
Lock blocks details", "uid": "lock-waits-details", "version": 1, "weekStart": "" -- GitLab From 3632241875c0250ebd9533ab15b78278488d4be6 Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Tue, 2 Dec 2025 14:59:03 +0200 Subject: [PATCH 5/7] Changed pgss_queryid_queries metric in pgwatch-postgres to have no limit to enhance query analysis --- config/pgwatch-postgres/metrics.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/pgwatch-postgres/metrics.yml b/config/pgwatch-postgres/metrics.yml index fd3bc3a..9fcd417 100644 --- a/config/pgwatch-postgres/metrics.yml +++ b/config/pgwatch-postgres/metrics.yml @@ -10,7 +10,6 @@ metrics: from pg_stat_statements where queryid is not null order by total_exec_time desc - limit 1000; gauges: - '*' @@ -196,5 +195,5 @@ presets: full: description: "Full metrics for PostgreSQL storage" metrics: - pgss_queryid_queries: 300 + pgss_queryid_queries: 30 index_definitions: 3600 \ No newline at end of file -- GitLab From b02e79ff95064e8e16fe462068e7a00c880a3694 Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Tue, 2 Dec 2025 15:01:39 +0200 Subject: [PATCH 6/7] Changed the description of Lock blocks graph --- config/grafana/dashboards/Dashboard_13_Lock_waits.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/grafana/dashboards/Dashboard_13_Lock_waits.json b/config/grafana/dashboards/Dashboard_13_Lock_waits.json index 7b8d319..8ad70b8 100644 --- a/config/grafana/dashboards/Dashboard_13_Lock_waits.json +++ b/config/grafana/dashboards/Dashboard_13_Lock_waits.json @@ -40,7 +40,7 @@ "type": "prometheus", "uid": "P7A0D6631BB10B34F" }, - "description": "Number of active lock blocks over time, showing how many processes are blocked by locks.", + "description": "Number of active sessions waiting on heavyweight lock acquisition.", "fieldConfig": { "defaults": { "color": { -- GitLab From
8f795f7b0317537058ff7f6c6e12dbccdc0c06d1 Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Tue, 2 Dec 2025 15:08:10 +0200 Subject: [PATCH 7/7] feat(helm): add lock blocks dashboard to helm chart --- .../config/grafana/dashboards/Dashboard_13_Lock_waits.json | 1 + 1 file changed, 1 insertion(+) create mode 120000 postgres_ai_helm/config/grafana/dashboards/Dashboard_13_Lock_waits.json diff --git a/postgres_ai_helm/config/grafana/dashboards/Dashboard_13_Lock_waits.json b/postgres_ai_helm/config/grafana/dashboards/Dashboard_13_Lock_waits.json new file mode 120000 index 0000000..f01f2bd --- /dev/null +++ b/postgres_ai_helm/config/grafana/dashboards/Dashboard_13_Lock_waits.json @@ -0,0 +1 @@ +../../../../config/grafana/dashboards/Dashboard_13_Lock_waits.json \ No newline at end of file -- GitLab