i’ll skip part with docker installation and put here only config files
[root@prod-oracle-rac grafana]# tree
.
├── 1.sh
├── alertmanager
│ ├── alert.tmpl
│ ├── config.yml
│ └── data
│ ├── nflog
│ └── silences
├── docker-compose.yaml
├── grafana
│ ├── alerting
│ │ └── 1
│ │ └── default.tmpl
│ ├── csv
│ ├── file-collections
│ ├── grafana.db
│ ├── plugins
│ ├── png
│ └── provisioning
├── oracledbexporter
│ └── custom-metrics.toml
└── prometheus
├── alert_rules.yml
└── prometheus.yml
cat docker-compose.yaml
don’t forget to change username\userpass in tns connect string
version: '3.3'
services:
prometheus:
image: prom/prometheus:latest
volumes:
- ./prometheus/:/etc/prometheus/
container_name: prometheus
hostname: prometheus
command: ["--config.file=/etc/prometheus/prometheus.yml", "--log.level=debug"]
#command:
# - --config.file=/etc/prometheus/prometheus.yml
# - --LOG_LEVEL=debug
links:
- alertmanager:alertmanager
ports:
- 9090:9090
restart: unless-stopped
environment:
TZ: "Europe/Moscow"
networks:
- default
depends_on:
- alertmanager
node-exporter:
image: prom/node-exporter
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
container_name: exporter
hostname: exporter
command:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --collector.filesystem.ignored-mount-points
- ^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)
ports:
- 9100:9100
restart: unless-stopped
environment:
TZ: "Europe/Moscow"
networks:
- default
grafana:
image: grafana/grafana
restart: always
user: root
depends_on:
- prometheus
ports:
- 3000:3000
volumes:
- ./grafana:/var/lib/grafana
- ./grafana/provisioning/:/etc/grafana/provisioning/
container_name: grafana
hostname: grafana
restart: unless-stopped
environment:
TZ: "Europe/Moscow"
networks:
- default
alertmanager:
image: prom/alertmanager:latest
user: root
ports:
- 127.0.0.1:9093:9093
volumes:
- ./alertmanager/:/etc/alertmanager/
container_name: alertmanager
hostname: alertmanager
environment:
TZ: "Europe/Moscow"
restart: unless-stopped
command:
- '--config.file=/etc/alertmanager/config.yml'
- '--storage.path=/etc/alertmanager/data'
networks:
- default
oracledbexporter:
image: iamseth/oracledb_exporter
volumes:
- ./oracledbexporter:/etc/oracledb_exporter
environment:
- 'NLS_LANG=AMERICAN_AMERICA.WE8ISO8859P1'
- 'DATA_SOURCE_NAME=username/userpass@10.100.18.11:1521/a49prm'
- 'CUSTOM_METRICS=/etc/oracledb_exporter/custom-metrics.toml'
restart: unless-stopped
expose:
- 9161
network_mode: host
networks:
default:
ipam:
driver: default
config:
- subnet: 172.28.0.0/16
alertmanager config.yaml
# Отсюда читаем все шаблоны:
templates:
- '/etc/alertmanager/template/*.tmpl'
route:
# Группировка алертов
group_by: ['alertname', 'cluster', 'service']
# время ожидания перед отправкой уведомления для группы
group_wait: 30s
# время отправки повторного сообщения для группы
group_interval: 5m
# время до отправки повторного сообщения
repeat_interval: 30m
receiver: 'telega'
receivers:
- name: 'telega'
telegram_configs:
- bot_token: 'xxxx:xxxxxxxxxxx'
chat_id: -xxxxxxxxxxxxxx
api_url: 'https://api.telegram.org'
message: "Alertname: {{ .GroupLabels.alertname }}\n Severity: {{ .CommonLabels.severity }}\n {{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
parse_mode: ''
send_resolved: true
prometheus prometheus.yml
scrape_configs:
- job_name: oracle_rac
scrape_interval: 5s
static_configs:
- targets: ['node-exporter:9100']
- job_name: oracle_db12
scrape_interval: 5s
static_configs:
- targets: ['10.100.18.12:9100']
- job_name: oracle_db11
scrape_interval: 5s
static_configs:
- targets: ['10.100.18.11:9100']
- job_name: 'oracle-exporter'
scrape_interval: 10s
scrape_timeout: 8s
static_configs:
- targets: ['10.100.18.10:9161']
rule_files:
- alert_rules.yml
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager:9093"]
prometheus alert_rules.yml
groups:
- name: nodeexporter
rules:
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 400
for: 10m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (>400 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 400
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (>400 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.01 and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 10ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.01 and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 10ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 0m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: (node_bonding_active - node_bonding_slaves) != 0
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: node_reboot_required > 0
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RmanBackupFailed
expr: oracledb_rman_backup_status_value > 3
for: 2m
labels:
severity: critical
annotations:
summary: oracle db backup failed (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has encountered error during backup \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: instance_down
expr: oracledb_up{instance="10.100.18.10:9161"} <1
for: 0m
labels:
severity: critical
annotations:
summary: oracle db Down failed (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} unreacheable \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
oracledbexporter custom-metrics.toml
[[metric]]
context = "system"
request = "select count(*) as session_count from v$session where username is not null and type = 'USER' and con_id = sys_context('userenv','con_id')"
metricsdesc = { session_count = "Current session count." }
[[metric]]
context = "system"
request = "select count(*) as active_sessions from v$session where username is not null and type = 'USER' and status = 'ACTIVE' and con_id = sys_context('userenv','con_id')"
metricsdesc = { active_sessions = "Active sessions." }
[[metric]]
context = "system"
request = "select (c.session_count - a.active_sessions) as inactive_sessions from (select count(*) as session_count from v$session where username is not null and type = 'USER' and con_id = sys_context('userenv','con_id')) c, (select count(*) as active_sessions from v$session where username is not null and type = 'USER' and status = 'ACTIVE' and con_id = sys_context('userenv','con_id')) a"
metricsdesc = { inactive_sessions = "Inactive sessions." }
[[metric]]
context = "system"
request = "select b.session_count as blocked_sessions from (select count(*) as session_count from v$session where username is not null and type = 'USER' and blocking_session_status = 'VALID' and con_id = sys_context('userenv','con_id')) b"
metricsdesc = { blocked_sessions = "Blocked sessions." }
[[metric]]
context = "rman_backup_status"
labels = [ "start_time", "input_type" ]
metricsdesc = { value="Gauge metric with rman backup status (5:FAILED; 4:RUNNING WITH ERRORS; 3:COMPLETED WITH ERRORS; 2:RUNNING WITH WARNINGS; 1:COMPLETED WITH WARNINGS; 0:COMPLETED)." }
request = '''
SELECT to_char(start_time, 'yyyy-mm-dd hh24:mi:ss') as start_time
, input_type
, cast( decode(status, 'FAILED', 5, 'RUNNING WITH ERRORS', 4, 'COMPLETED WITH ERRORS', 3, 'RUNNING WITH WARNINGS', 2, 'COMPLETED WITH WARNINGS', 1, 0) as integer) as value
FROM v$rman_backup_job_details
WHERE start_time = (SELECT max(start_time) FROM v$rman_backup_job_details)
'''
on target oracle host
wget https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz
724 28/03/23 08:15:16 ll /usr/local/bin/node_exporter
725 28/03/23 08:15:23 tar -xzvf node_exporter-1.5.0.linux-amd64.tar.gz
726 28/03/23 08:15:33 mv node_exporter-1.5.0.linux-amd64/node_exporter /usr/local/bin/
[root@prod-db1 ~]# cat /etc/systemd/system/node_exporter.service
[Unit]
Description=Node Exporter
[Service]
User=node_exporter
EnvironmentFile=/etc/sysconfig/node_exporter
ExecStart=/usr/local/bin/node_exporter $OPTIONS
[Install]
WantedBy=multi-user.target
727 28/03/23 08:15:44 systemctl status node_exporter
728 28/03/23 08:15:48 systemctl start node_exporter
729 28/03/23 08:15:50 systemctl status node_exporter
result
Usefull links