Page Menu | Home | Phabricator
Paste P12561

puppet-merge
Active, Public

Authored by jcrespo on Thu, Sep 10, 2:23 PM.
root@puppetmaster1001:~$ puppet-merge
Fetching new commits from: https://gerrit.wikimedia.org/r/labs/private
No changes to merge.
Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet
diff --git a/hieradata/hosts/db2141.yaml b/hieradata/hosts/db2141.yaml
new file mode 100644
index 0000000000..f3eed881d9
--- /dev/null
+++ b/hieradata/hosts/db2141.yaml
@@ -0,0 +1,6 @@
+# db2141
+# Buffer pool sizes/instance enabled
+profile::mariadb::dbstore_multiinstance::num_instances: 2
+profile::mariadb::dbstore_multiinstance::s1: '192G'
+profile::mariadb::dbstore_multiinstance::s6: '192G'
+profile::base::notifications: disabled
diff --git a/manifests/site.pp b/manifests/site.pp
index 2112a04c4d..71c593b362 100644
--- a/manifests/site.pp
+++ b/manifests/site.pp
@@ -472,11 +472,6 @@ node 'db2140.codfw.wmnet' {
role(mariadb::core)
}
-# codfw MySQL source backups expansion T260819
-node 'db2141.codfw.wmnet' {
- role(insetup)
-}
-
# codfw replicas
# See also db2137 and db2138 below
node /^db2(073|106|110|119|136)\.codfw\.wmnet/ {
@@ -726,25 +721,35 @@ node 'db1145.eqiad.wmnet' {
# codfw backup sources
-
+## s1 & s6, stretch
node 'db2097.codfw.wmnet' {
role(mariadb::dbstore_multiinstance)
}
+## s2 & s3, stretch
node 'db2098.codfw.wmnet' {
role(mariadb::dbstore_multiinstance)
}
+## s4 & s5, stretch
node 'db2099.codfw.wmnet' {
role(mariadb::dbstore_multiinstance)
}
+## s7 & s8, stretch
node 'db2100.codfw.wmnet' {
role(mariadb::dbstore_multiinstance)
}
+## x1, buster
node 'db2101.codfw.wmnet' {
role(mariadb::dbstore_multiinstance)
}
+## s4 & s5, buster
node 'db2139.codfw.wmnet' {
role(mariadb::dbstore_multiinstance)
}
+## s1 & s6, buster
+node 'db2141.codfw.wmnet' {
+ role(mariadb::dbstore_multiinstance)
+}
+
# backup testing hosts
node 'db1133.eqiad.wmnet' {
diff --git a/modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py b/modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py
index f4ed730bd4..8a2912b564 100644
--- a/modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py
+++ b/modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py
@@ -15,7 +15,7 @@ log = logging.getLogger(__name__)
def collect_stats_from_romc_smi(registry, rocm_smi_path):
out = subprocess.run([
rocm_smi_path, "--showuse", "--showpower",
- "--showtemp", "--showfan", "--json"
+ "--showtemp", "--showfan", "--showmeminfo", "all", "--json"
], capture_output=True, text=True)
rocm_metrics = {}
for line in out.stdout.splitlines():
@@ -47,6 +47,14 @@ def collect_stats_from_romc_smi(registry, rocm_smi_path):
namespace='amd_rocm_gpu', registry=registry
)
+ gpu_stats['memory_total'] = Gauge(
+ 'memory_total_bytes', 'Total GPU memory (bytes)', ['card', 'memtype'],
+ namespace='amd_rocm_gpu', registry=registry
+ )
+ gpu_stats['memory_used'] = Gauge(
+ 'memory_used_bytes', 'Used GPU memory (bytes)', ['card', 'memtype'],
+ namespace='amd_rocm_gpu', registry=registry
+ )
for card in rocm_metrics:
for metric in rocm_metrics[card]:
# General usage
@@ -86,6 +94,30 @@ def collect_stats_from_romc_smi(registry, rocm_smi_path):
elif metric == 'Fan Speed (level)':
# we care only about the percentage value
continue
+
+ # Memory
+ # Total memory amounts, for percentage calculation with used memory
+ elif metric == 'vram Total Memory (B)':
+ gpu_stats['memory_total'].labels(card=card, memtype='vram').set(
+ rocm_metrics[card][metric].strip())
+ elif metric == 'gtt Total Memory (B)':
+ gpu_stats['memory_total'].labels(card=card, memtype='gtt').set(
+ rocm_metrics[card][metric].strip())
+ elif metric == 'vis_vram Total Memory (B)':
+ gpu_stats['memory_total'].labels(card=card, memtype='vis').set(
+ rocm_metrics[card][metric].strip())
+ # Used memory amounts
+ elif metric == 'vram Total Used Memory (B)':
+ gpu_stats['memory_used'].labels(card=card, memtype='vram').set(
+ rocm_metrics[card][metric].strip())
+ elif metric == 'gtt Total Used Memory (B)':
+ gpu_stats['memory_used'].labels(card=card, memtype='gtt').set(
+ rocm_metrics[card][metric].strip())
+ elif metric == 'vis_vram Total Used Memory (B)':
+ gpu_stats['memory_used'].labels(card=card, memtype='vis').set(
+ rocm_metrics[card][metric].strip())
+
+ # Unknown stuff should emit a warning (to be delivered by cron mail)
else:
log.warning(
"Metric {} listed in rocm-smi's JSON but not parsed"
Jcrespo: mariadb-backups: Add db2141 to the dbstore role for backup source (3330487795)
Tobias Klausmann: prometheus: Add more stats to AMD ROCm GPU exporter (3c649a46c5)
WARNING: Revision range includes commits from multiple committers!
Merge these changes? (multiple/no)? multiple
HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1
Updating f9fb51703b..3330487795
Fast-forward
hieradata/hosts/db2141.yaml | 6 ++++++
manifests/site.pp | 17 +++++++++++------
modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++-
3 files changed, 50 insertions(+), 7 deletions(-)
create mode 100644 hieradata/hosts/db2141.yaml
Running git clean to clean any untracked files.
All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097
No LABS changes to merge
===> Starting run on puppetmaster1002.eqiad.wmnet...
Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet
HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1
Updating f9fb51703b..3330487795
Fast-forward
hieradata/hosts/db2141.yaml | 6 ++++++
manifests/site.pp | 17 +++++++++++------
modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++-
3 files changed, 50 insertions(+), 7 deletions(-)
create mode 100644 hieradata/hosts/db2141.yaml
Running git clean to clean any untracked files.
All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097
Connection to puppetmaster1002.eqiad.wmnet closed.
OK: puppet-merge on puppetmaster1002.eqiad.wmnet (ops) succeeded
===> Starting run on puppetmaster1003.eqiad.wmnet...
Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet
HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1
Updating f9fb51703b..3330487795
Fast-forward
hieradata/hosts/db2141.yaml | 6 ++++++
manifests/site.pp | 17 +++++++++++------
modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++-
3 files changed, 50 insertions(+), 7 deletions(-)
create mode 100644 hieradata/hosts/db2141.yaml
Running git clean to clean any untracked files.
All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097
Connection to puppetmaster1003.eqiad.wmnet closed.
OK: puppet-merge on puppetmaster1003.eqiad.wmnet (ops) succeeded
===> Starting run on puppetmaster2001.codfw.wmnet...
Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet
HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1
Updating f9fb51703b..3330487795
Fast-forward
hieradata/hosts/db2141.yaml | 6 ++++++
manifests/site.pp | 17 +++++++++++------
modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++-
3 files changed, 50 insertions(+), 7 deletions(-)
create mode 100644 hieradata/hosts/db2141.yaml
Running git clean to clean any untracked files.
All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097
Connection to puppetmaster2001.codfw.wmnet closed.
OK: puppet-merge on puppetmaster2001.codfw.wmnet (ops) succeeded
===> Starting run on puppetmaster2002.codfw.wmnet...
Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet
HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1
Updating f9fb51703b..3330487795
Fast-forward
hieradata/hosts/db2141.yaml | 6 ++++++
manifests/site.pp | 17 +++++++++++------
modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++-
3 files changed, 50 insertions(+), 7 deletions(-)
create mode 100644 hieradata/hosts/db2141.yaml
Running git clean to clean any untracked files.
All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097
Connection to puppetmaster2002.codfw.wmnet closed.
OK: puppet-merge on puppetmaster2002.codfw.wmnet (ops) succeeded
===> Starting run on puppetmaster2003.codfw.wmnet...
Fetching new commits from: https://gerrit.wikimedia.org/r/operations/puppet
HEAD is currently f9fb51703b55a1f1a64ca5c97576607d2bc048b1
Updating f9fb51703b..3330487795
Fast-forward
hieradata/hosts/db2141.yaml | 6 ++++++
manifests/site.pp | 17 +++++++++++------
modules/prometheus/files/usr/local/bin/prometheus-amd-rocm-stats.py | 34 +++++++++++++++++++++++++++++++++-
3 files changed, 50 insertions(+), 7 deletions(-)
create mode 100644 hieradata/hosts/db2141.yaml
Running git clean to clean any untracked files.
All done! HEAD is now 33304877950b8c60c8f161e57af076ee5e3f9097
Connection to puppetmaster2003.codfw.wmnet closed.
OK: puppet-merge on puppetmaster2003.codfw.wmnet (ops) succeeded
Now running conftool-merge to sync any changes to conftool data
Running conftool-sync on /etc/conftool/data
2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity node from /etc/conftool/data
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/codfw.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/eqsin.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/eqiad.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/esams.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/node/ulsfo.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity discovery from /etc/conftool/data
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/discovery/services.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/discovery/mediawiki.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity mwconfig from /etc/conftool/data
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/mwconfig/data.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity dbconfig-instance from /etc/conftool/data
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/dbconfig-instance/instances.yaml
2020-09-10 14:21:07 [INFO] conftool::load_files: Loading data for entity dbconfig-section from /etc/conftool/data
2020-09-10 14:21:07 [INFO] conftool::load_files: Parsing file /etc/conftool/data/dbconfig-section/sections.yaml
2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for node
2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for discovery
2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for mwconfig
2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for dbconfig-instance
2020-09-10 14:21:07 [INFO] conftool::load: Adding objects for dbconfig-section
2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for dbconfig-section
2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for dbconfig-instance
2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for mwconfig
2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for discovery
2020-09-10 14:21:07 [INFO] conftool::load: Removing stale objects for node
Done.

Event Timeline

jcrespo created this paste. Thu, Sep 10, 2:23 PM