Page MenuHomePhabricator

enwiki "enqueue" queue showed corruption
Closed, ResolvedPublic

Description

Some partitions have many unclaimed job IDs with no actual data (similar to T121568).

> require("/home/aaron/eval_job_check.php");

> foreach ( $wmfLocalServices['jobqueue_redis'] as $tag => $host ) { sanityCheckJQHost( $host, 'enqueue' ); }
rdb2001.codfw.wmnet:6379
array(8) {
  ["used_memory"]=>
  int(2266250744)
  ["used_memory_human"]=>
  string(5) "2.11G"
  ["used_memory_rss"]=>
  int(2441019392)
  ["used_memory_peak"]=>
  int(2377221376)
  ["used_memory_peak_human"]=>
  string(5) "2.21G"
  ["used_memory_lua"]=>
  int(55296)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.08"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 251215
z-claimed 77190
z-delayed 0
h-idBySha1 65657
h-sha1ById 65610
h-attempts 787181
h-data 65610
rdb2001.codfw.wmnet:6380
array(8) {
  ["used_memory"]=>
  int(1099634560)
  ["used_memory_human"]=>
  string(5) "1.02G"
  ["used_memory_rss"]=>
  int(1155829760)
  ["used_memory_peak"]=>
  int(1121073792)
  ["used_memory_peak_human"]=>
  string(5) "1.04G"
  ["used_memory_lua"]=>
  int(64512)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.05"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 0
z-claimed 0
z-delayed 0
h-idBySha1 0
h-sha1ById 0
h-attempts 0
h-data 0
rdb2001.codfw.wmnet:6381
array(8) {
  ["used_memory"]=>
  int(1103223200)
  ["used_memory_human"]=>
  string(5) "1.03G"
  ["used_memory_rss"]=>
  int(1160331264)
  ["used_memory_peak"]=>
  int(1124962784)
  ["used_memory_peak_human"]=>
  string(5) "1.05G"
  ["used_memory_lua"]=>
  int(73728)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.05"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 0
z-claimed 0
z-delayed 0
h-idBySha1 0
h-sha1ById 0
h-attempts 0
h-data 0
rdb2003.codfw.wmnet:6379
array(8) {
  ["used_memory"]=>
  int(2272936920)
  ["used_memory_human"]=>
  string(5) "2.12G"
  ["used_memory_rss"]=>
  int(2442838016)
  ["used_memory_peak"]=>
  int(2413462864)
  ["used_memory_peak_human"]=>
  string(5) "2.25G"
  ["used_memory_lua"]=>
  int(44032)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.07"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 254738
z-claimed 77573
z-delayed 0
h-idBySha1 69790
h-sha1ById 69743
h-attempts 787181
h-data 69743
rdb2003.codfw.wmnet:6380
array(8) {
  ["used_memory"]=>
  int(2274765784)
  ["used_memory_human"]=>
  string(5) "2.12G"
  ["used_memory_rss"]=>
  int(2451623936)
  ["used_memory_peak"]=>
  int(2414626192)
  ["used_memory_peak_human"]=>
  string(5) "2.25G"
  ["used_memory_lua"]=>
  int(57344)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.08"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 256081
z-claimed 77097
z-delayed 0
h-idBySha1 70163
h-sha1ById 70116
h-attempts 787181
h-data 70116
rdb2003.codfw.wmnet:6381
array(8) {
  ["used_memory"]=>
  int(2272924616)
  ["used_memory_human"]=>
  string(5) "2.12G"
  ["used_memory_rss"]=>
  int(2443112448)
  ["used_memory_peak"]=>
  int(2414601216)
  ["used_memory_peak_human"]=>
  string(5) "2.25G"
  ["used_memory_lua"]=>
  int(61440)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.07"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 255766
z-claimed 77127
z-delayed 0
h-idBySha1 70123
h-sha1ById 70076
h-attempts 787181
h-data 70076
rdb2005.codfw.wmnet:6479
array(8) {
  ["used_memory"]=>
  int(2159071760)
  ["used_memory_human"]=>
  string(5) "2.01G"
  ["used_memory_rss"]=>
  int(2237444096)
  ["used_memory_peak"]=>
  int(2203875248)
  ["used_memory_peak_human"]=>
  string(5) "2.05G"
  ["used_memory_lua"]=>
  int(52224)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.04"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 0
z-claimed 0
z-delayed 0
h-idBySha1 0
h-sha1ById 0
h-attempts 0
h-data 0
rdb2005.codfw.wmnet:6480
array(8) {
  ["used_memory"]=>
  int(1103460912)
  ["used_memory_human"]=>
  string(5) "1.03G"
  ["used_memory_rss"]=>
  int(1159933952)
  ["used_memory_peak"]=>
  int(1204699312)
  ["used_memory_peak_human"]=>
  string(5) "1.12G"
  ["used_memory_lua"]=>
  int(56320)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.05"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 0
z-claimed 0
z-delayed 0
h-idBySha1 0
h-sha1ById 0
h-attempts 0
h-data 0
rdb2005.codfw.wmnet:6481
array(8) {
  ["used_memory"]=>
  int(1103068560)
  ["used_memory_human"]=>
  string(5) "1.03G"
  ["used_memory_rss"]=>
  int(1159184384)
  ["used_memory_peak"]=>
  int(1128526448)
  ["used_memory_peak_human"]=>
  string(5) "1.05G"
  ["used_memory_lua"]=>
  int(68608)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.05"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 0
z-claimed 0
z-delayed 0
h-idBySha1 0
h-sha1ById 0
h-attempts 0
h-data 0
rdb2005.codfw.wmnet:6379
array(8) {
  ["used_memory"]=>
  int(1107099488)
  ["used_memory_human"]=>
  string(5) "1.03G"
  ["used_memory_rss"]=>
  int(1162727424)
  ["used_memory_peak"]=>
  int(1131211728)
  ["used_memory_peak_human"]=>
  string(5) "1.05G"
  ["used_memory_lua"]=>
  int(65536)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.05"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 0
z-claimed 0
z-delayed 0
h-idBySha1 0
h-sha1ById 0
h-attempts 0
h-data 0
rdb2005.codfw.wmnet:6380
array(8) {
  ["used_memory"]=>
  int(1103509392)
  ["used_memory_human"]=>
  string(5) "1.03G"
  ["used_memory_rss"]=>
  int(1165008896)
  ["used_memory_peak"]=>
  int(1124012624)
  ["used_memory_peak_human"]=>
  string(5) "1.05G"
  ["used_memory_lua"]=>
  int(55296)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.06"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 0
z-claimed 0
z-delayed 0
h-idBySha1 0
h-sha1ById 0
h-attempts 0
h-data 0
rdb2005.codfw.wmnet:6381
array(8) {
  ["used_memory"]=>
  int(1106143248)
  ["used_memory_human"]=>
  string(5) "1.03G"
  ["used_memory_rss"]=>
  int(1164288000)
  ["used_memory_peak"]=>
  int(1203146960)
  ["used_memory_peak_human"]=>
  string(5) "1.12G"
  ["used_memory_lua"]=>
  int(56320)
  ["mem_fragmentation_ratio"]=>
  string(4) "1.05"
  ["mem_allocator"]=>
  string(14) "jemalloc-3.6.0"
}
l-unclaimed 0
z-claimed 0
z-delayed 0
h-idBySha1 0
h-sha1ById 0
h-attempts 0
h-data 0

Event Timeline

aaron created this task.Apr 19 2016, 7:10 PM
Restricted Application added subscribers: TerraCodes, Aklapper. · View Herald TranscriptApr 19 2016, 7:10 PM

This makes pop() return false AFAIK, effectively de-prioritizing the queue. I'll just clear them.

Mentioned in SAL [2016-04-19T19:12:11Z] <AaronSchulz> Cleared enwiki 'enqueue' queue (T133089)

Change 284471 had a related patch set uploaded (by Aaron Schulz):
Avoid ack() race condition in JobQueueRedis

https://gerrit.wikimedia.org/r/284471

Change 284471 merged by jenkins-bot:
Avoid ack() race condition in JobQueueRedis

https://gerrit.wikimedia.org/r/284471

Change 284488 had a related patch set uploaded (by Aaron Schulz):
Avoid ack() race condition in JobQueueRedis

https://gerrit.wikimedia.org/r/284488

Change 284488 merged by jenkins-bot:
Avoid ack() race condition in JobQueueRedis

https://gerrit.wikimedia.org/r/284488

aaron added a comment.Apr 20 2016, 7:46 PM

This still leaves the question of how such fast jobs had the chance to hit this race condition...probably being recycled too soon due to some timestamp issue perhaps...

aaron closed this task as Resolved.Apr 21 2016, 9:01 PM
aaron claimed this task.