commit 169b40bff497eff780df28ba010cdf3564e62cdd Author: zhuyifei1999 Date: Mon Jan 30 18:01:16 2017 +0800 tail optimization: Use bytes instead of lines Tailing by lines requires many reads and seeks to find the position of the starting line, followed by a simple copy-to-stdout. The first step is extremely expensive and unnecessary, since we do not require an accurate starting line. Tailing by bytes removes this overhead with a single seek relative to SEEK_END, so performance is much higher. The last 10000 lines total 3679657 bytes (about 368 bytes per line); rounding up, we budget 400 bytes per line. ---- diff --git a/precise_tools/__init__.py b/precise_tools/__init__.py index abe09de..6af899b 100644 --- a/precise_tools/__init__.py +++ b/precise_tools/__init__.py @@ -38,7 +38,16 @@ def tools_from_accounting(days): delta = datetime.timedelta(days=days) cutoff = int(utils.totimestamp(datetime.datetime.now() - delta)) tools = [] - for line in utils.tail('/data/project/.system/accounting', 45000 * days): + + # Bytes are much much more faster to tail, as it can be seek()-ed; Ignore + # first line when tailing with bytes, as it could be only part of a line + firstline = True + for line in utils.tail('/data/project/.system/accounting', + 400 * 45000 * days, tailmode='bytes'): + if firstline: + firstline = False + continue + parts = line.split(':') job = dict(zip(ACCOUNTING_FIELDS, parts)) if int(job['end_time']) < cutoff: diff --git a/precise_tools/utils.py b/precise_tools/utils.py index 8b1ae65..b1e6804 100644 --- a/precise_tools/utils.py +++ b/precise_tools/utils.py @@ -21,10 +21,10 @@ import datetime import subprocess -def tail(filename, lines): +def tail(filename, lines, tailmode='lines'): """Get last n lines from the filename as an iterator.""" # Inspired by http://stackoverflow.com/a/4418193/8171 - cmd = ['tail', '-%d' % lines, filename] + cmd = ['tail', '--%s=%d' % (tailmode, lines), filename] proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) while True: