commit 169b40bff497eff780df28ba010cdf3564e62cdd
Author: zhuyifei1999 <zhuyifei1999@gmail.com>
Date:   Mon Jan 30 18:01:16 2017 +0800

    tail optimization: Use bytes instead of lines
    
    Line-based tailing requires many reads and seeks to find the
    position of the starting line, followed by a simple copy to stdout.
    The search is extremely expensive and unnecessary, since we do not
    require an accurate starting line.
    
    Byte-based tailing removes this massive overhead by doing a seek
    with SEEK_END, so it performs much better.
    
    The last 10000 lines take up 3679657 bytes (about 368 bytes per
    line). Rounding up, we budget 400 bytes per line.

----

diff --git a/precise_tools/__init__.py b/precise_tools/__init__.py
index abe09de..6af899b 100644
--- a/precise_tools/__init__.py
+++ b/precise_tools/__init__.py
@@ -38,7 +38,16 @@ def tools_from_accounting(days):
     delta = datetime.timedelta(days=days)
     cutoff = int(utils.totimestamp(datetime.datetime.now() - delta))
     tools = []
-    for line in utils.tail('/data/project/.system/accounting', 45000 * days):
+
+    # Tailing by bytes is much faster than by lines, as tail can seek();
+    # skip the first line in byte mode, as it may be only part of a line
+    firstline = True
+    for line in utils.tail('/data/project/.system/accounting',
+                           400 * 45000 * days, tailmode='bytes'):
+        if firstline:
+            firstline = False
+            continue
+
         parts = line.split(':')
         job = dict(zip(ACCOUNTING_FIELDS, parts))
         if int(job['end_time']) < cutoff:
diff --git a/precise_tools/utils.py b/precise_tools/utils.py
index 8b1ae65..b1e6804 100644
--- a/precise_tools/utils.py
+++ b/precise_tools/utils.py
@@ -21,10 +21,10 @@ import datetime
 import subprocess
 
 
-def tail(filename, lines):
+def tail(filename, lines, tailmode='lines'):
     """Get last n lines from the filename as an iterator."""
     # Inspired by http://stackoverflow.com/a/4418193/8171
-    cmd = ['tail', '-%d' % lines, filename]
+    cmd = ['tail', '--%s=%d' % (tailmode, lines), filename]
     proc = subprocess.Popen(
         cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     while True: