Paste P1995

cassandra vs postgres benchmarker

Authored by akosiaris on Sep 8 2015, 11:41 AM.
Postgres table
==============
gis=# \d tiles
     Table "public.tiles"
 Column |  Type  | Modifiers
--------+--------+-----------
 zoom   | bigint |
 block  | bigint |
 idx    | bigint |
 tile   | bytea  |
Indexes:
    "tiles_zoom_block_idx_idx" UNIQUE, btree (zoom, block, idx)
Cassandra table
===============
cqlsh:v2> describe tiles;
CREATE TABLE v2.tiles (
    zoom int,
    block int,
    idx int,
    tile blob,
    PRIMARY KEY (zoom, block, idx)
) WITH CLUSTERING ORDER BY (block ASC, idx ASC)
    AND bloom_filter_fp_chance = 0.01
    AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'}
    AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 0
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99.0PERCENTILE';
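In this schema zoom is the partition key and (block, idx) are clustering columns, so each benchmark query addresses exactly one row within one partition. For comparison with the psycopg2 snippet above, a minimal sketch of the same lookup with the DataStax Python driver (node, credentials and key values are again placeholders):

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
cluster = Cluster(['maps-test2001.codfw.wmnet'], auth_provider=auth_provider)
session = cluster.connect('v2')
# Point lookup on the full primary key: partition key zoom,
# clustering columns block and idx
rows = session.execute(
    'SELECT tile FROM tiles WHERE zoom=%s AND block=%s AND idx=%s',
    (10, 0, 5461))
cluster.shutdown()
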
The replication factor for the keyspace is 4, so with 4 nodes every one of them holds a full copy of the data (4/4 = 100% effective ownership):
nodetool status v2
Datacenter: codfw
=================
Status=Up/Down
|/ State=Normal/Leaving/Joining/Moving
--  Address       Load       Tokens  Owns (effective)  Host ID                               Rack
UN  10.192.0.128  122.99 GB  256     100.0%            e33a1800-96d9-4a6d-86d3-e1d270026d4a  rack1
UN  10.192.0.129  124.58 GB  256     100.0%            f74287ed-8286-459e-956c-e0aed02184c4  rack1
UN  10.192.16.34  110.34 GB  256     100.0%            3e5ff43e-df80-48fe-9302-2a28b1127095  rack1
UN  10.192.16.35  114.19 GB  256     100.0%            1af61427-f70e-4653-bccd-71735e5f78da  rack1
Code
====
import psycopg2
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import random
from multiprocessing import Process, Value
import time
import argparse

WARMUPS = 1
NUMBER_OF_QUERIES_PER_PROC = 25000
HOSTS = ['maps-test2001.codfw.wmnet',
         'maps-test2002.codfw.wmnet',
         'maps-test2003.codfw.wmnet',
         'maps-test2004.codfw.wmnet']
def benchmark_cassandra_worker(host, username, password, queries, ret):
    # Setup cassandra. The statement is prepared once, outside the timed
    # loop, so only query execution is measured
    auth_provider = PlainTextAuthProvider(
        username=username, password=password)
    cluster = Cluster([host], auth_provider=auth_provider)
    session = cluster.connect('v2')
    statement = session.prepare('SELECT zoom, block, idx, tile FROM tiles \
                                 WHERE zoom=? \
                                 AND block=? \
                                 AND idx=?')
    start_time = time.time()
    for query in queries:
        session.execute(statement,
                        (query['zoom'], query['block'], query['idx']))
    end_time = time.time()
    duration = end_time - start_time
    # Report queries per second back to the parent via the shared Value
    qps = len(queries) / duration
    ret.value = qps
    return qps
def benchmark_postgresql_worker(host, username, password, queries, ret):
    # Setup postgresql; one connection and cursor is reused for all queries
    conn = psycopg2.connect('host=%s dbname=gis user=%s password=%s' %
                            (host, username, password))
    cur = conn.cursor()
    sql = 'SELECT zoom, block, idx, tile FROM tiles \
           WHERE zoom=%(zoom)s \
           AND block=%(block)s \
           AND idx=%(idx)s'
    start_time = time.time()
    for query in queries:
        cur.execute(sql, query)
    end_time = time.time()
    cur.close()
    conn.close()
    duration = end_time - start_time
    # Report queries per second back to the parent via the shared Value
    qps = len(queries) / duration
    ret.value = qps
    return qps
def benchmark(func, username, password, queries, nr_procs):
    print 'Benchmarking %s' % func.func_name
    # Split the queries evenly across nr_procs worker processes,
    # round-robining the workers over the available hosts
    processes = []
    for i in range(nr_procs):
        host = HOSTS[i % len(HOSTS)]
        start_query = i * len(queries) / nr_procs
        end_query = (i + 1) * len(queries) / nr_procs
        v = Value('d', 0)
        p = Process(target=func,
                    args=(host, username, password,
                          queries[start_query:end_query], v))
        p.start()
        processes.append({'p': p, 'v': v})
    # Let's wait for everyone to finish
    for proc in processes:
        proc['p'].join()
    # Aggregate throughput is the sum of the per-process QPS values
    qps = 0
    for proc in processes:
        qps = qps + proc['v'].value
    print 'Benchmarking %s: done' % func.func_name
    return qps
def main(args):
    for nr_procs in range(args.min_procs, args.max_procs + 1):
        nr_queries = NUMBER_OF_QUERIES_PER_PROC * nr_procs
        queries = []
        # First generate all our queries
        print 'Creating queries'
        if args.mode == 'precomputed':
            # Read (zoom, block, idx) triplets of real tiles, one per line
            with open('tiles.csv', 'r') as f:
                for i in range(nr_queries):
                    line = f.readline()
                    elems = [int(e) for e in line.split(',')]
                    # And let's unpack
                    zoom, block, idx = elems
                    query = {
                        'zoom': zoom,
                        'block': block,
                        'idx': idx,
                    }
                    queries.append(query)
        else:
            # Below are the zoom levels and the observed mins/maxes for
            # block and idx at each level.
            # zoom | min_block | max_block | min_idx | max_idx
            limits = (
                (0, 0, 0, 0, 1),
                (1, 0, 0, 0, 3),
                (2, 0, 0, 0, 15),
                (3, 0, 0, 0, 61),
                (4, 0, 0, 1, 247),
                (5, 0, 0, 5, 1012),
                (6, 0, 0, 21, 4081),
                (7, 0, 0, 85, 16326),
                (8, 0, 1, 341, 65312),
                (9, 0, 7, 1365, 262143),
                (10, 0, 31, 5461, 1048575),
                (11, 0, 127, 21845, 4194303),
                (12, 2, 511, 87381, 16777215),
                (13, 10, 2047, 349525, 67108863),
                (14, 42, 8191, 1398101, 268435455),
                (15, 170, 32767, 5592405, 1073741823),
                (16, 682, 65535, 22369621, 2147483647),
            )
            for i in range(nr_queries):
                z = random.choice(limits)
                query = {
                    'zoom': z[0],
                    'block': int(round(random.uniform(z[1], z[2]))) if z[1] != z[2] else 0,
                    'idx': int(round(random.uniform(z[3], z[4]))),
                }
                queries.append(query)
        print 'Queries created'
        cassandra_qps_list = []
        postgresql_qps_list = []
        # Repeat the exact same test multiple times per datastore to
        # statistically remove the effects of the page cache
        for worker in (
                (
                    benchmark_cassandra_worker,
                    args.cassandra_user,
                    args.cassandra_pass,
                    cassandra_qps_list,
                ),
                (
                    benchmark_postgresql_worker,
                    args.postgres_user,
                    args.postgres_pass,
                    postgresql_qps_list,
                )):
            for i in range(WARMUPS + 1):
                qps = benchmark(worker[0], worker[1], worker[2], queries,
                                nr_procs)
                # Discard the warmup runs; they only prime the caches
                if i >= WARMUPS:
                    worker[3].append(qps)
        cassandra_qps = sum(cassandra_qps_list) / len(cassandra_qps_list)
        postgresql_qps = sum(postgresql_qps_list) / len(postgresql_qps_list)
        print 'Thread Number: %s' % nr_procs
        print 'Cassandra got: %s QPS' % cassandra_qps
        print 'PostgreSQL got: %s QPS' % postgresql_qps
        # Yes, writing CSV without the csv module. The format is simple
        # enough that hand-rolling it is fine here
        with open('results.csv', 'a') as f:
            f.write('%s,%s,%s\n' % (nr_procs, cassandra_qps, postgresql_qps))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Benchmark maps backends')
    parser.add_argument('mode', choices=['precomputed', 'random'])
    parser.add_argument('--min_procs', type=int, default=1)
    parser.add_argument('--max_procs', type=int, default=1)
    parser.add_argument('--cassandra_user', default='cassandra')
    parser.add_argument('--cassandra_pass', default='cassandra')
    parser.add_argument('--postgres_user')
    parser.add_argument('--postgres_pass')
    args = parser.parse_args()
    main(args)
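
For reference, a sweep from 1 to 8 worker processes with randomly generated queries would be kicked off roughly like this (the file name benchmark.py and the PostgreSQL credentials are placeholders):

python benchmark.py random --min_procs 1 --max_procs 8 \
    --postgres_user USER --postgres_pass PASS

Each iteration appends one nr_procs,cassandra_qps,postgresql_qps line to results.csv.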