cassandra vs postgres benchmarker (F2631292)
Authored by akosiaris, 2015-09-24 15:09 (UTC)
Postgres table
==============
gis=# \d tiles
Table "public.tiles"
Column | Type | Modifiers
--------+--------+-----------
zoom | bigint |
block | bigint |
idx | bigint |
tile | bytea |
Indexes:
"tiles_zoom_block_idx_idx" UNIQUE, btree (zoom, block, idx)
Cassandra table
===============
cqlsh:v2> describe tiles;
CREATE TABLE v2.tiles (
    zoom int,
    block int,
    idx int,
    tile blob,
    PRIMARY KEY (zoom, block, idx)
) WITH CLUSTERING ORDER BY (block ASC, idx ASC)
    AND bloom_filter_fp_chance = 0.01
    AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'}
    AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 0
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99.0PERCENTILE';
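
In this table zoom is the partition key and block, idx are clustering columns, so an equality query on all three reads a single row from a single partition. A minimal read sketch with the Python driver (credentials and coordinates are placeholders; the benchmark below issues the same query through a prepared statement per worker):

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Placeholder credentials; the benchmark below takes the real ones as
# command line arguments.
auth_provider = PlainTextAuthProvider(username='USER', password='PASS')
cluster = Cluster(['maps-test2001.codfw.wmnet'], auth_provider=auth_provider)
session = cluster.connect('v2')
# zoom is the partition key, (block, idx) are clustering columns, so this
# restricts the read to one row of one partition.
statement = session.prepare(
    'SELECT tile FROM tiles WHERE zoom=? AND block=? AND idx=?')
rows = list(session.execute(statement, (10, 3, 123456)))
cluster.shutdown()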
The replication factor for the keyspace is 4, so with four nodes every node effectively owns 100% of the data:
nodetool status v2
Datacenter: codfw
=================
Status=Up/Down
|/ State=Normal/Leaving/Joining/Moving
--  Address       Load       Tokens  Owns (effective)  Host ID                               Rack
UN  10.192.0.128  122.99 GB  256     100.0%            e33a1800-96d9-4a6d-86d3-e1d270026d4a  rack1
UN  10.192.0.129  124.58 GB  256     100.0%            f74287ed-8286-459e-956c-e0aed02184c4  rack1
UN  10.192.16.34  110.34 GB  256     100.0%            3e5ff43e-df80-48fe-9302-2a28b1127095  rack1
UN  10.192.16.35  114.19 GB  256     100.0%            1af61427-f70e-4653-bccd-71735e5f78da  rack1
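
The keyspace's replication settings can also be checked programmatically from the driver's schema metadata (a sketch with placeholder credentials; cqlsh's DESCRIBE KEYSPACE v2 shows the same definition):

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

auth_provider = PlainTextAuthProvider(username='USER', password='PASS')
cluster = Cluster(['maps-test2001.codfw.wmnet'], auth_provider=auth_provider)
cluster.connect()
# The CREATE KEYSPACE statement, including the replication map, is part of
# the schema metadata the driver pulls on connect.
print cluster.metadata.keyspaces['v2'].export_as_string()
cluster.shutdown()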
Code
====
import psycopg2
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import random
from multiprocessing import Process, Value, cpu_count
import time
import os
import argparse
WARMUPS = 1
NUMBER_OF_QUERIES_PER_PROC = 25000
HOSTS = ['maps-test2001.codfw.wmnet',
         'maps-test2002.codfw.wmnet',
         'maps-test2003.codfw.wmnet',
         'maps-test2004.codfw.wmnet']


def benchmark_cassandra_worker(host, username, password, queries, ret):
    # Set up the cassandra session against a single node of the cluster
    auth_provider = PlainTextAuthProvider(
        username=username, password=password)
    cluster = Cluster([host], auth_provider=auth_provider)
    session = cluster.connect('v2')
    statement = session.prepare('SELECT zoom, block, idx, tile FROM tiles \
                                 WHERE zoom=? \
                                 AND block=? \
                                 AND idx=?')
    start_time = time.time()
    for query in queries:
        session.execute(statement,
                        (query['zoom'], query['block'], query['idx']))
    end_time = time.time()
    duration = end_time - start_time
    qps = len(queries) / duration
    # Report the result back to the parent through the shared Value
    ret.value = qps
    return qps


def benchmark_postgresql_worker(host, username, password, queries, ret):
    # Set up the postgresql connection
    conn = psycopg2.connect('host=%s dbname=gis user=%s password=%s' %
                            (host, username, password))
    cur = conn.cursor()
    sql = 'SELECT zoom, block, idx, tile FROM tiles \
           WHERE zoom=%(zoom)s \
           AND block=%(block)s \
           AND idx=%(idx)s'
    start_time = time.time()
    for query in queries:
        cur.execute(sql, query)
    end_time = time.time()
    cur.close()
    conn.close()
    duration = end_time - start_time
    qps = len(queries) / duration
    # Report the result back to the parent through the shared Value
    ret.value = qps
    return qps


def benchmark(func, username, password, queries, nr_procs):
    print 'Benchmarking %s' % func.func_name
    # Spawn one worker process per requested level of concurrency, spreading
    # the processes round-robin over the hosts and handing each one an
    # equally sized slice of the query list
    processes = []
    for i in range(nr_procs):
        host = HOSTS[i % len(HOSTS)]
        start_query = i * len(queries) / nr_procs
        end_query = (i + 1) * len(queries) / nr_procs
        v = Value('d', 0)
        p = Process(target=func,
                    args=(host, username, password,
                          queries[start_query:end_query], v))
        p.start()
        processes.append({'p': p, 'v': v})
    # Let's wait for everyone to finish
    for proc in processes:
        proc['p'].join()
    # Aggregate the per-process QPS figures
    qps = 0
    for proc in processes:
        qps = qps + proc['v'].value
    print 'Benchmarking %s: done' % func.func_name
    return qps


def main(args):
    for nr_procs in range(args.min_procs, args.max_procs + 1):
        nr_queries = NUMBER_OF_QUERIES_PER_PROC * nr_procs
        queries = []
        # First generate all our queries
        print 'Creating queries'
        if args.mode == 'precomputed':
            # Precomputed mode reads zoom,block,idx triples, one per line,
            # from a tiles.csv in the current directory
            with open('tiles.csv', 'r') as f:
                for i in range(nr_queries):
                    line = f.readline()
                    elems = [int(e) for e in line.split(',')]
                    # And let's unpack
                    zoom, block, idx = elems
                    query = {
                        'zoom': zoom,
                        'block': block,
                        'idx': idx,
                    }
                    queries.append(query)
        else:
            # Below are zoom levels and mins, maxes for block and idx.
            # zoom | min_block | max_block | min_idx | max_idx
            limits = (
                (0, 0, 0, 0, 1),
                (1, 0, 0, 0, 3),
                (2, 0, 0, 0, 15),
                (3, 0, 0, 0, 61),
                (4, 0, 0, 1, 247),
                (5, 0, 0, 5, 1012),
                (6, 0, 0, 21, 4081),
                (7, 0, 0, 85, 16326),
                (8, 0, 1, 341, 65312),
                (9, 0, 7, 1365, 262143),
                (10, 0, 31, 5461, 1048575),
                (11, 0, 127, 21845, 4194303),
                (12, 2, 511, 87381, 16777215),
                (13, 10, 2047, 349525, 67108863),
                (14, 42, 8191, 1398101, 268435455),
                (15, 170, 32767, 5592405, 1073741823),
                (16, 682, 65535, 22369621, 2147483647),
            )
            for i in range(nr_queries):
                # Pick a random zoom level, then a random block and idx
                # within the observed ranges for that zoom level
                z = random.choice(limits)
                query = {
                    'zoom': z[0],
                    'block': int(round(random.uniform(z[1], z[2]))) if z[1] != z[2] else 0,
                    'idx': int(round(random.uniform(z[3], z[4]))),
                }
                queries.append(query)
        print 'Queries created'
        cassandra_qps_list = []
        postgresql_qps_list = []
        # Run the exact same set of queries against each datastore multiple
        # times to statistically smooth out the effects of the page cache
        for worker in (
                (
                    benchmark_cassandra_worker,
                    args.cassandra_user,
                    args.cassandra_pass,
                    cassandra_qps_list,
                ),
                (
                    benchmark_postgresql_worker,
                    args.postgres_user,
                    args.postgres_pass,
                    postgresql_qps_list,
                )):
            for i in range(WARMUPS + 1):
                qps = benchmark(worker[0], worker[1], worker[2], queries,
                                nr_procs)
                worker[3].append(qps)
        cassandra_qps = sum(cassandra_qps_list) / len(cassandra_qps_list)
        postgresql_qps = sum(postgresql_qps_list) / len(postgresql_qps_list)
        print 'Thread Number: %s' % nr_procs
        print 'Cassandra got: %s QPS' % cassandra_qps
        print 'PostgreSQL got: %s QPS' % postgresql_qps
        with open('results.csv', 'a') as f:
            # Yes, writing CSV without the csv module. Could obviously be
            # done better, but this is too simple and cheap to really care
            f.write('%s,%s,%s\n' % (nr_procs, cassandra_qps, postgresql_qps))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Benchmark maps backends')
    parser.add_argument('mode', choices=['precomputed', 'random'])
    parser.add_argument('--min_procs', type=int, default=1)
    parser.add_argument('--max_procs', type=int, default=1)
    parser.add_argument('--cassandra_user', default='cassandra')
    parser.add_argument('--cassandra_pass', default='cassandra')
    parser.add_argument('--postgres_user')
    parser.add_argument('--postgres_pass')
    args = parser.parse_args()
    main(args)
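
For reference, an invocation matching the argparse definitions above might look like the following; the script name is an assumption, and the postgres credentials have no defaults so they must be supplied. In precomputed mode the script additionally expects a tiles.csv in the current directory with one zoom,block,idx triple per line.

python benchmark.py random --min_procs 1 --max_procs 8 \
    --cassandra_user cassandra --cassandra_pass cassandra \
    --postgres_user USER --postgres_pass PASS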