cassandra vs postgres benchmarker (F2631292)
Authored by akosiaris, 2015-09-24 15:09 (UTC)
Postgres table
==============
gis=# \d tiles
Table "public.tiles"
Column | Type | Modifiers
--------+--------+-----------
zoom | bigint |
block | bigint |
idx | bigint |
tile | bytea |
Indexes:
"tiles_zoom_block_idx_idx" UNIQUE, btree (zoom, block, idx)
Cassandra table
===============
cqlsh:v2> describe tiles;
CREATE TABLE v2.tiles (
    zoom int,
    block int,
    idx int,
    tile blob,
    PRIMARY KEY (zoom, block, idx)
) WITH CLUSTERING ORDER BY (block ASC, idx ASC)
    AND bloom_filter_fp_chance = 0.01
    AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'}
    AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 0
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99.0PERCENTILE';
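
In this table zoom is the partition key and block, idx are clustering columns, so an equality query on all three reads a single row from a single partition. A minimal read sketch with the Python driver (credentials and coordinates are placeholders; the benchmark below issues the same query through a prepared statement per worker):

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Placeholder credentials; the benchmark below takes the real ones as
# command line arguments.
auth_provider = PlainTextAuthProvider(username='USER', password='PASS')
cluster = Cluster(['maps-test2001.codfw.wmnet'], auth_provider=auth_provider)
session = cluster.connect('v2')
# zoom is the partition key, (block, idx) are clustering columns, so this
# restricts the read to one row of one partition.
statement = session.prepare(
    'SELECT tile FROM tiles WHERE zoom=? AND block=? AND idx=?')
rows = list(session.execute(statement, (10, 3, 123456)))
cluster.shutdown()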
The replication factor for the keyspace is 4, so with four nodes every node effectively owns 100% of the data:
nodetool status v2
Datacenter: codfw
=================
Status=Up/Down
|/ State=Normal/Leaving/Joining/Moving
--  Address       Load       Tokens  Owns (effective)  Host ID                               Rack
UN  10.192.0.128  122.99 GB  256     100.0%            e33a1800-96d9-4a6d-86d3-e1d270026d4a  rack1
UN  10.192.0.129  124.58 GB  256     100.0%            f74287ed-8286-459e-956c-e0aed02184c4  rack1
UN  10.192.16.34  110.34 GB  256     100.0%            3e5ff43e-df80-48fe-9302-2a28b1127095  rack1
UN  10.192.16.35  114.19 GB  256     100.0%            1af61427-f70e-4653-bccd-71735e5f78da  rack1
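
The keyspace's replication settings can also be checked programmatically from the driver's schema metadata (a sketch with placeholder credentials; cqlsh's DESCRIBE KEYSPACE v2 shows the same definition):

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

auth_provider = PlainTextAuthProvider(username='USER', password='PASS')
cluster = Cluster(['maps-test2001.codfw.wmnet'], auth_provider=auth_provider)
cluster.connect()
# The CREATE KEYSPACE statement, including the replication map, is part of
# the schema metadata the driver pulls on connect.
print cluster.metadata.keyspaces['v2'].export_as_string()
cluster.shutdown()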
Code
====
import psycopg2
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import random
from multiprocessing import Process, Value, cpu_count
import time
import os
import argparse
WARMUPS = 1
NUMBER_OF_QUERIES_PER_PROC = 25000
HOSTS = ['maps-test2001.codfw.wmnet',
         'maps-test2002.codfw.wmnet',
         'maps-test2003.codfw.wmnet',
         'maps-test2004.codfw.wmnet']


def benchmark_cassandra_worker(host, username, password, queries, ret):
    # Set up the cassandra session against a single node of the cluster
    auth_provider = PlainTextAuthProvider(
        username=username, password=password)
    cluster = Cluster([host], auth_provider=auth_provider)
    session = cluster.connect('v2')
    statement = session.prepare('SELECT zoom, block, idx, tile FROM tiles \
                                 WHERE zoom=? \
                                 AND block=? \
                                 AND idx=?')
    start_time = time.time()
    for query in queries:
        session.execute(statement,
                        (query['zoom'], query['block'], query['idx']))
    end_time = time.time()
    duration = end_time - start_time
    qps = len(queries) / duration
    # Report the result back to the parent through the shared Value
    ret.value = qps
    return qps


def benchmark_postgresql_worker(host, username, password, queries, ret):
    # Set up the postgresql connection
    conn = psycopg2.connect('host=%s dbname=gis user=%s password=%s' %
                            (host, username, password))
    cur = conn.cursor()
    sql = 'SELECT zoom, block, idx, tile FROM tiles \
           WHERE zoom=%(zoom)s \
           AND block=%(block)s \
           AND idx=%(idx)s'
    start_time = time.time()
    for query in queries:
        cur.execute(sql, query)
    end_time = time.time()
    cur.close()
    conn.close()
    duration = end_time - start_time
    qps = len(queries) / duration
    # Report the result back to the parent through the shared Value
    ret.value = qps
    return qps


def benchmark(func, username, password, queries, nr_procs):
    print 'Benchmarking %s' % func.func_name
    # Spawn one worker process per requested level of concurrency, spreading
    # the processes round-robin over the hosts and handing each one an
    # equally sized slice of the query list
    processes = []
    for i in range(nr_procs):
        host = HOSTS[i % len(HOSTS)]
        start_query = i * len(queries) / nr_procs
        end_query = (i + 1) * len(queries) / nr_procs
        v = Value('d', 0)
        p = Process(target=func,
                    args=(host, username, password,
                          queries[start_query:end_query], v))
        p.start()
        processes.append({'p': p, 'v': v})
    # Let's wait for everyone to finish
    for proc in processes:
        proc['p'].join()
    # Aggregate the per-process QPS figures
    qps = 0
    for proc in processes:
        qps = qps + proc['v'].value
    print 'Benchmarking %s: done' % func.func_name
    return qps


def main(args):
    for nr_procs in range(args.min_procs, args.max_procs + 1):
        nr_queries = NUMBER_OF_QUERIES_PER_PROC * nr_procs
        queries = []
        # First generate all our queries
        print 'Creating queries'
        if args.mode == 'precomputed':
            # Precomputed mode reads zoom,block,idx triples, one per line,
            # from a tiles.csv in the current directory
            with open('tiles.csv', 'r') as f:
                for i in range(nr_queries):
                    line = f.readline()
                    elems = [int(e) for e in line.split(',')]
                    # And let's unpack
                    zoom, block, idx = elems
                    query = {
                        'zoom': zoom,
                        'block': block,
                        'idx': idx,
                    }
                    queries.append(query)
        else:
            # Below are zoom levels and mins, maxes for block and idx.
            # zoom | min_block | max_block | min_idx | max_idx
            limits = (
                (0, 0, 0, 0, 1),
                (1, 0, 0, 0, 3),
                (2, 0, 0, 0, 15),
                (3, 0, 0, 0, 61),
                (4, 0, 0, 1, 247),
                (5, 0, 0, 5, 1012),
                (6, 0, 0, 21, 4081),
                (7, 0, 0, 85, 16326),
                (8, 0, 1, 341, 65312),
                (9, 0, 7, 1365, 262143),
                (10, 0, 31, 5461, 1048575),
                (11, 0, 127, 21845, 4194303),
                (12, 2, 511, 87381, 16777215),
                (13, 10, 2047, 349525, 67108863),
                (14, 42, 8191, 1398101, 268435455),
                (15, 170, 32767, 5592405, 1073741823),
                (16, 682, 65535, 22369621, 2147483647),
            )
            for i in range(nr_queries):
                # Pick a random zoom level, then a random block and idx
                # within the observed ranges for that zoom level
                z = random.choice(limits)
                query = {
                    'zoom': z[0],
                    'block': int(round(random.uniform(z[1], z[2]))) if z[1] != z[2] else 0,
                    'idx': int(round(random.uniform(z[3], z[4]))),
                }
                queries.append(query)
        print 'Queries created'
        cassandra_qps_list = []
        postgresql_qps_list = []
        # Run the exact same set of queries against each datastore multiple
        # times to statistically smooth out the effects of the page cache
        for worker in (
                (
                    benchmark_cassandra_worker,
                    args.cassandra_user,
                    args.cassandra_pass,
                    cassandra_qps_list,
                ),
                (
                    benchmark_postgresql_worker,
                    args.postgres_user,
                    args.postgres_pass,
                    postgresql_qps_list,
                )):
            for i in range(WARMUPS + 1):
                qps = benchmark(worker[0], worker[1], worker[2], queries,
                                nr_procs)
                worker[3].append(qps)
        cassandra_qps = sum(cassandra_qps_list) / len(cassandra_qps_list)
        postgresql_qps = sum(postgresql_qps_list) / len(postgresql_qps_list)
        print 'Thread Number: %s' % nr_procs
        print 'Cassandra got: %s QPS' % cassandra_qps
        print 'PostgreSQL got: %s QPS' % postgresql_qps
        with open('results.csv', 'a') as f:
            # Yes, writing CSV without the csv module. Could obviously be
            # done better, but this is too simple and cheap to really care
            f.write('%s,%s,%s\n' % (nr_procs, cassandra_qps, postgresql_qps))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Benchmark maps backends')
    parser.add_argument('mode', choices=['precomputed', 'random'])
    parser.add_argument('--min_procs', type=int, default=1)
    parser.add_argument('--max_procs', type=int, default=1)
    parser.add_argument('--cassandra_user', default='cassandra')
    parser.add_argument('--cassandra_pass', default='cassandra')
    parser.add_argument('--postgres_user')
    parser.add_argument('--postgres_pass')
    args = parser.parse_args()
    main(args)
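
For reference, an invocation matching the argparse definitions above might look like the following; the script name is an assumption, and the postgres credentials have no defaults so they must be supplied. In precomputed mode the script additionally expects a tiles.csv in the current directory with one zoom,block,idx triple per line.

python benchmark.py random --min_procs 1 --max_procs 8 \
    --cassandra_user cassandra --cassandra_pass cassandra \
    --postgres_user USER --postgres_pass PASS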