Page MenuHomePhabricator

cassandra vs postgres benchmarker

Authored By
akosiaris
Sep 8 2015, 11:41 AM
Size
3 KB
Referenced Files
None
Subscribers
None

cassandra vs postgres benchmarker

import psycopg2
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import SimpleStatement
import random
from multiprocessing import Process, Value
import time
import math
import os
# Benchmark configuration.
# Total number of queries to generate; benchmark() splits them across the
# worker processes.
NUMBER_OF_QUERIES = 120000 # 120 thousand queries
# Number of worker processes started for each benchmark run.
NUMBER_OF_PROCS = 12
# Database hosts; worker i connects to HOSTS[i % len(HOSTS)].
HOSTS = ['maps-test2001.codfw.wmnet',
         'maps-test2002.codfw.wmnet',
         'maps-test2003.codfw.wmnet',
         'maps-test2004.codfw.wmnet']
# Credentials come from the environment; a missing variable raises KeyError
# as soon as this module is loaded.
CASSANDRA_USER = os.environ['CASSANDRA_USER']
CASSANDRA_PASS = os.environ['CASSANDRA_PASS']
POSTGRES_USER = os.environ['POSTGRES_USER']
POSTGRES_PASS = os.environ['POSTGRES_PASS']
def benchmark_cassandra_worker(host, queries, ret):
    """Run ``queries`` one after another against a Cassandra host.

    Args:
        host: Contact point handed to ``Cluster``.
        queries: Sequence of CQL query strings, executed in order.
        ret: Object whose ``value`` attribute receives the measured
            queries-per-second figure (benchmark() passes a
            ``multiprocessing.Value``).

    Returns:
        The measured queries per second. Only the query loop is timed;
        connecting and shutting down are excluded.
    """
    # Setup cassandra
    auth_provider = PlainTextAuthProvider(
        username=CASSANDRA_USER, password=CASSANDRA_PASS)
    cluster = Cluster([host], auth_provider=auth_provider)
    try:
        session = cluster.connect('v2')
        start_time = time.time()
        for query in queries:
            session.execute(SimpleStatement(query))
        duration = time.time() - start_time
    finally:
        # Always release the driver's connections, even when a query fails,
        # mirroring the explicit close() calls in the PostgreSQL worker.
        cluster.shutdown()
    # A zero-length measurement (e.g. no queries on a coarse clock) would
    # otherwise raise ZeroDivisionError.
    qps = len(queries) / duration if duration > 0 else 0.0
    ret.value = qps
    return qps
def benchmark_postgresql_worker(host, queries, ret):
    """Run ``queries`` one after another against a PostgreSQL host.

    Args:
        host: Server to connect to; the ``gis`` database is used.
        queries: Sequence of SQL query strings, executed in order.
        ret: Object whose ``value`` attribute receives the measured
            queries-per-second figure (benchmark() passes a
            ``multiprocessing.Value``).

    Returns:
        The measured queries per second. Only the query loop is timed;
        connecting and closing are excluded.
    """
    # Setup postgresql. Keyword arguments avoid hand-building a DSN string,
    # which breaks when the password contains spaces or quotes.
    conn = psycopg2.connect(host=host, dbname='gis',
                            user=POSTGRES_USER, password=POSTGRES_PASS)
    try:
        cur = conn.cursor()
        try:
            start_time = time.time()
            for query in queries:
                cur.execute(query)
            duration = time.time() - start_time
        finally:
            cur.close()
    finally:
        # Close the connection even if a query raised.
        conn.close()
    # A zero-length measurement (e.g. no queries on a coarse clock) would
    # otherwise raise ZeroDivisionError.
    qps = len(queries) / duration if duration > 0 else 0.0
    ret.value = qps
    return qps
def benchmark(func):
    """Fan the module-level ``queries`` list out to worker processes.

    Starts NUMBER_OF_PROCS processes. Worker ``i`` targets
    ``HOSTS[i % len(HOSTS)]`` and receives a contiguous slice of ``queries``;
    together the slices cover every query exactly once.

    Args:
        func: Worker callable invoked as ``func(host, queries, ret)``. It must
            store its queries-per-second figure in ``ret.value``.

    Returns:
        The sum of the QPS figures reported by all workers. A worker that
        never sets ``ret.value`` contributes the initial value of 0.
    """
    total = len(queries)
    processes = []
    for i in range(NUMBER_OF_PROCS):
        host = HOSTS[i % len(HOSTS)]
        # Floor division keeps the bounds integral on Python 3 as well. The
        # end bound is exclusive, so it must not be decremented or the last
        # query of each slice would be dropped.
        start_query = i * total // NUMBER_OF_PROCS
        end_query = (i + 1) * total // NUMBER_OF_PROCS
        v = Value('d', 0)
        p = Process(target=func,
                    args=(host, queries[start_query:end_query], v))
        p.start()
        processes.append((p, v))
    # Let's wait for everyone to finish
    for p, _ in processes:
        p.join()
    return sum(v.value for _, v in processes)
if __name__ == '__main__':
    # Generate the random workload once, then replay the same queries against
    # Cassandra and PostgreSQL and print the aggregate QPS of each.
    # ``queries`` has to remain a module-level name because benchmark()
    # reads it as a global.
    print("Creating queries")
    query_template = ('SELECT zoom, block, idx, tile FROM tiles '
                      'WHERE zoom=%(zoom)s '
                      'AND block=%(block)s '
                      'AND idx=%(idx)s')
    queries = []
    # range(NUMBER_OF_QUERIES) yields exactly NUMBER_OF_QUERIES queries.
    for _ in range(NUMBER_OF_QUERIES):
        # We give every max some extra space so as to have queries that return
        # nothing. idx is an exception as it is set to the max allowed
        params = {
            'zoom': random.randrange(0, 20),
            'block': random.randrange(0, 70000),
            'idx': random.randrange(0, 2147483647),
        }
        queries.append(query_template % params)
    print("Queries created")

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Benchmarking cassandra")
    cassandra_qps = benchmark(benchmark_cassandra_worker)
    print("Benchmarking cassandra: done")

    print("Benchmarking postgres")
    postgresql_qps = benchmark(benchmark_postgresql_worker)
    print("Benchmarking postgres: done")

    print('Cassandra got: %s QPS' % cassandra_qps)
    print('PostgreSQL got: %s QPS' % postgresql_qps)

File Metadata

Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2570519
Default Alt Text
cassandra vs postgres benchmarker (3 KB)

Event Timeline